In [None]:
import pandas as pd

# Directory that holds the competition's zipped TSV files.
p = '/kaggle/input/word2vec-nlp-tutorial'

# Parser options shared by all three files: the first row is the header, fields
# are tab-separated, and quoting=3 (the value of csv.QUOTE_NONE) makes pandas
# treat quote characters as ordinary text.
tsv_options = dict(header=0, delimiter="\t", quoting=3)

train_df = pd.read_csv(f"{p}/labeledTrainData.tsv.zip", **tsv_options)
test_df = pd.read_csv(f"{p}/testData.tsv.zip", **tsv_options)
utrain_df = pd.read_csv(f"{p}/unlabeledTrainData.tsv.zip", **tsv_options)

# When True, later cells hold out 5% of the labelled data and score the model on it.
TEST = False

## 1. Подготовка данных

In [None]:
import nltk.data
#nltk.download('punkt')

In [None]:
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

def to_clean_words(review, remove_stopwords):
    """Turn one raw review (or a single sentence) into lowercase word tokens.

    Processing order:
      1. remove dots that have no word character directly before or after
         them (e.g. the trailing dots of an ellipsis),
      2. remove URLs (``http://...``, ``https://...``, ``www....``),
      3. strip HTML markup with BeautifulSoup,
      4. replace standalone integers with the placeholder ``NUM``,
      5. replace every remaining non-letter with a space, lowercase, split.

    Args:
        review: Raw text, possibly containing HTML.
        remove_stopwords: When truthy, NLTK's English stop words are dropped
            from the result.

    Returns:
        List of lowercase alphabetic tokens; standalone numbers appear as
        the token ``'num'``.
    """
    # All patterns are raw strings. In a plain literal '\b' is a backspace
    # character rather than the regex word boundary, so the number pattern
    # could never match; '\B' and '\.' are invalid escapes that newer Python
    # versions warn about.
    review = re.sub(r'\B\.+?\B', '', review)
    review = re.sub(r'https?:\/\/\S+|www\.\S+', '', review)
    # NOTE(review): no parser is named here, so bs4 picks one itself -
    # consider pinning it for reproducible output across environments.
    review = BeautifulSoup(review).get_text()
    # Must run before the letters-only filter below, which would erase digits.
    review = re.sub(r'\b\d+\b', 'NUM', review)
    review = re.sub(r'[^a-zA-Z]', ' ', review)

    words = review.lower().split()

    if remove_stopwords:
        stops = set(stopwords.words('english'))
        words = [w for w in words if w not in stops]

    return words

def to_clean_sentences(review, tr):
    """Split a review into sentences and tokenise each one.

    Args:
        review: Raw review text; surrounding whitespace is stripped first.
        tr: Sentence tokenizer exposing ``tokenize(text)``.

    Returns:
        One token list per non-empty sentence, produced by
        ``to_clean_words`` with stop words kept.
    """
    return [
        to_clean_words(chunk, False)
        for chunk in tr.tokenize(review.strip())
        if len(chunk) > 0
    ]

def do_progress_next(step, total):
    """Print a one-line progress indicator and return the next step number.

    The line ends with a carriage return, so successive calls overwrite each
    other in the cell output.

    Args:
        step: Number of the item being reported.
        total: Total number of items.

    Returns:
        ``step + 1``.
    """
    status = f'Processing {step:5} / {total:5}... '
    print(status, end='\r')
    return step + 1

def df_to_clean_sentences(df):
    """Tokenise every review in ``df['review']`` into cleaned sentences.

    Loads NLTK's English punkt sentence tokenizer, runs each review through
    ``to_clean_sentences`` and reports progress after every review.

    Args:
        df: Table with a ``review`` column of raw texts.

    Returns:
        Flat list of token lists, one per sentence, across all reviews.
    """
    punkt = nltk.data.load('tokenizers/punkt/english.pickle')
    reviews = df['review']
    total = reviews.size

    collected = []
    step = 1
    for text in reviews:
        collected.extend(to_clean_sentences(text, punkt))
        step = do_progress_next(step, total)

    # The progress line ends in '\r'; emit a newline so later output starts fresh.
    print()
    return collected

def df_to_clean_words(df):
    """Tokenise every review in ``df['review']`` as one document.

    Unlike ``df_to_clean_sentences`` there is no sentence splitting: each
    review yields exactly one token list, with stop words kept.

    Args:
        df: Table with a ``review`` column of raw texts.

    Returns:
        List of token lists, in the same order as the reviews.
    """
    return [to_clean_words(text, False) for text in df['review']]

In [None]:
%%time
# Training corpus for the phrase and word2vec models: cleaned sentences from
# the labelled reviews followed by those from the unlabelled reviews.
sentences = df_to_clean_sentences(train_df) + df_to_clean_sentences(utrain_df)
# The test-set reviews are currently not added to the corpus:
#sentences += df_to_clean_sentences(test_df)

## 2. Создание word2vec-модели

In [None]:
%%time
from gensim.models import Phrases
# First phrase model, fitted on the plain token lists.
bigrams = Phrases(sentences)
# Second phrase model, fitted on the corpus after the first model has been
# applied to it, so it can join an already-merged pair with a neighbour.
trigrams = Phrases(bigrams[sentences])

In [None]:
%%time
import logging
from gensim.models import word2vec

# Send INFO-level log records (gensim reports training progress this way)
# to the cell output with a timestamp and level prefix.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# Word2Vec hyper-parameters.
vector_size = 300      # dimensionality of each word vector
min_word_count = 3     # tokens occurring fewer times than this are ignored
num_workers = 4        # number of training threads
context = 10           # window size around the current token
downsampling = 1e-3    # down-sampling threshold for very frequent tokens

# Train on the corpus with both phrase models applied. `sample=downsampling`
# was previously missing, which left the setting above declared but unused.
model_w2v = word2vec.Word2Vec(
    trigrams[bigrams[sentences]],
    workers=num_workers,
    vector_size=vector_size,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)

In [None]:
import numpy as np

# Plain lookup table: vocabulary token -> its row in the trained embedding matrix.
w2v_dict = {
    token: vector
    for token, vector in zip(model_w2v.wv.index_to_key, model_w2v.wv.vectors)
}

def vectorize_text(texts):
    """Represent each document as the mean of its known word vectors.

    Tokens missing from the module-level ``w2v_dict`` are skipped. A document
    with no known tokens is represented by an all-zero vector.

    Args:
        texts: Iterable of token lists, one per document.

    Returns:
        numpy array with one row per document.
    """
    # Vector length is taken from an arbitrary entry of the lookup table.
    dim = len(next(iter(w2v_dict.values())))

    rows = []
    for words in texts:
        known = [w2v_dict[w] for w in words if w in w2v_dict]
        if not known:
            known = [np.zeros(dim)]
        rows.append(np.mean(known, axis=0))
    return np.array(rows)

## 3. Создание основной модели

In [None]:
def create_x(df):
    """Build the feature matrix for the reviews in ``df``.

    Each review is tokenised as a whole, passed through the bigram and then
    the trigram phrase model, and finally averaged into a single vector by
    ``vectorize_text``.

    Args:
        df: Table with a ``review`` column of raw texts.

    Returns:
        Whatever ``vectorize_text`` returns for the phrase-merged documents.
    """
    token_lists = df_to_clean_words(df)
    with_bigrams = bigrams[token_lists]
    with_trigrams = trigrams[with_bigrams]
    return vectorize_text(with_trigrams)

def create_y(df):
    """Return the ``sentiment`` column of ``df``, used as the training target."""
    labels = df['sentiment']
    return labels

In [None]:
from sklearn.model_selection import train_test_split

# Features and labels for the full labelled set.
x = create_x(train_df)
y = create_y(train_df)

# By default train on everything; in TEST mode replace that with a 95/5
# shuffled split so the model can be scored on the held-out part.
x_train, y_train = x, y
if TEST:
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.05, shuffle=True, random_state=42
    )

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression sentiment classifier trained on the averaged word vectors.
# `fit` returns the estimator itself, so construction and training are chained.
model = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42).fit(
    x_train, y_train.values
)
# Leave the fitted estimator as the cell's displayed result.
model

In [None]:
if TEST:
    # An expression nested inside `if` is not auto-displayed by Jupyter, so the
    # accuracy has to be printed explicitly or it is silently discarded.
    print(f'Hold-out accuracy: {model.score(x_test, y_test):.4f}')

## 4. Обработка тестовой выборки

In [None]:
# Build features for the test reviews with the same create_x pipeline used for
# training. Note: this rebinds `x`, which earlier held the training features;
# the prediction cell reads this new value.
x = create_x(test_df)

In [None]:
import csv

# Predicted sentiment label for every test review.
res = model.predict(x)

# Two-column submission table keyed by the review ids from the test file.
output = pd.DataFrame({"id": test_df["id"], "sentiment": res})

# QUOTE_NONE writes the values verbatim, so any quote characters that were
# kept in the ids at load time are emitted unchanged.
output.to_csv(
    'submission.csv',
    columns=['id','sentiment'],
    index=False,
    quoting=csv.QUOTE_NONE,
)