In [None]:
!pip install spacy
!python -m spacy download en_core_web_lg
!python -m spacy download en_core_web_sm

In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

import spacy
import en_core_web_lg

from tqdm import tqdm
tqdm.pandas()

In [2]:
class Preprocess:
    
    def __init__(self):
        self.word2vec = en_core_web_lg.load()
        self.spacy_nlp = spacy.load('en_core_web_sm')
    
    def tokenize(self, df: pd.Series):
        """Returns pd.Series of word lists"""
        words = []
        for doc in df.values:
            doc = self.spacy_nlp(doc)
            tokens = [token.text for token in doc]
            words.append(tokens)
        return pd.Series(words)
    
    def lemmatization_and_stop_words(self, df: pd.Series):
        docs = list(self.spacy_nlp.pipe(X_train))
        data_clean = [[w.lemma_ for w in doc if (not w.is_stop and not w.is_punct and not w.like_num)] for doc in docs]
        df = pd.Series([' '.join(comment)  for comment in data_clean])
        return df
    
    def get_vec(self, x):
        """Word2vec"""
        doc = self.word2vec(x)
        vec = doc.vector
        return vec
    
    def sen_to_vec(self, sentence, vec_dim = 300):
        """Simple averaging for word vectors"""
        ans = np.zeros(vec_dim)
        for word in sentence:
            ans += self.get_vec(word)
        ans /= len(sentence)
        return ans
    
    def words_to_vec(self, df: pd.Series):
        words = self.tokenize(df)
        df['vec'] = pd.Series(words).progress_apply(self.sen_to_vec)
        df = df['vec'].to_numpy()
        df = df.reshape(-1, 1)
        df = np.concatenate(np.concatenate(df, axis = 0), axis = 0).reshape(-1, 300)
        return df
    

In [3]:
class Scorer:
    
    def __init__(self):
        self.pipe_count_bigrams = Pipeline([('vectorizer', CountVectorizer(ngram_range=(1, 2))),
                            ('lr', LogisticRegression())])
        self.pipe_count_lr = Pipeline([('lr', LogisticRegression())])
        self.pipe_svc = Pipeline([('svc', LinearSVC(penalty='l1', C=0.55, fit_intercept=False, dual=False, tol=1e-10, max_iter=100000))])
        
    def data_in_sentences(self, df, labels):
        cross_score_bigrams = cross_val_score(
            self.pipe_count_bigrams, X_train, y_train, scoring='accuracy', cv=5, n_jobs=-1)
        print('(Count Bigram + Unigram Vectorizer + LR) mean score: ', cross_score_bigrams.mean())
        
    def data_in_vectors(self, df, labels):
        cross_score_lr = cross_val_score(
            self.pipe_count_lr, df, labels, scoring='accuracy', cv=5, n_jobs=-1)
        cross_score_svc = cross_val_score(
            self.pipe_svc, df, labels, scoring='accuracy', cv=5, n_jobs=-1)
        
        print('(Word2vec + LR) mean score: ', cross_score_lr.mean())
        print('(Word2vec + SVC) mean score: ', cross_score_svc.mean())


In [4]:
train_df = pd.read_csv('products_sentiment_train.tsv', sep='\t')
X_train, y_train = train_df['2 . take around 10,000 640x480 pictures .'], train_df['1']
preprocess = Preprocess()
scorer = Scorer()
scorer.data_in_sentences(X_train, y_train)  # without preprocessing

(Count Bigram + Unigram Vectorizer + LR) mean score:  0.7698934837092731


In [5]:
X_train_clean = preprocess.lemmatization_and_stop_words(X_train)
scorer.data_in_sentences(X_train_clean, y_train)  # just lemmatisation and removing stop words

(Count Bigram + Unigram Vectorizer + LR) mean score:  0.7698934837092731


In [6]:
X_train_word2vec = preprocess.words_to_vec(X_train)
scorer.data_in_vectors(X_train_word2vec, y_train)  # just word2vec

100%|██████████████████████████████████████████████████████████████████████████████| 1999/1999 [03:50<00:00,  8.66it/s]


(Word2vec + LR) mean score:  0.7918934837092731
(Word2vec + SVC) mean score:  0.780889724310777


In [None]:
X_train_word2vec_clean = preprocess.words_to_vec(X_train_clean)
scorer.data_in_vectors(X_train_word2vec_clean, y_train)  # word2vec with lemmatisation and removing stop words

In [7]:
test_df = pd.read_csv('products_sentiment_test.tsv', sep='\t')
X_test = test_df['text']
X_test_word2vec = preprocess.words_to_vec(X_test)

100%|████████████████████████████████████████████████████████████████████████████████| 500/500 [00:52<00:00,  9.59it/s]


In [8]:
clf_pipeline = Pipeline([('lr', LogisticRegression())])
clf_pipeline.fit(X_train_word2vec, y_train)
test_df['y'] = clf_pipeline.predict(X_test_word2vec)
test_df[['Id','y']].to_csv('sample_sabmission.csv', index = False)