In [None]:
from google.colab import files

uploaded = files.upload()

In [4]:
import pandas as pd

data = pd.read_csv('train.tsv.zip', sep="\t")
data.describe()

Unnamed: 0,PhraseId,SentenceId,Sentiment
count,156060.0,156060.0,156060.0
mean,78030.5,4079.732744,2.063578
std,45050.785842,2502.764394,0.893832
min,1.0,1.0,0.0
25%,39015.75,1861.75,2.0
50%,78030.5,4017.0,2.0
75%,117045.25,6244.0,3.0
max,156060.0,8544.0,4.0


In [5]:
import os, sys, csv, re

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

from tqdm.auto import tqdm

# import en_core_web_md

# nlp = en_core_web_md.load()


data = data.rename(columns = {'Phrase' : 'Text', 'Sentiment' : 'Score'})
data_shuffle_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
for i, (train_indices, test_indices) in enumerate(data_shuffle_split.split(data, data.Score)):
    data = data.iloc[train_indices].reset_index(drop = True)

data.head()

Unnamed: 0,PhraseId,SentenceId,Text,Score
0,32387,1518,really good,4
1,61509,3110,vistas,2
2,7990,327,and not worth,2
3,65074,3292,unstinting look,2
4,102946,5424,"A thoroughly enjoyable , heartfelt",4


In [6]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.corpus import sentiwordnet as swn

tokenizer = RegexpTokenizer(r"['\-\w\.\?!,]+")
lemmatizer = WordNetLemmatizer()
stops = stopwords.words('english')


def tockenize_(text):
    if type(text) == list:
        text = ' '.join(text)
    text = re.sub(r" (n't|'[a-z]{1,3})", r'\1', text)
    text = re.sub(r'[^a-z0-9\s\'\-\.\?!,\"]', '', text.lower())
    text = ' '.join([lemmatizer.lemmatize(w) for w in tokenizer.tokenize(text)])
    return text


tqdm.pandas()
print('train')
data['Tokens'] = data.Text.apply(tockenize_)
data['nb_words'] = data.Tokens.str.count(' ') + 1
data['rel_nb_words'] = data.nb_words / data.nb_words.max()
data['nb_chars'] = data.Tokens.str.len()
data['rel_nb_chars'] = data.nb_chars / data.nb_chars.max()


data.head()

  from pandas import Panel


train


Unnamed: 0,PhraseId,SentenceId,Text,Score,Tokens,nb_words,rel_nb_words,nb_chars,rel_nb_chars
0,32387,1518,really good,4,really good,2,0.038462,11,0.039427
1,61509,3110,vistas,2,vista,1,0.019231,5,0.017921
2,7990,327,and not worth,2,and not worth,3,0.057692,13,0.046595
3,65074,3292,unstinting look,2,unstinting look,2,0.038462,15,0.053763
4,102946,5424,"A thoroughly enjoyable , heartfelt",4,"a thoroughly enjoyable , heartfelt",5,0.096154,34,0.121864


In [7]:
import nltk

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

oneword_sentiment = {}
df_len1 = data[data.nb_words == 1]
for label in df_len1.Score.value_counts().to_dict().keys():
    for word in df_len1.Tokens[df_len1.Score == label]:
        oneword_sentiment[word] = label


def vader_sentinet(word):
    vader_scores = sid.polarity_scores(word)
    v_neg = vader_scores['neg']
    v_neu = vader_scores['neu']
    v_pos = vader_scores['pos']
    
    senti_net = list(swn.senti_synsets(word))
    if len(senti_net) > 0:
        s_neg = senti_net[0].neg_score()
        s_pos = senti_net[0].pos_score()
        s_obj = senti_net[0].obj_score()
    else:
        s_neg, s_pos, s_obj = 0, 0, 0
    return v_neg, v_neu, v_pos, s_neg, s_pos, s_obj

def mean_senti_vader_score(text):
    if type(text) == str:
        text = text.split()
    score_lists = [[] for _ in range(6)]
    for w in text:
        scores = vader_sentinet(w)
        for i, s in enumerate(scores):
            score_lists[i].append(s)
    mean_scores = [sum(l) / max(1, len(l)) for l in score_lists]
    return mean_scores
        
def nb_senti_words(text):
    if type(text) == str:
        text = text.split()
    d = {-1 : 0, 0 : 0, 1 : 0, 2 : 0, 3 : 0, 4 : 0}
    for w in text:
        if w in oneword_sentiment:
            d[oneword_sentiment[w]] += 1
    for k in d:
        d[k] /= max(1, len(text))
        
    return d[0], d[1], d[2], d[3], d[4], d[-1]

print('train')
data = data.merge(data.Tokens.apply(lambda t: pd.Series(nb_senti_words(t))), 
                left_index=True, right_index=True)
data = data.rename(columns={
                        0 : "nb_senti_0", 
                        1 : "nb_senti_1", 
                        2 : "nb_senti_2", 
                        3 : "nb_senti_3",
                        4 : "nb_senti_4",
                        5 : "nb_senti_-1"
                       })

print('\tvader/sentinet')
data = data.merge(data.Tokens.apply(lambda t: pd.Series(mean_senti_vader_score(t))), 
                left_index=True, right_index=True)
data = data.rename(columns={
                        0 : "v_neg", 
                        1 : "v_neu", 
                        2 : "v_pos", 
                        3 : "s_neg",
                        4 : "s_pos",
                        5 : "s_obj"
                       })

data.head()



train
	vader/sentinet


Unnamed: 0,PhraseId,SentenceId,Text,Score,Tokens,nb_words,rel_nb_words,nb_chars,rel_nb_chars,nb_senti_0,nb_senti_1,nb_senti_2,nb_senti_3,nb_senti_4,nb_senti_-1,v_neg,v_neu,v_pos,s_neg,s_pos,s_obj
0,32387,1518,really good,4,really good,2,0.038462,11,0.039427,0.0,0.0,0.5,0.5,0.0,0.0,0.0,0.5,0.5,0.0,0.5625,0.4375
1,61509,3110,vistas,2,vista,1,0.019231,5,0.017921,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,7990,327,and not worth,2,and not worth,3,0.057692,13,0.046595,0.0,0.333333,0.666667,0.0,0.0,0.0,0.0,0.666667,0.333333,0.208333,0.0,0.458333
3,65074,3292,unstinting look,2,unstinting look,2,0.038462,15,0.053763,0.0,0.5,0.5,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,102946,5424,"A thoroughly enjoyable , heartfelt",4,"a thoroughly enjoyable , heartfelt",5,0.096154,34,0.121864,0.0,0.0,0.6,0.4,0.0,0.0,0.0,0.2,0.4,0.05,0.225,0.525


In [9]:
from nltk.corpus import subjectivity, movie_reviews
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import SVC

print('subjectivity')
subj_docs = []
for l in tqdm(['subj', 'obj']):
    for s in subjectivity.sents(categories=l):
        subj_docs.append((l, tockenize_(s)))

print('polarity')
review_docs = []
for l in tqdm(['pos', 'neg']):
    for s in movie_reviews.sents(categories=l):
        review_docs.append((l, tockenize_(s)))

subj_df = pd.DataFrame(subj_docs, columns = ['label', 'text'])
review_df = pd.DataFrame(review_docs, columns = ['label', 'text'])

subj_cv = CountVectorizer(binary = True, min_df = 5, max_df = .5, dtype = np.int8).fit(subj_df.text)
review_cv = CountVectorizer(binary = True, min_df = 5, max_df = .5, dtype = np.int8).fit(review_df.text)

X_subj = subj_cv.transform(subj_df.text)
X_review = review_cv.transform(review_df.text)

print('subj_logit')
subj_logit = LogisticRegressionCV(n_jobs = 3, max_iter = 400, 
                                  cv=6, random_state=0, verbose = True).fit(X_subj, subj_df.label)
print('review_logit')
review_logit = LogisticRegressionCV(n_jobs = 3, max_iter = 400, 
                                  cv=6, random_state=0, verbose = True).fit(X_review, review_df.label)

print('subj', subj_logit.score(X_subj, subj_df.label))
print('polarity', review_logit.score(X_review, review_df.label))

data['subj_0'], data['subj_1'] = zip(*subj_logit.predict_proba(subj_cv.transform(data.Tokens)))
data['review_0'], data['review_1'] = zip(*review_logit.predict_proba(review_cv.transform(data.Tokens)))

del subj_df, subj_logit, subj_cv
del review_df, review_logit, review_cv

data.head()


subjectivity


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))


polarity


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))


subj_logit


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   6 out of   6 | elapsed:   11.1s finished


review_logit


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   6 out of   6 | elapsed:  2.1min finished


subj 0.9552
polarity 0.7647346642062294


Unnamed: 0,PhraseId,SentenceId,Text,Score,Tokens,nb_words,rel_nb_words,nb_chars,rel_nb_chars,nb_senti_0,nb_senti_1,nb_senti_2,nb_senti_3,nb_senti_4,nb_senti_-1,v_neg,v_neu,v_pos,s_neg,s_pos,s_obj,subj_0,subj_1,review_0,review_1
0,32387,1518,really good,4,really good,2,0.038462,11,0.039427,0.0,0.0,0.5,0.5,0.0,0.0,0.0,0.5,0.5,0.0,0.5625,0.4375,0.208254,0.791746,0.513218,0.486782
1,61509,3110,vistas,2,vista,1,0.019231,5,0.017921,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.490048,0.509952,0.555898,0.444102
2,7990,327,and not worth,2,and not worth,3,0.057692,13,0.046595,0.0,0.333333,0.666667,0.0,0.0,0.0,0.0,0.666667,0.333333,0.208333,0.0,0.458333,0.262924,0.737076,0.47352,0.52648
3,65074,3292,unstinting look,2,unstinting look,2,0.038462,15,0.053763,0.0,0.5,0.5,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.382419,0.617581,0.645582,0.354418
4,102946,5424,"A thoroughly enjoyable , heartfelt",4,"a thoroughly enjoyable , heartfelt",5,0.096154,34,0.121864,0.0,0.0,0.6,0.4,0.0,0.0,0.0,0.2,0.4,0.05,0.225,0.525,0.219345,0.780655,0.451776,0.548224


In [10]:
def vader_score(text):
    scores = sid.polarity_scores(text)
    return scores['neg'], scores['neu'], scores['pos'], scores['compound']

print('train')
data = data.merge(data.Tokens.apply(lambda t: pd.Series(vader_score(t))), 
         left_index=True, right_index=True)
data = data.rename(columns={0 : "vader_neg", 1 : "vader_neu", 2 : "vader_pos", 3 : "vader_compound"})

train


In [11]:
from scipy.sparse import vstack, hstack, csr_matrix
from scipy.sparse.linalg import svds

print('data')
cv = CountVectorizer(binary = True, max_df = .5, min_df = 5, 
                        dtype = np.int8).fit(data.Tokens)
#cv = TfidfVectorizer(max_df = .5, min_df = 5, dtype = np.float32).fit(df.Tokens)

X_text = cv.transform(data.Tokens) #.toarray()

print('starting shape:', X_text.shape)

data
starting shape: (124848, 12710)


In [12]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import OneHotEncoder


tockenizer_app = Tokenizer()
tockenizer_app.fit_on_texts(data.Tokens)


X_seq_list = tockenizer_app.texts_to_sequences(data.Tokens)
seq_len = max(max(X_seq_list, key=len))

X_seq = pad_sequences(X_seq_list, maxlen=seq_len)


In [14]:

X_other = np.array(data[['vader_neg', 'vader_neu', 'vader_pos', 'vader_compound', 
                       'v_neg', 'v_neu', 'v_pos', 's_neg', 's_pos', 's_obj',
                       'nb_words', 'nb_chars', 'rel_nb_words', 'rel_nb_chars',
                       'nb_senti_-1', 'nb_senti_0', 'nb_senti_1', 'nb_senti_2', 'nb_senti_3', 'nb_senti_4', 
                       'subj_0', 'subj_1', 'review_0']])

#X = hstack((X_text, csr_matrix(X_lda), csr_matrix(X_other))).tocsr()
#X_final = hstack((X_text_test, csr_matrix(X_lda_test), csr_matrix(X_other_test))).tocsr()

X = hstack((X_text, csr_matrix(X_other))).tocsr()


oh = OneHotEncoder().fit(data.Score.to_numpy().reshape(-1, 1))
y = oh.transform(data.Score.to_numpy().reshape(-1,1)).toarray()


print(X.shape, X_seq.shape)
print(y.shape)
del X_seq_list

(124848, 12733) (124848, 8351)
(124848, 5)


In [15]:
from collections import Counter

def undersample_Xy(X, y):
    y_argmax = y.argmax(axis = 1)
    y_counts = Counter(y_argmax)
    nb_to_sample = sorted(y_counts.values())[1]
    
    def undr(label):
        X_by_label = X[y_argmax == label]
        y_by_label = y[y_argmax == label]
        count_for_label = X_by_label.shape[0]
        sampled_indices = np.random.choice(np.arange(count_for_label), 
                                   size = min(nb_to_sample, count_for_label), replace = False)
        undersampled_X = X_by_label[sampled_indices]
        undersampled_y = y_by_label[sampled_indices]
        return undersampled_X, undersampled_y
    
    Xs, ys = zip(*[undr(label) for label in y_counts.keys()])
    return vstack(Xs), vstack(ys)

undersample_Xy(X, y)

(<35118x12733 sparse matrix of type '<class 'numpy.float64'>'
 	with 801279 stored elements in Compressed Sparse Row format>,
 <35118x5 sparse matrix of type '<class 'numpy.float64'>'
 	with 35118 stored elements in COOrdinate format>)