In [None]:
# Colab-only import: this cell presumably runs only inside Google Colab — verify
# before executing the notebook in another environment.
from google.colab import files

# Ask the user to upload files into the runtime and keep the call's result.
# NOTE(review): presumably this is how 'train.tsv.zip' (read further down) gets
# into the working directory; `uploaded` is not referenced again in the visible cells.
uploaded = files.upload()

In [None]:
import csv
import os
import re
import sys
from collections import Counter

import numpy as np
import pandas as pd
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from nltk.corpus import movie_reviews
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import stopwords, subjectivity
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from scipy.sparse import csr_matrix, hstack, vstack
from scipy.sparse.linalg import svds
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC
from tqdm.auto import tqdm


def tockenize_(text):
    """Normalise a phrase into a single space-separated string of lemmas.

    Steps, in order:
      1. a list input is joined with single spaces (strings pass through);
      2. the space in front of a detached "n't" or an apostrophe followed by
         1-3 lowercase letters is removed (done before lower-casing, so only
         lowercase suffixes are re-attached);
      3. the text is lower-cased and every character outside letters, digits,
         whitespace and ' - . ? ! , " is dropped;
      4. the result is split with the module-level ``tokenizer`` and each token
         is passed through the module-level ``lemmatizer``.

    Relies on the globals ``tokenizer`` and ``lemmatizer`` being defined before
    the first call.
    """
    joined = ' '.join(text) if type(text) == list else text
    reattached = re.sub(r" (n't|'[a-z]{1,3})", r'\1', joined)
    cleaned = re.sub(r'[^a-z0-9\s\'\-\.\?!,\"]', '', reattached.lower())
    lemmas = map(lemmatizer.lemmatize, tokenizer.tokenize(cleaned))
    return ' '.join(lemmas)


def vader_sentinet(word):
    """Score a single word with the ``sid`` analyzer and SentiWordNet.

    Returns a 6-tuple ``(v_neg, v_neu, v_pos, s_neg, s_pos, s_obj)``:
    the 'neg'/'neu'/'pos' entries of ``sid.polarity_scores(word)`` followed by
    the negative/positive/objective scores of the first synset returned by
    ``swn.senti_synsets(word)``. When no synset is found the last three values
    are 0.

    Relies on the module-level ``sid`` being defined before the first call.
    """
    polarity = sid.polarity_scores(word)
    vader_part = (polarity['neg'], polarity['neu'], polarity['pos'])

    synsets = list(swn.senti_synsets(word))
    if not synsets:
        # Word unknown to SentiWordNet: neutral placeholder scores.
        senti_part = (0, 0, 0)
    else:
        first = synsets[0]
        senti_part = (first.neg_score(), first.pos_score(), first.obj_score())
    return vader_part + senti_part


def mean_senti_vader_score(text):
    """Average the six per-word scores of ``vader_sentinet`` over a phrase.

    ``text`` may be a string (split on whitespace) or an iterable of words.
    Returns a list of six floats in the order produced by ``vader_sentinet``;
    an empty phrase yields six zeros because the divisor is clamped to 1.
    """
    words = text.split() if type(text) == str else text
    per_word = [vader_sentinet(word) for word in words]
    divisor = max(1, len(per_word))
    return [sum(scores[k] for scores in per_word) / divisor for k in range(6)]


def nb_senti_words(text):
    """Fraction of words in a phrase that carry each known one-word label.

    ``text`` may be a string (split on whitespace) or a list of words. Every
    word found in the module-level ``oneword_sentiment`` mapping increments the
    counter of its label; counters exist for the labels -1, 0, 1, 2, 3 and 4.
    Counts are divided by the number of words (clamped to at least 1).

    Returns a 6-tuple ordered as labels ``(0, 1, 2, 3, 4, -1)``.
    """
    words = text.split() if type(text) == str else text
    tally = dict.fromkeys((-1, 0, 1, 2, 3, 4), 0)
    for word in words:
        if word in oneword_sentiment:
            tally[oneword_sentiment[word]] += 1

    total = max(1, len(words))
    return tuple(tally[label] / total for label in (0, 1, 2, 3, 4, -1))


def vader_score(text):
    """Return the ``sid.polarity_scores`` entries for a whole phrase.

    The result is the 4-tuple ``(neg, neu, pos, compound)``. Relies on the
    module-level ``sid`` being defined before the first call.
    """
    polarity = sid.polarity_scores(text)
    return tuple(polarity[key] for key in ('neg', 'neu', 'pos', 'compound'))


def undersample_Xy(X, y, random_state=None):
    """Undersample the majority classes of a one-hot labelled data set.

    Every class is capped at the size of the second-smallest class; classes
    that are already smaller keep all of their rows. Rows are drawn without
    replacement and the output is grouped by class, in order of first
    appearance of each class in ``y``.

    Parameters
    ----------
    X : scipy sparse matrix or 2-D ndarray, one row per sample.
    y : 2-D ndarray of one-hot (or score) rows; the class of a row is its
        ``argmax`` along axis 1.
    random_state : int or None, optional
        Seed for a private ``RandomState``. ``None`` (default) keeps the
        previous behaviour of drawing from the global ``np.random`` state.

    Returns
    -------
    (X_under, y_under) : two scipy sparse matrices with matching row order.

    Raises
    ------
    IndexError
        If ``y`` has no rows.
    """
    labels = y.argmax(axis=1)
    label_counts = Counter(labels)

    ordered_counts = sorted(label_counts.values())
    # Second-smallest class size; with a single class fall back to its own
    # size instead of indexing past the end of the list.
    nb_to_sample = ordered_counts[min(1, len(ordered_counts) - 1)]

    rng = np.random if random_state is None else np.random.RandomState(random_state)

    X_parts, y_parts = [], []
    for label in label_counts:
        mask = labels == label
        X_label, y_label = X[mask], y[mask]
        n_label = X_label.shape[0]
        picked = rng.choice(np.arange(n_label),
                            size=min(nb_to_sample, n_label), replace=False)
        # Wrap every block in csr_matrix before stacking: scipy.sparse.vstack
        # can reject dense blocks (equal-shaped ndarrays are read as one N-D
        # array and fail with "blocks must be 2-D").
        X_parts.append(csr_matrix(X_label[picked]))
        y_parts.append(csr_matrix(y_label[picked]))

    return vstack(X_parts), vstack(y_parts)


# Read the tab-separated training table from 'train.tsv.zip' in the working
# directory (the name suggests a zipped TSV — presumably pandas infers the
# compression from the extension; verify).
data = pd.read_csv('train.tsv.zip', sep="\t")
# Last expression of the cell: displays pandas' summary of the loaded frame.
data.describe()

In [None]:
# Use the column names the rest of the notebook expects.
data = data.rename(columns={'Phrase': 'Text', 'Sentiment': 'Score'})

# Keep a stratified 80 % subset of the rows (stratified on Score, fixed seed).
# A single split is requested, so it is fetched directly with next() instead
# of looping over the splitter.
# NOTE: this cell overwrites `data`; re-running it without reloading the file
# subsamples the already reduced frame again. The held-out indices stay
# available in `test_indices` but refer to the frame *before* the reduction.
data_shuffle_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
train_indices, test_indices = next(data_shuffle_split.split(data, data.Score))
data = data.iloc[train_indices].reset_index(drop=True)

data.head()

In [None]:
# Globals used by tockenize_(): token pattern keeps word characters plus ' - . ? ! ,
tokenizer = RegexpTokenizer(r"['\-\w\.\?!,]+")
lemmatizer = WordNetLemmatizer()
# NOTE(review): `stops` is not referenced in the visible cells.
stops = stopwords.words('english')

# tqdm.pandas() registers `progress_apply`; use it so the (slow) normalisation
# actually shows a progress bar.
tqdm.pandas()
data['Tokens'] = data.Text.progress_apply(tockenize_)

# Count whitespace-separated tokens. Counting spaces and adding one reported
# 1 word for an empty token string, which then slipped into the
# `nb_words == 1` filter used for the one-word sentiment lexicon.
data['nb_words'] = data.Tokens.str.split().str.len()
data['rel_nb_words'] = data.nb_words / data.nb_words.max()
data['nb_chars'] = data.Tokens.str.len()
data['rel_nb_chars'] = data.nb_chars / data.nb_chars.max()

data.head()

In [None]:
# Global analyzer used by vader_sentinet() and vader_score().
sid = SentimentIntensityAnalyzer()

# Lexicon of single-token phrases -> their Score label. Labels are visited in
# value_counts() order, so when the same token appears with several labels the
# one visited last wins.
# NOTE(review): the lexicon is built from the same rows it is later applied
# to, so for one-token rows the nb_senti_* features encode the row's own label.
oneword_sentiment = {}
df_len1 = data[data.nb_words == 1]
for label in df_len1.Score.value_counts().index:
    for word in df_len1.Tokens[df_len1.Score == label]:
        oneword_sentiment[word] = label

# Build each feature block as one DataFrame with its final column names.
# Returning a pd.Series per row from apply() is very slow on large frames, and
# the temporary integer column names made a re-run silently duplicate columns;
# dropping any existing copies first keeps the cell re-runnable.
senti_cols = ['nb_senti_0', 'nb_senti_1', 'nb_senti_2',
              'nb_senti_3', 'nb_senti_4', 'nb_senti_-1']  # order returned by nb_senti_words()
senti_frame = pd.DataFrame(data.Tokens.map(nb_senti_words).tolist(),
                           index=data.index, columns=senti_cols)
data = data.drop(columns=senti_cols, errors='ignore').join(senti_frame)

print('\tvader/sentinet')
lexicon_cols = ['v_neg', 'v_neu', 'v_pos', 's_neg', 's_pos', 's_obj']  # order returned by mean_senti_vader_score()
lexicon_frame = pd.DataFrame(data.Tokens.map(mean_senti_vader_score).tolist(),
                             index=data.index, columns=lexicon_cols)
data = data.drop(columns=lexicon_cols, errors='ignore').join(lexicon_frame)

data.head()

In [None]:
def _labelled_sentences(corpus, categories):
    """Return (category, normalised sentence) pairs for each category of an NLTK corpus."""
    return [(category, tockenize_(sentence))
            for category in tqdm(categories)
            for sentence in corpus.sents(categories=category)]


def _fit_bow_logit(docs):
    """Fit a binary bag-of-words vectorizer and a cross-validated logistic regression.

    ``docs`` is a list of (label, text) pairs. Returns the tuple
    ``(frame, vectorizer, features, model)`` so the caller can keep or delete
    each piece explicitly.
    """
    frame = pd.DataFrame(docs, columns=['label', 'text'])
    vectorizer = CountVectorizer(binary=True, min_df=5, max_df=.5,
                                 dtype=np.int8).fit(frame.text)
    features = vectorizer.transform(frame.text)
    model = LogisticRegressionCV(n_jobs=3, max_iter=400,
                                 cv=6, random_state=0, verbose=True).fit(features, frame.label)
    return frame, vectorizer, features, model


# Auxiliary corpora: subjective/objective sentences and positive/negative
# movie-review sentences, normalised the same way as the training phrases.
subj_docs = _labelled_sentences(subjectivity, ['subj', 'obj'])
review_docs = _labelled_sentences(movie_reviews, ['pos', 'neg'])

subj_df, subj_cv, X_subj, subj_logit = _fit_bow_logit(subj_docs)
review_df, review_cv, X_review, review_logit = _fit_bow_logit(review_docs)

# These are accuracies on the data the models were fitted on, not held-out scores.
print('subj', subj_logit.score(X_subj, subj_df.label))
print('polarity', review_logit.score(X_review, review_df.label))

# Attach both class probabilities of each auxiliary model as features.
# Column k presumably follows the model's `classes_` order — verify which
# label ends up in *_0 and *_1.
subj_proba = subj_logit.predict_proba(subj_cv.transform(data.Tokens))
review_proba = review_logit.predict_proba(review_cv.transform(data.Tokens))
data['subj_0'], data['subj_1'] = subj_proba[:, 0], subj_proba[:, 1]
data['review_0'], data['review_1'] = review_proba[:, 0], review_proba[:, 1]

# Free the fitted models and their training frames; only the features are kept.
del subj_df, subj_logit, subj_cv
del review_df, review_logit, review_cv

data.head()

In [None]:
# Phrase-level VADER scores, built as one DataFrame with final column names
# (a per-row pd.Series from apply() is slow, and dropping existing copies
# first keeps the cell re-runnable).
vader_cols = ['vader_neg', 'vader_neu', 'vader_pos', 'vader_compound']  # order returned by vader_score()
vader_frame = pd.DataFrame(data.Tokens.map(vader_score).tolist(),
                           index=data.index, columns=vader_cols)
data = data.drop(columns=vader_cols, errors='ignore').join(vader_frame)

# Binary bag-of-words over the normalised phrases.
cv = CountVectorizer(binary=True, max_df=.5, min_df=5,
                     dtype=np.int8).fit(data.Tokens)
X_text = cv.transform(data.Tokens)
print('starting shape:', X_text.shape)

# Integer sequences for sequence models.
tockenizer_app = Tokenizer()
tockenizer_app.fit_on_texts(data.Tokens)

X_seq_list = tockenizer_app.texts_to_sequences(data.Tokens)
# Pad to the length of the longest sequence. The previous expression,
# max(max(X_seq_list, key=len)), returned the largest token id inside the
# longest sequence rather than its length.
seq_len = max(len(seq) for seq in X_seq_list)
X_seq = pad_sequences(X_seq_list, maxlen=seq_len)

# Hand-crafted dense features.
# NOTE(review): 'review_1' is not included while both 'subj_0' and 'subj_1'
# are — confirm this is intentional.
X_other = data[['vader_neg', 'vader_neu', 'vader_pos', 'vader_compound',
                'v_neg', 'v_neu', 'v_pos', 's_neg', 's_pos', 's_obj',
                'nb_words', 'nb_chars', 'rel_nb_words', 'rel_nb_chars',
                'nb_senti_-1', 'nb_senti_0', 'nb_senti_1', 'nb_senti_2', 'nb_senti_3', 'nb_senti_4',
                'subj_0', 'subj_1', 'review_0']].to_numpy()
X = hstack((X_text, csr_matrix(X_other))).tocsr()

# One-hot encode the target as a dense array.
scores_column = data.Score.to_numpy().reshape(-1, 1)
oh = OneHotEncoder().fit(scores_column)
y = oh.transform(scores_column).toarray()

print(X.shape, X_seq.shape)
print(y.shape)
del X_seq_list
# Optional class balancing (left disabled):
# undersample_Xy(X, y)

In [None]:
# Word-level text augmentation demo (nlpaug)

import nlpaug.augmenter.word as naw

# Directory holding the pretrained embedding files. `model_dir` was used below
# without ever being defined in the visible cells; take it from the MODEL_DIR
# environment variable (empty -> path relative to the working directory).
model_dir = os.environ.get("MODEL_DIR", "")
word2vec_path = os.path.join(model_dir, 'GoogleNews-vectors-negative300.bin')

text = 'The quick brown fox jumps over the lazy dog .'
# Substitute word by spelling mistake words dictionary
aug = naw.SpellingAug()
augmented_texts = aug.augment(text, n=3)

# Insert word randomly by word embeddings similarity
# model_type: word2vec, glove or fasttext
aug = naw.WordEmbsAug(
    model_type='word2vec', model_path=word2vec_path,
    action="insert")
augmented_text = aug.augment(text)

# Substitute word by word2vec similarity
# model_type: word2vec, glove or fasttext
aug = naw.WordEmbsAug(
    model_type='word2vec', model_path=word2vec_path,
    action="substitute")
augmented_text = aug.augment(text)

# Substitute word by TF-IDF similarity
aug = naw.TfIdfAug(
    model_path=os.environ.get("MODEL_DIR"),
    action="substitute")
augmented_text = aug.augment(text)

# Insert word by contextual word embeddings (BERT, DistilBERT, RoBERTA or XLNet)
aug = naw.ContextualWordEmbsAug(
    model_path='bert-base-uncased', action="insert")
augmented_text = aug.augment(text)

# Substitute word by contextual word embeddings (BERT, DistilBERT, RoBERTA or XLNet)
aug = naw.ContextualWordEmbsAug(
    model_path='bert-base-uncased', action="substitute")
augmented_text = aug.augment(text)

# Substitute word by WordNet's synonym
aug = naw.SynonymAug(aug_src='wordnet')
augmented_text = aug.augment(text)

# Substitute word by antonym
aug = naw.AntonymAug()
_text = 'Good boy'
augmented_text = aug.augment(_text)

# Back Translation Augmenter (English -> German -> English)
# NOTE(review): every step above overwrites `augmented_text`; only the value of
# the last expression below is displayed by the cell.
back_translation_aug = naw.BackTranslationAug(
    from_model_name='transformer.wmt19.en-de',
    to_model_name='transformer.wmt19.de-en'
)
back_translation_aug.augment(text)