# Word2Vec & Logistic Regression

In [1]:
# !pip install wordcloud
!pip install imblearn
!pip install gensim
!pip install keras
!pip install tensorflow

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com


In [3]:
# Imports required libraries

# for data wrangling
import numpy as np
import pandas as pd
import re, datetime
import string
import multiprocessing

# for NLP / sentiment analysis
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('vader_lexicon')

# for visualization
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
# from wordcloud import WordCloud

# for model building
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
import gensim
from gensim.models import Word2Vec 
from gensim.models import KeyedVectors

#Keras:
import keras
from keras.utils import pad_sequences
from keras.preprocessing.text import one_hot,Tokenizer
# from keras-preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense , Flatten ,Embedding,Input,CuDNNLSTM,LSTM
from keras.models import Model
from keras.preprocessing.text import text_to_word_sequence

# for local helper functions
import helper_module

# for exporting cleaned data
import os
from os.path import join
from joblib import dump, load


[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/ec2-user/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
2023-04-26 20:45:40.460273: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-04-26 20:45:40.760650: I tensorflow/c

# Import Data
Data cleaned using Google Colab

In [4]:
# Load the pre-cleaned review data from a CSV in the working directory.
# NOTE(review): the file name suggests stop words were already removed ("noSW") — confirm.
reviews_df = pd.read_csv("Reviews_cleanText_noSW_sageMakerLocal.csv")
# Display the number of rows that were loaded.
len(reviews_df)

394052

Convert review text into a list of tokens

In [None]:
def review_to_words(review):
    """Split a cleaned review into a list of whitespace-separated tokens.

    Parameters
    ----------
    review : str, None or NaN
        The cleaned review text. A missing value (None, or the float NaN that
        pandas produces for an empty CSV cell) is treated as an empty review.

    Returns
    -------
    list of str
        The tokens in their original order; an empty list when the review is
        empty or missing.
    """
    # NaN is the only float that is not equal to itself.
    is_nan = isinstance(review, float) and review != review
    if review is None or is_nan:
        return []
    # str.split() already returns a new list, so no extra copy is needed.
    return review.split()

In [None]:
# Add a column holding each review as a list of tokens (one list per row),
# produced by applying review_to_words to the 'cleaned_text' column.
reviews_df['cleaned_text_list'] = reviews_df['cleaned_text'].apply(review_to_words)
# Preview the first rows, including the new column.
reviews_df.head()

Split the data into train and test sets, then up-sample the minority class in the training set only

In [None]:
%%time

# Hold out 20% of the tokenised reviews for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(reviews_df["cleaned_text_list"], 
                                                    reviews_df["Score_class"], 
                                                    test_size=0.2,
                                                    random_state=42)
# Randomly over-sample the training portion only; the test portion is left untouched.
ros = RandomOverSampler(random_state=42)
# The token lists are passed to fit_resample as a single-column 2-D array.
X_res_train, y_res_train = ros.fit_resample(X_train.array.reshape(-1, 1), y_train)

# Pull column 0 of the resampled array back out as a Series of token lists.
X_res_train = pd.Series(X_res_train[:,0])

print("%d items in training data, %d in test data" % (len(X_res_train), len(X_test)))

# Model Building: Embedding

Observe the number of cores available for use

In [None]:
# Number of CPU cores reported by the machine running this notebook.
core_count = multiprocessing.cpu_count() # Count the number of cores in a computer
print(f'{core_count} cores are available for use')

**The code below for "model" was manually adjusted for every run of the hyper-parameter tuning:**

min_count: [10, 20, 50]

window: [2, 4]

In [None]:
# Initialise an untrained Word2Vec model; vocabulary building and training happen in the following cells.
# min_count and window are the two hyper-parameters adjusted by hand between tuning runs.
# vector_size=300 must match the feature size passed to get_avg_feature_vecs later on.
# NOTE(review): workers is fixed at 4 rather than derived from core_count — confirm this is intended.
model = Word2Vec(min_count=10,
                 window=4,
                 vector_size=300,
                 sample=6e-5, 
                 workers= 4)

In [None]:
%%time
# Build the Word2Vec vocabulary from the over-sampled training reviews (lists of tokens).
model.build_vocab(X_res_train)


In [None]:
%%time
# Train the embeddings on the over-sampled training reviews for 10 epochs.
# total_examples is read from model.corpus_count, which is presumably populated
# by the preceding build_vocab call — TODO confirm.
model.train(X_res_train, 
            total_examples = model.corpus_count, 
            epochs=10, 
            report_delay=1)


In [None]:
# Persist only the trained word vectors (model.wv); the file name encodes the
# hyper-parameters of this run (min_count=10, window=4).
from gensim.models import KeyedVectors
model.wv.save('word2vec_min10_win4.kv')
# To reload the vectors saved above:
# reloaded_word_vectors = KeyedVectors.load('word2vec_min10_win4.kv')

In [None]:
%%time 
#https://www.kaggle.com/code/arunava21/word2vec-and-random-forest-classification

def make_feature_vec(words, model, num_features):
    """
    Average the word vectors for a set of words.

    Parameters
    ----------
    words : iterable of str
        Tokens of one document.
    model : object exposing ``wv``
        Trained embedding model; ``model.wv.key_to_index`` is used for
        vocabulary membership and ``model.wv[word]`` for vector lookup.
    num_features : int
        Dimensionality of the word vectors.

    Returns
    -------
    numpy.ndarray
        float32 vector of length ``num_features``: the mean of the vectors of
        all in-vocabulary words, or all zeros when no word is known.
    """
    word_vectors = model.wv
    # Test membership directly on the key->index mapping. Copying the whole
    # vocabulary into a new set on every call (i.e. once per review) was
    # repeated work with no effect on the result.
    known_words = word_vectors.key_to_index

    feature_vec = np.zeros((num_features,), dtype="float32")  # accumulator
    nwords = 0.
    for word in words:
        if word in known_words:
            nwords += 1.
            feature_vec = np.add(feature_vec, word_vectors[word])

    # e.g. summaries may contain only words the model never saw; avoid 0/0
    if nwords == 0:
        nwords = 1

    return np.divide(feature_vec, nwords)


def get_avg_feature_vecs(reviews, model, num_features):
    """
    Build the matrix of averaged word vectors, one row per review.

    Each review (an iterable of tokens) is reduced to a single vector by
    make_feature_vec; the result is a float32 array of shape
    (len(reviews), num_features).
    """
    # allocate the full output up front and fill it row by row
    review_feature_vecs = np.zeros((len(reviews), num_features), dtype='float32')
    for row, review in enumerate(reviews):
        review_feature_vecs[row] = make_feature_vec(review, model, num_features)
    return review_feature_vecs

In [None]:
%%time
# Averaged Word2Vec feature vectors for the (over-sampled) training reviews.
# The Series is materialised as a plain list before vectorising.
clean_train_reviews = list(X_res_train)
trainDataVecs = get_avg_feature_vecs(clean_train_reviews, model, 300)

In [None]:
%%time
# Averaged Word2Vec feature vectors for the held-out test reviews.
clean_test_reviews = list(X_test)
testDataVecs = get_avg_feature_vecs(clean_test_reviews, model, 300)

In [None]:
# trainDataVecs_df = pd.DataFrame(trainDataVecs)

In [None]:
# testDataVecs_df = pd.DataFrame(testDataVecs)

In [None]:
# testDataVecs_df.to_csv("testDataVecs.csv", index = False)
# trainDataVecs_df.to_csv("trainDataVecs_df.csv", index = False)

# Model Building: Prediction

In [None]:
%%time
# Fit a logistic-regression classifier on the averaged Word2Vec vectors of the training reviews.
# max_iter is raised to 1000, presumably so the solver can converge — TODO confirm.
logreg = LogisticRegression(max_iter = 1000)
logreg = logreg.fit(trainDataVecs, y_res_train)
# Keep the test-set predictions in a dict keyed by model name.
prediction = dict()
prediction['Logistic'] = logreg.predict(testDataVecs)

In [None]:
# Score the test predictions with the project helper; it returns five values which,
# judging by the names they are bound to, are ROC AUC, accuracy, F1, recall and a
# confusion matrix — confirm against helper_module.model_eval.
auc, acc, f1, recall, cm = helper_module.model_eval(y_test, prediction["Logistic"])

In [None]:
# Record this run's scores in 'Model_results.csv' via the project helper.
# The model name encodes the Word2Vec hyper-parameters used above (min_count=10, window=4).
# first_entry=False presumably appends to an existing results file — confirm in helper_module.
name = "W2V_LogReg_minWord10_win4"
helper_module.add_model_scores_to_results(file_path = 'Model_results.csv', 
                            model_name = name, 
                            datashift = 'test', 
                            with_sw = 0,
                            ROC_AUC = auc, accuracy = acc, 
                            f1 = f1, recall=recall, cm = cm, first_entry=False)

# Data Shift

In [None]:
# Summary Performance: evaluate the classifier trained on review text against
# the review summaries of the held-out rows.
# Same test_size and random_state as the main split.
X_train_sum, X_test_sum, y_train_sum, y_test_sum = train_test_split(reviews_df["cleaned_summary"], 
                                                    reviews_df["Score_class"], 
                                                    test_size=0.2,
                                                    random_state=42)

# Tokenise every summary before averaging. Passing the raw strings would make
# make_feature_vec iterate over single characters instead of words, so almost
# nothing would be found in the Word2Vec vocabulary. Missing summaries are
# treated as empty text.
clean_test_summary = list(
    X_test_sum.fillna("").astype(str).apply(review_to_words)
)

testSumVecs = get_avg_feature_vecs(clean_test_summary, model, 300)

# predict on summary
prediction = dict()
prediction['Logistic'] = logreg.predict(testSumVecs)

# get prediction scores against the labels that belong to this split
auc, acc, f1, recall, cm = helper_module.model_eval(y_test_sum, prediction["Logistic"])

# save result
name = "W2V_LogReg_minWord10_win4"
helper_module.add_model_scores_to_results(file_path = 'Model_results.csv', 
                            model_name = name, 
                            datashift = 'summary', 
                            with_sw = 0,
                            ROC_AUC = auc, accuracy = acc, 
                            f1 = f1, recall=recall, cm = cm, first_entry=False)

In [None]:
# dropout performance: re-score the fitted classifier on test reviews that were
# passed through helper_module.random_dropout at several rates p
X_train, X_test, y_train, y_test = train_test_split(
    reviews_df["cleaned_text"],
    reviews_df["Score_class"],
    test_size=0.2,
    random_state=42,
)
for i in [0.1, 0.25, 0.5]:
    # perturb the raw text first, then tokenise the perturbed text
    X_test_dropout = X_test.apply(helper_module.random_dropout, p=i, random_state=42)
    X_test_dropout_list = X_test_dropout.apply(review_to_words)
    clean_test_review = list(X_test_dropout_list)
    testDataVecs = get_avg_feature_vecs(clean_test_review, model, 300)

    prediction = {'Logistic': logreg.predict(testDataVecs)}

    auc, acc, f1, recall, cm = helper_module.model_eval(y_test, prediction["Logistic"])

    # one result row per dropout rate
    name = f'W2V_LogReg_minWord10_win+Dropout{i}'
    helper_module.add_model_scores_to_results(
        file_path='Model_results.csv',
        model_name=name,
        datashift=f'dropout_{i}',
        with_sw=0,
        ROC_AUC=auc,
        accuracy=acc,
        f1=f1,
        recall=recall,
        cm=cm,
        first_entry=False,
    )

# Build Embedding for LSTM

In [None]:
# max_doc_len=1951  # max length of a review
# NOTE(review): `tok` is not defined anywhere in the visible notebook — presumably a fitted
# keras Tokenizer from a cell that is missing here; confirm before running on a fresh kernel.
vocab_size = len(tok.word_index) + 1  # number of tokenizer words plus one extra row
embed_dim = 300 # embedding dimension as chosen in the Word2Vec constructor (vector_size)
print(f'Total number of words tokenized: {vocab_size}.')

In [None]:
# List of all words in the vocabulary of the trained Word2Vec model.
vocab = model.wv.index_to_key
print("The total number of words learned by Word2Vec model is : ",len(vocab))

In [None]:
# Map every Word2Vec vocabulary word to its embedding vector.
word_vec_dict = {token: model.wv.get_vector(token) for token in vocab}
print("The no of key-value pairs : ",len(word_vec_dict)) # should come equal to vocab size
  

In [None]:
# Build the embedding matrix: row i holds the Word2Vec vector of the word that
# the tokenizer maps to index i.
embed_matrix = np.zeros(shape=(vocab_size,embed_dim))
for word,i in tok.word_index.items():
    embed_vector = word_vec_dict.get(word)
    if embed_vector is not None:  # word is in the vocabulary learned by the w2v model
        embed_matrix[i]=embed_vector
  # if the word is not found, the row corresponding to it stays all zeros.
print("The shape of embed_matrix : ",embed_matrix.shape)

In [None]:
# Save the embedding matrix to CSV so it can be reused without retraining Word2Vec.
embed_matrix_df = pd.DataFrame(embed_matrix)
embed_matrix_df.to_csv("embed_matrix.csv", index = False)
# To reload the matrix saved above:
# embed_matrix = pd.read_csv("embed_matrix.csv").to_numpy()

# References:

**Gensim Word2Vec Tutorial** 
Provides the basic guidelines for Word2Vec
https://www.kaggle.com/code/pierremegret/gensim-word2vec-tutorial 

**word2vec and random forest classification** 
Provides the approach for averaging word vectors into one feature vector per review
https://www.kaggle.com/code/arunava21/word2vec-and-random-forest-classification 

**Amazon Fine Food Reviews: Sentiment Analysis.**
Provides the basic guidelines for LSTM. https://www.kaggle.com/code/chirag9073/amazon-fine-food-reviews-sentiment-analysis 



In [None]:
# Randomly over-sample the training data for the LSTM model.
# NOTE(review): `x_train` (lower-case) is not defined anywhere in the visible notebook, which
# only defines `X_train` — presumably the tokenised/padded LSTM inputs from a missing cell; confirm.
ros = RandomOverSampler(random_state=42)
X_res_train, y_res_train = ros.fit_resample(x_train, y_train)

# X_res_train = pd.Series(X_res_train[:,0])

# print("%d items in training data, %d in test data" % (len(X_res_train), len(X_test)))


In [None]:
# https://www.kaggle.com/code/chirag9073/amazon-fine-food-reviews-sentiment-analysis
def build_model(embedding_matrix):
    """Build and compile a bidirectional-LSTM classifier on frozen pre-trained embeddings.

    Parameters
    ----------
    embedding_matrix : numpy.ndarray
        2-D array whose shape is unpacked into the Embedding layer's
        (input_dim, output_dim) and whose values initialise its weights.

    Returns
    -------
    keras.models.Model
        Compiled model: variable-length integer sequences in, 5-way softmax out,
        trained with categorical cross-entropy and the Adam optimiser.
    """
    # These names are not part of the notebook's top-level keras imports; import
    # them here so the function does not raise NameError when called.
    from keras.layers import (
        Bidirectional,
        GlobalAveragePooling1D,
        GlobalMaxPooling1D,
        SpatialDropout1D,
        concatenate,
    )

    words = Input(shape=(None,))
    # Embedding weights come from the supplied matrix and are not updated during training.
    x = Embedding(*embedding_matrix.shape, weights=[embedding_matrix], trainable=False)(words)
    x = SpatialDropout1D(0.3)(x)
    x = Bidirectional(LSTM(256, return_sequences=True))(x)

    # Summarise the per-timestep outputs with both max- and average-pooling.
    hidden = concatenate([
        GlobalMaxPooling1D()(x),
        GlobalAveragePooling1D()(x),
    ])
    hidden = Dense(512, activation='relu')(hidden)

    result = Dense(5, activation='softmax')(hidden)

    model = Model(inputs=words, outputs=result)
    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy']
    )

    return model

In [None]:
# ModelCheckpoint is not among the notebook's top-level imports.
from keras.callbacks import ModelCheckpoint

# Use the embedding matrix built earlier in this notebook (`embed_matrix`);
# the name `embedding_matrix` is never defined at notebook level.
# NOTE(review): this rebinds `model`, which until now held the Word2Vec model.
model = build_model(embed_matrix)
model.summary()

# Keep only the best model seen so far. The model is compiled with the metric
# 'accuracy', so the validation metric is logged as 'val_accuracy' (the key the
# plotting cell reads); monitoring 'val_acc' would never find a value to compare.
checkpoint = ModelCheckpoint(
    'model.h5', 
    monitor='val_accuracy', 
    verbose=1, 
    save_best_only=True, 
    save_weights_only=False,
    mode='auto'
)

# NOTE(review): `x_train` is not defined in the visible notebook — presumably the
# tokenised/padded sequences from a missing cell; confirm.
history = model.fit(
    x_train,
    y_train,
    batch_size=512,
    callbacks=[checkpoint],
    epochs=10,
    validation_split=0.1
)

In [None]:
# Plot training & validation accuracy values.
# Each curve is labelled so the two lines can be told apart in the figure.
plt.plot(history.history['val_accuracy'], label='Validation')
plt.plot(history.history['accuracy'], label='Train')
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()
plt.show()

In [None]:
# Plot training & validation loss values.
# Each curve is labelled so the two lines can be told apart in the figure.
plt.plot(history.history['val_loss'], label='Validation')
plt.plot(history.history['loss'], label='Train')
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend()
plt.show()

In [6]:
%%time

# Same 80/20 split as earlier in the notebook, but on the raw 'cleaned_text' strings
# rather than the token lists.
X_train, X_test, y_train, y_test = train_test_split(reviews_df["cleaned_text"], 
                                                    reviews_df["Score_class"], 
                                                    test_size=0.2,
                                                    random_state=42)
# Randomly over-sample the training portion only.
ros = RandomOverSampler(random_state=42)
X_res_train, y_res_train = ros.fit_resample(X_train.array.reshape(-1, 1), y_train)

# The conversion back to a Series is disabled here, so X_res_train stays the
# single-column array returned by fit_resample.
# X_res_train = pd.Series(X_res_train[:,0])

print("%d items in training data, %d in test data" % (len(X_res_train), len(X_test)))

492018 items in training data, 78811 in test data
CPU times: user 171 ms, sys: 35.9 ms, total: 206 ms
Wall time: 207 ms


In [8]:
!pip install flair

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting flair
  Downloading flair-0.12.2-py3-none-any.whl (373 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m373.1/373.1 kB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers[sentencepiece]>=4.18.0
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m98.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hCollecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m99.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting ftfy
  Downloading ftfy-6.1.1-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sqlitedict>=1.6.0
  Downloading

In [22]:
from flair.data import Sentence
from flair.nn import Classifier

In [18]:
# Load flair's pre-trained sentiment classifier (the model name requested is
# 'en-sentiment'; this is not an NER tagger).
# NOTE(review): the recorded output of this cell is a FileNotFoundError for
# 'en-sentiment', so `tagger` was never created — the model name needs checking.
tagger = Classifier.load('en-sentiment')

def predict(x):
    """Run the loaded flair classifier on one piece of text.

    Parameters
    ----------
    x : str
        Raw text to classify.

    Returns
    -------
    flair.data.Sentence
        The sentence object after prediction. flair attaches the predicted
        labels to the sentence in place; ``tagger.predict`` itself returns
        None unless a loss is requested, so returning its result gave callers
        nothing to work with.
    """
    # make a sentence
    sentence = Sentence(x)
    tagger.predict(sentence)
    return sentence
    

FileNotFoundError: [Errno 2] No such file or directory: 'en-sentiment'

In [14]:
# Second attempt at loading the sentiment classifier, using the name 'sentiment'.
# NOTE(review): the recorded output is again a FileNotFoundError, so `tagger` is still undefined.
tagger = Classifier.load('sentiment')



FileNotFoundError: [Errno 2] No such file or directory: 'sentiment'

In [13]:
# Smoke test: classify one short example sentence.
sentence = Sentence("the food is bad")

# run the loaded classifier over the sentence (sentiment model, not NER)
pred = tagger.predict(sentence)

# print the sentence with all annotations
print(sentence)
# NOTE(review): flair's predict presumably annotates `sentence` in place — confirm what `pred` holds.
print(pred)

NameError: name 'tagger' is not defined