## P1 Sentimental Analysis
**CSC791 - Fall 2020 term**       
**Shreyas Muralidhara (schikkb)**

Implementing word vectors to classify sentences based on the sentiment they express.

In [113]:
#Identify the necessary libraries
import numpy as np
import pandas as pd
import xlrd
import nltk
nltk.download('punkt')
nltk.download('wordnet')
import re
import gensim

from gensim.utils import simple_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, f1_score,accuracy_score

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,TfidfTransformer
from sklearn.preprocessing import normalize, StandardScaler, RobustScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC,LinearSVC
from sklearn.linear_model import LogisticRegression

from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
#Import training data into the dataframe
input_df = pd.read_excel("P1_training.xlsx", index_column=None,header=0)

### Baseline 1 - Word2Vec
#### Step 1 - Tokenize sentences into word tokens

In [None]:
token_sentences = []
for sentence in input_df['sentence']:
    token_sentences.append(list(gensim.utils.simple_tokenize(sentence)))
input_df['word_tokens'] = token_sentences

#printing label distribution of the input data
print('Label distribution for the training dataset:\n',input_df['label'].value_counts())

# Split the data into train and validation
X_train, X_val, Y_train, Y_val = train_test_split(input_df['word_tokens'], input_df['label'], test_size=0.2, shuffle = True, stratify = input_df['label'], random_state=0)

print(X_train.shape, X_val.shape, Y_train.shape, Y_val.shape)

Label distribution for the training dataset:
 1    736
2    661
0    263
Name: label, dtype: int64
(1328,) (332,) (1328,) (332,)


#### Step 2 - Compute word vectors for training and validation sets, using word2vec-skip gram, for each word token and average these word vectors to generate the vector for the sentence. 

In [None]:
# Compute word vectors for the train dataset
w2vmodel = Word2Vec(X_train,size=100,window=5,min_count=4,sg=1)
print('Details of the model generated -',w2vmodel)

#Average the word vectors generated for sentence vector
X_train_vec = []
for sent_token in X_train:
    #count the words for which the vectors were generated  
    ctr = 0
    word_token = np.empty((100),int)
    for word in sent_token:
      if word in w2vmodel.wv.vocab:
        word_token = word_token + np.array(w2vmodel[word])
        ctr = ctr + 1  
    
    word_token = word_token/ctr
    X_train_vec.append(word_token)
    
X_train_vec = pd.DataFrame(X_train_vec)
print('Shape of training set sentence vectors -',X_train_vec.shape)


#Average the word vectors generated for sentence vector
X_val_vec = []
for sent_token in X_val:
    #count the words for which the vectors were generated  
    ctr = 0
    word_token = np.empty((100),int)
    for word in sent_token:
      if word in w2vmodel.wv.vocab:
        word_token = word_token + np.array(w2vmodel[word])
        ctr = ctr + 1  
    
    word_token = word_token/ctr
    X_val_vec.append(word_token)
    
X_val_vec = pd.DataFrame(X_val_vec)
print('Shape of valuidation set sentence vectors -',X_val_vec.shape)

Details of the model generated - Word2Vec(vocab=2188, size=100, alpha=0.025)


  del sys.path[0]


Shape of training set sentence vectors - (1328, 100)
Shape of valuidation set sentence vectors - (332, 100)




#### Step 3 - Train and validate using the classifier for each sentence in 0,1 & 2

In [None]:
# Fit the model for the classifier
clf_word2vec = DecisionTreeClassifier().fit(X_train_vec, Y_train)

# Predicting the class labels for validation data
Y_val_pred = clf_word2vec.predict(X_val_vec)

print('Baseline Model 1 - Word2Vec validation metrics:\nAccuracy -',round(accuracy_score(Y_val,Y_val_pred),4))
print('f1 score -', round(f1_score(Y_val,Y_val_pred,labels=None, pos_label=1, average='weighted'),4))
print('Classification Report:\n',classification_report(Y_val, Y_val_pred))

Baseline Model 1 - Word2Vec validation metrics:
Accuracy - 0.4759
f1 score - 0.4323
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        53
           1       0.48      0.67      0.56       147
           2       0.48      0.45      0.46       132

    accuracy                           0.48       332
   macro avg       0.32      0.37      0.34       332
weighted avg       0.40      0.48      0.43       332



### Testing the Baseline1 - Word2vec model and storing the result.

In [None]:
#importing the test data into the dataframe
test_df = pd.read_excel("P1_testing.xlsx", index_column=None,header=0)

#tokenize test sentences into word token
token_sentences = []
for sentence in test_df['sentence']:
    token_sentences.append(list(gensim.utils.simple_tokenize(sentence)))

#Average the word vectors generated for sentence vector
test_vec = []
for sent_token in token_sentences:
    #count the words for which the vectors were generated  
    ctr = 0
    word_token = np.empty((100),int)
    for word in sent_token:
      if word in w2vmodel.wv.vocab:
        word_token = word_token + np.array(w2vmodel[word])
        ctr = ctr + 1  
    
    word_token = word_token/ctr
    test_vec.append(word_token)
    
test_vec = pd.DataFrame(test_vec)
#print('Shape of test set sentence vectors -',test_vec.shape)

#printing label distribution of the input data
#print('Label distribution for the test dataset:\n',input_df['label'].value_counts())

# Predicting the class labels for validation data
Y_test_pred = clf_word2vec.predict(test_vec)

print('Baseline Model 1 - Word2Vec Test metrics:\nAccuracy -',round(accuracy_score(test_df['label'],Y_test_pred),4))
print('f1 score -', round(f1_score(test_df['label'],Y_test_pred,labels=None, pos_label=1, average='weighted'),4))
print('Classification Report:\n',classification_report(test_df['label'], Y_test_pred))

test_df = test_df.rename(columns={'label':'gold_label'})
test_df['predicted_label'] = Y_test_pred
test_df.to_csv('testing_output_word2vec.csv', index=False)

  _warn_prf(average, modifier, msg_start, len(result))


Baseline Model 1 - Word2Vec Test metrics:
Accuracy - 0.4524
f1 score - 0.3335
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        82
           1       0.45      0.93      0.60       303
           2       0.52      0.09      0.15       298

    accuracy                           0.45       683
   macro avg       0.32      0.34      0.25       683
weighted avg       0.43      0.45      0.33       683



### Baseline 2 - Tf-Idf (Term frequency - Inverse Document frequency)
#### Step 1 - Convert the collection of sentences to matrix of token counts using Count Vectorizer

#### Step 2 - Transform the count matrix into normalized TF or TF-IDF form.

#### Step 3 - Train and validate using the classifier for each sentence

In [None]:
# Split the sentences into training and validation by stratifying the samples.
X_train, X_val, Y_train, Y_val = train_test_split(input_df['sentence'], input_df['label'], test_size=0.2, shuffle = True, stratify = input_df['label'], random_state=0)
#print(X_train.shape, X_val.shape, Y_train.value_counts(), Y_val.value_counts())

# Step 1 - Generate the token matrix
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
X_val_counts = count_vect.transform(X_val)

# Step 2 - Transform the count matrix to TF-IDF
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_val_tfidf = tfidf_transformer.transform(X_val_counts)

# Step 3 - Train and validate the model using Naive Bayes Classifier
clf = MultinomialNB().fit(X_train_tfidf, Y_train)
Y_val_pred = clf.predict(X_val_tfidf)

print('Baseline Model 2 - TF-IDF validation metrics:\nAccuracy -',round(accuracy_score(Y_val,Y_val_pred),4))
print('f1 score -', round(f1_score(Y_val,Y_val_pred,labels=None, pos_label=1, average='weighted'),4))
print('Classification Report:\n',classification_report(Y_val, Y_val_pred))


Baseline Model 2 - TF-IDF validation metrics:
Accuracy - 0.5151
f1 score - 0.4707
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        53
           1       0.54      0.62      0.58       147
           2       0.49      0.61      0.54       132

    accuracy                           0.52       332
   macro avg       0.34      0.41      0.37       332
weighted avg       0.43      0.52      0.47       332



  _warn_prf(average, modifier, msg_start, len(result))


### Testing the Baseline 2 - Tf-Idf (Term frequency inverse document frequency) and storing the result.

In [144]:
#importing the test data into the dataframe
test_df = pd.read_excel("P1_testing.xlsx", index_column=None,header=0)

X_test_counts = count_vect.transform(test_df['sentence'])
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

Y_test_pred = clf.predict(X_test_tfidf)

print('Baseline Model 2 - TF-IDF Test metrics:\nAccuracy -',round(accuracy_score(test_df['label'],Y_test_pred),4))
print('f1 score -', round(f1_score(test_df['label'],Y_test_pred,labels=None, pos_label=1, average='weighted'),4))
print('Classification Report:\n',classification_report(test_df['label'], Y_test_pred))

test_df = test_df.rename(columns={'label':'gold_label'})
test_df['predicted_label'] = Y_test_pred
test_df.to_csv('testing_output_TF-IDF.csv', index=False)

Baseline Model 2 - TF-IDF Test metrics:
Accuracy - 0.5944
f1 score - 0.5562
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        82
           1       0.59      0.71      0.64       303
           2       0.60      0.64      0.62       298

    accuracy                           0.59       683
   macro avg       0.40      0.45      0.42       683
weighted avg       0.52      0.59      0.56       683



  _warn_prf(average, modifier, msg_start, len(result))


### Proposed Solution - Doc2Vec Model

Preprocessing the input data includes
  1. Removing Punctuations and replacing by blanks
  2. Perform stemming using Porter Stemmer
  3. Perform wordnet lemmatozation using nltk.wordnet

In [134]:
# Perform text Preprocessing - Stemming using Porter Stemmer
preprocessed_sentence = []
for sentence in input_df['sentence']:
    lemma_sentence = []
    # Removing punctuations from the sentence
    sentence = re.sub(r'[^0-9A-Za-z]+', ' ', sentence)
    for word in word_tokenize(sentence):
        # Replace the word with stem word 
        stem_word = PorterStemmer().stem(word)

        #Performing lemmmatization on stem words
        lemma_sentence.append(WordNetLemmatizer().lemmatize(stem_word))
        
    preprocessed_sentence.append(lemma_sentence)


#Split the input data into Training and validation sets
X_train, X_val, Y_train, Y_val = train_test_split(preprocessed_sentence, input_df['label'], test_size=0.2, shuffle = True, stratify = input_df['label'], random_state=0)


#### Step 1 - Represent each sentence as tagged dcoument containing list of words and their associated tags in it.

#### Step 2 - Define the model and build the vocab using training set.

#### Step 3 - Generate the document vectors for training and validation dataset.

In [137]:
# Step 1 - Generating tagged documents with list of words and their associated tags
X_train_tagged = [TaggedDocument(d, [i]) for i, d in enumerate(X_train)]
X_val_tagged = [TaggedDocument(d, [i]) for i, d in enumerate(X_val)]


# Step 2 - Define the model and build the vocab using training set
d2vmodel = Doc2Vec(min_count =1,vector_size=100, epochs=20)
d2vmodel.build_vocab(X_train_tagged)

# Step 3 - Generate the document vectors for training and validation dataset
X_train_doc_vectors = []
for d in X_train_tagged:  
    X_train_doc_vectors.append(d2vmodel.infer_vector(d.words))

X_train_doc_vectors = pd.DataFrame(X_train_doc_vectors)

X_val_doc_vectors = []
for d in X_val_tagged:
    X_val_doc_vectors.append(d2vmodel.infer_vector(d.words))

X_val_doc_vectors = pd.DataFrame(X_val_doc_vectors)

print(X_train_doc_vectors.shape,X_val_doc_vectors.shape)

(1328, 100) (332, 100)


#### Step 4 - Train and validate the model using the sentence vectors

In [138]:
# Fit the model for the classifier
clf_word2vec = SVC().fit(X_train_doc_vectors, Y_train)

# Predicting the class labels for validation data
Y_val_pred = clf_word2vec.predict(X_val_doc_vectors)

print('Baseline Model 1 - Word2Vec validation metrics:\nAccuracy -',round(accuracy_score(Y_val,Y_val_pred),4))
print('f1 score -', round(f1_score(Y_val,Y_val_pred,labels=None, pos_label=1, average='weighted'),4))
print('Classification Report:\n',classification_report(Y_val, Y_val_pred))

Baseline Model 1 - Word2Vec validation metrics:
Accuracy - 0.4699
f1 score - 0.424
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        53
           1       0.48      0.67      0.56       147
           2       0.46      0.43      0.45       132

    accuracy                           0.47       332
   macro avg       0.31      0.37      0.33       332
weighted avg       0.39      0.47      0.42       332



  _warn_prf(average, modifier, msg_start, len(result))


### Testing the Proposed Solution -  Doc2Vec(with Lemmatization and Stemming ) and storing the result.

In [146]:
#importing the test data into the dataframe
test_df = pd.read_excel("P1_testing.xlsx", index_column=None,header=0)

# Perform text Preprocessing - Stemming using Porter Stemmer
preprocessed_sentence = []
for sentence in test_df['sentence']:
    lemma_sentence = []
    # Removing punctuations from the sentence
    sentence = re.sub(r'[^0-9A-Za-z]+', ' ', sentence)
    for word in word_tokenize(sentence):
        # Replace the word with stem word 
        stem_word = PorterStemmer().stem(word)

        #Performing lemmmatization on stem words
        lemma_sentence.append(WordNetLemmatizer().lemmatize(stem_word))
        
    preprocessed_sentence.append(lemma_sentence)


# Generating tagged documents for test data
X_test_tagged = [TaggedDocument(d, [i]) for i, d in enumerate(preprocessed_sentence)]

# Generate document vectors for test dataset
X_test_doc_vectors = []
for d in X_test_tagged:  
    X_test_doc_vectors.append(d2vmodel.infer_vector(d.words))

X_test_doc_vectors = pd.DataFrame(X_test_doc_vectors)

print('Proposed Solution 2 - Doc2Vec Test metrics:\nAccuracy -',round(accuracy_score(test_df['label'],Y_test_pred),4))
print('f1 score -', round(f1_score(test_df['label'],Y_test_pred,labels=None, pos_label=1, average='weighted'),4))
print('Classification Report:\n',classification_report(test_df['label'], Y_test_pred))

test_df = test_df.rename(columns={'label':'gold_label'})
test_df['predicted_label'] = Y_test_pred
test_df.to_csv('testing_output_Doc2Vec.csv', index=False)

Proposed Solution 2 - Doc2Vec Test metrics:
Accuracy - 0.5944
f1 score - 0.5562
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        82
           1       0.59      0.71      0.64       303
           2       0.60      0.64      0.62       298

    accuracy                           0.59       683
   macro avg       0.40      0.45      0.42       683
weighted avg       0.52      0.59      0.56       683



  _warn_prf(average, modifier, msg_start, len(result))
