<a href="https://colab.research.google.com/github/sangithajk/Hackathon/blob/master/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Mount Google Drive into the Colab runtime; the review CSVs are read from
# under /content/drive further down.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import pandas as pd

# Folder on the mounted Drive that holds the review CSVs; edit here to relocate the data.
DATA_DIR = "/content/drive/My Drive"

train = pd.read_csv(f"{DATA_DIR}/review_train.csv")
test = pd.read_csv(f"{DATA_DIR}/review_test.csv")

# Tag every row with its split so the origin stays visible after concatenation.
train["source"] = "train"
test["source"] = "test"

# ignore_index=True gives the combined frame a fresh, unique 0..n-1 index.
# Without it the row labels of both inputs are kept as-is, so label-based
# lookups on `dataset` can return rows from both splits.
dataset = pd.concat([train, test], ignore_index=True)

In [0]:
import nltk
# Fetch the 'punkt' tokenizer data; presumably needed by the
# nltk.word_tokenize calls made on the normalized reviews later on.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [0]:
# Peek at the first rows of the combined train+test frame.
dataset.head()

Unnamed: 0,Text,Score,Sentiment,source
0,"I got a free sample of these once, and now--we...",5,1,train
1,I used to get this Tea when I lived in Washing...,4,1,train
2,This is my all time favorite 'grab and go' sna...,5,1,train
3,This flavor is very good and unexpected. The ...,4,1,train
4,thrilled to have this assortment as i got the ...,4,1,train


In [0]:
# (rows, columns) of the combined train+test frame.
dataset.shape

(18532, 4)

In [0]:
# Class balance of the Sentiment label; the recorded run shows the classes are
# imbalanced (15637 rows labelled 1 versus 2895 labelled 0).
dataset.Sentiment.value_counts()

1    15637
0     2895
Name: Sentiment, dtype: int64

In [0]:
# Pull the review text and the sentiment labels out as arrays: once for the
# combined frame, then once per split.
reviews, sentiments = dataset['Text'].values, dataset['Sentiment'].values
train_reviews, train_sentiments = train['Text'].values, train['Sentiment'].values
test_reviews, test_sentiments = test['Text'].values, test['Sentiment'].values


In [0]:
!pip install contractions
!pip install textsearch
!pip install tqdm



In [0]:

import contractions
from bs4 import BeautifulSoup
import numpy as np
import re
import tqdm
import unicodedata


def strip_html_tags(text):
  """Return the visible text of an HTML fragment with line breaks collapsed.

  <iframe> and <script> elements are removed together with their contents,
  the remaining markup is stripped, and every run of carriage returns and
  line feeds is replaced by a single newline.

  Args:
    text: Raw review text, possibly containing HTML markup.

  Returns:
    The plain-text content as a str.
  """
  soup = BeautifulSoup(text, "html.parser")
  # Plain loop instead of a list comprehension used only for its side effect.
  for embedded in soup(['iframe', 'script']):
    embedded.extract()
  stripped_text = soup.get_text()
  # Inside a character class '|' is a literal, not alternation: the previous
  # pattern [\r|\n|\r\n] also matched pipe characters and rewrote them as
  # newlines. [\r\n]+ matches line-break runs only.
  return re.sub(r'[\r\n]+', '\n', stripped_text)

def remove_accented_chars(text):
  """Fold accented characters to their closest ASCII form.

  The text is NFKD-normalized, then every character that cannot be encoded
  as ASCII is discarded.

  Args:
    text: Input string.

  Returns:
    The ASCII-only version of the input.
  """
  decomposed = unicodedata.normalize('NFKD', text)
  ascii_bytes = decomposed.encode('ascii', 'ignore')
  return ascii_bytes.decode('utf-8', 'ignore')

def pre_process_corpus(docs):
  """Normalize raw review texts before feature extraction.

  Each document goes through, in order: HTML stripping, newline/tab/carriage
  return to space, lower-casing, accent folding, contraction expansion,
  removal of every character that is not an ASCII letter, digit or
  whitespace, collapsing of repeated spaces, and trimming.

  Args:
    docs: Iterable of raw document strings.

  Returns:
    A list with one normalized string per input document, in input order.
  """
  # Build the translation table once instead of once per document.
  whitespace_to_space = str.maketrans("\n\t\r", "   ")
  norm_docs = []
  for doc in tqdm.tqdm(docs):
    doc = strip_html_tags(doc)
    doc = doc.translate(whitespace_to_space)
    doc = doc.lower()
    doc = remove_accented_chars(doc)
    doc = contractions.fix(doc)
    # Remove special characters. The flags must be passed by keyword: the
    # fourth positional parameter of re.sub is `count`, so passing re.I|re.A
    # positionally capped the number of removals per document and applied no
    # flags at all.
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, flags=re.I|re.A)
    # Collapse runs of spaces and trim the ends.
    doc = re.sub(' +', ' ', doc)
    doc = doc.strip()
    norm_docs.append(doc)

  return norm_docs

In [0]:
%%time

# Run both splits through the same normalization pipeline (train first, then test).
norm_train_reviews, norm_test_reviews = [
    pre_process_corpus(split) for split in (train_reviews, test_reviews)
]

100%|██████████| 14825/14825 [00:03<00:00, 4368.40it/s]
100%|██████████| 3707/3707 [00:00<00:00, 4414.30it/s]

CPU times: user 4.21 s, sys: 40 ms, total: 4.25 s
Wall time: 4.25 s





In [0]:
%%time

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Bag-of-words counts over unigrams and bigrams; the vocabulary is learned
# from the training reviews only.
cv = CountVectorizer(
    binary=False,
    min_df=5,
    max_df=1.0,
    ngram_range=(1,2),
)
cv_train_features = cv.fit_transform(norm_train_reviews)

# TF-IDF weights over the same n-gram range, with sublinear term-frequency scaling.
tv = TfidfVectorizer(
    use_idf=True,
    min_df=5,
    max_df=1.0,
    ngram_range=(1,2),
    sublinear_tf=True,
)
tv_train_features = tv.fit_transform(norm_train_reviews)

CPU times: user 6.9 s, sys: 131 ms, total: 7.03 s
Wall time: 7.05 s


In [0]:
%%time

# Project the test reviews onto the vocabularies already fitted on the training set.
cv_test_features, tv_test_features = (
    cv.transform(norm_test_reviews),
    tv.transform(norm_test_reviews),
)

CPU times: user 934 ms, sys: 377 µs, total: 934 ms
Wall time: 939 ms


In [0]:
# Report the train/test matrix shapes for each feature family.
print(
    'BOW model:> Train features shape:', cv_train_features.shape,
    ' Test features shape:', cv_test_features.shape,
)
print(
    'TFIDF model:> Train features shape:', tv_train_features.shape,
    ' Test features shape:', tv_test_features.shape,
)

BOW model:> Train features shape: (14825, 36702)  Test features shape: (3707, 36702)
TFIDF model:> Train features shape: (14825, 36702)  Test features shape: (3707, 36702)


In [0]:
# Show the sparse-matrix summary of the bag-of-words training features.
cv_train_features

<14825x36702 sparse matrix of type '<class 'numpy.int64'>'
	with 1386660 stored elements in Compressed Sparse Row format>

In [0]:
%%time

# Baseline classifier: L2-penalised logistic regression on the bag-of-words counts.
from sklearn.linear_model import LogisticRegression

# Configure and train in one expression; fit() returns the estimator itself.
lr = LogisticRegression(
    penalty='l2',
    max_iter=500,
    C=1,
    solver='lbfgs',
    random_state=42,
).fit(cv_train_features, train_sentiments)

# Predicted labels for the test reviews.
lr_bow_predictions = lr.predict(cv_test_features)

CPU times: user 3.93 s, sys: 2.95 s, total: 6.89 s
Wall time: 3.54 s


In [0]:
from sklearn.metrics import confusion_matrix, classification_report

# Per-class precision/recall/F1 as text, followed by the confusion matrix
# rendered as a table.
lr_bow_confusion = confusion_matrix(test_sentiments, lr_bow_predictions)
print(classification_report(test_sentiments, lr_bow_predictions))
pd.DataFrame(lr_bow_confusion)

              precision    recall  f1-score   support

           0       0.84      0.68      0.75       592
           1       0.94      0.97      0.96      3115

    accuracy                           0.93      3707
   macro avg       0.89      0.83      0.85      3707
weighted avg       0.92      0.93      0.92      3707



Unnamed: 0,0,1
0,404,188
1,79,3036


In [0]:
%%time

# Random forest trained on the bag-of-words counts.
from sklearn.ensemble import RandomForestClassifier

# 100 trees, n_jobs=-1 for parallel training, fixed seed.
rf = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
)
rf.fit(cv_train_features, train_sentiments)

# Predicted labels for the test reviews.
rf_bow_predictions = rf.predict(cv_test_features)

CPU times: user 24.9 s, sys: 27.8 ms, total: 24.9 s
Wall time: 12.8 s


In [0]:
# Text report and confusion-matrix table for the random forest on bag-of-words features.
rf_bow_confusion = confusion_matrix(test_sentiments, rf_bow_predictions)
print(classification_report(test_sentiments, rf_bow_predictions))
pd.DataFrame(rf_bow_confusion)

              precision    recall  f1-score   support

           0       0.99      0.17      0.29       592
           1       0.86      1.00      0.93      3115

    accuracy                           0.87      3707
   macro avg       0.93      0.59      0.61      3707
weighted avg       0.88      0.87      0.83      3707



Unnamed: 0,0,1
0,102,490
1,1,3114


In [0]:
%%time

# Random forest on the TF-IDF features.
# Note: this refits the existing `rf` object, so after this cell `rf` holds the
# TF-IDF model and no longer the bag-of-words one.
rf_tfidf_predictions = rf.fit(tv_train_features, train_sentiments).predict(tv_test_features)

CPU times: user 28.4 s, sys: 12.1 ms, total: 28.4 s
Wall time: 14.5 s


In [0]:

# Text report and confusion-matrix table for the random forest on TF-IDF features.
rf_tfidf_confusion = confusion_matrix(test_sentiments, rf_tfidf_predictions)
print(classification_report(test_sentiments, rf_tfidf_predictions))
pd.DataFrame(rf_tfidf_confusion)

              precision    recall  f1-score   support

           0       0.97      0.20      0.33       592
           1       0.87      1.00      0.93      3115

    accuracy                           0.87      3707
   macro avg       0.92      0.60      0.63      3707
weighted avg       0.88      0.87      0.83      3707



Unnamed: 0,0,1
0,119,473
1,4,3111


In [0]:
import gensim
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dropout, Activation, Dense
from sklearn.preprocessing import LabelEncoder

In [0]:
le = LabelEncoder()

# Tokenize the normalized training reviews and learn the label encoding from
# the training labels only.
tokenized_train = [nltk.word_tokenize(text) for text in norm_train_reviews]
y_train = le.fit_transform(train_sentiments)

# Tokenize the test reviews and encode their labels with the mapping learned
# above. Calling fit_transform here would refit the encoder on the test labels,
# which can silently produce a different label-to-integer mapping than the one
# the model is trained with.
tokenized_test = [nltk.word_tokenize(text) for text in norm_test_reviews]
y_test = le.transform(test_sentiments)

In [0]:
import logging
# Emit INFO-level records as "timestamp : level : message"; the Word2Vec
# progress lines recorded in the next cell's output use this format.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


In [0]:
%%time
# build word2vec model
w2v_num_features = 300

# gensim 4 renamed two constructor keywords (size -> vector_size, iter -> epochs).
# Pick the spelling that matches the installed release so this cell runs on both.
if int(gensim.__version__.split('.')[0]) >= 4:
    w2v_version_kwargs = {'vector_size': w2v_num_features, 'epochs': 5}
else:
    w2v_version_kwargs = {'size': w2v_num_features, 'iter': 5}

# Train on the tokenized training reviews only; words seen fewer than 10 times
# are dropped from the vocabulary.
w2v_model = gensim.models.Word2Vec(tokenized_train, window=150, min_count=10,
                                   workers=4, **w2v_version_kwargs)

2019-09-18 05:25:39,599 : INFO : collecting all words and their counts
2019-09-18 05:25:39,600 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-09-18 05:25:39,758 : INFO : PROGRESS: at sentence #10000, processed 786775 words, keeping 29049 word types
2019-09-18 05:25:39,839 : INFO : collected 36775 word types from a corpus of 1167142 raw words and 14825 sentences
2019-09-18 05:25:39,840 : INFO : Loading a fresh vocabulary
2019-09-18 05:25:40,170 : INFO : effective_min_count=10 retains 5110 unique words (13% of original 36775, drops 31665)
2019-09-18 05:25:40,171 : INFO : effective_min_count=10 leaves 1107625 word corpus (94% of original 1167142, drops 59517)
2019-09-18 05:25:40,194 : INFO : deleting the raw counts dictionary of 36775 items
2019-09-18 05:25:40,197 : INFO : sample=0.001 downsamples 57 most-common words
2019-09-18 05:25:40,198 : INFO : downsampling leaves estimated 790586 word corpus (71.4% of prior 1107625)
2019-09-18 05:25:40,221 : INFO : 

CPU times: user 1min 2s, sys: 53.7 ms, total: 1min 2s
Wall time: 32.2 s


In [0]:
def averaged_word2vec_vectorizer(corpus, model, num_features):
    """Turn tokenized documents into fixed-length vectors by averaging word embeddings.

    Args:
        corpus: Iterable of documents, each an iterable of string tokens.
        model: Embedding model exposing ``model.wv``, which supports lookup by
            word and lists its vocabulary as ``index_to_key`` (gensim >= 4) or
            ``index2word`` (older gensim).
        num_features: Dimensionality of the embedding vectors.

    Returns:
        A float64 array of shape ``(number of documents, num_features)``.
        A document with no in-vocabulary token maps to the all-zero vector;
        an empty corpus yields an array of shape ``(0, num_features)``.
    """
    keyed_vectors = model.wv
    # The vocabulary list was renamed in gensim 4; fall back to the old name.
    vocab_list = getattr(keyed_vectors, 'index_to_key', None)
    if vocab_list is None:
        vocab_list = keyed_vectors.index2word
    vocabulary = set(vocab_list)

    def average_word_vectors(words):
        """Mean embedding of the in-vocabulary words, or zeros if there are none."""
        total = np.zeros((num_features,), dtype="float64")
        known = 0
        for word in words:
            if word in vocabulary:
                known += 1
                total = np.add(total, keyed_vectors[word])
        # Guard against division by zero for documents without known words.
        return np.divide(total, known) if known else total

    features = [average_word_vectors(tokenized_sentence) for tokenized_sentence in corpus]
    if not features:
        # np.array([]) would be 1-D; keep the result 2-D so callers can rely on it.
        return np.empty((0, num_features), dtype="float64")
    return np.array(features)

In [0]:
# Averaged-embedding features for both splits, built from the same Word2Vec model.
avg_wv_train_features, avg_wv_test_features = [
    averaged_word2vec_vectorizer(corpus=tokens, model=w2v_model,
                                 num_features=w2v_num_features)
    for tokens in (tokenized_train, tokenized_test)
]

In [0]:
# Report the shapes of the averaged-embedding feature matrices.
print(
    'Word2Vec model:> Train features shape:', avg_wv_train_features.shape,
    ' Test features shape:', avg_wv_test_features.shape,
)

Word2Vec model:> Train features shape: (14825, 300)  Test features shape: (3707, 300)


In [0]:
def construct_deepnn_architecture(num_input_features):
    """Build and compile a fully connected binary classifier.

    Three hidden stages of Dense -> ReLU -> Dropout(0.2), with 512, 256 and
    256 units, feed a single sigmoid output unit. The model is compiled with
    binary cross-entropy loss, the Adam optimizer and the accuracy metric.

    Args:
        num_input_features: Length of each input feature vector.

    Returns:
        The compiled Sequential model.
    """
    network = Sequential()

    for position, units in enumerate((512, 256, 256)):
        if position == 0:
            # Only the first layer declares the input shape.
            network.add(Dense(units, input_shape=(num_input_features,)))
        else:
            network.add(Dense(units))
        network.add(Activation('relu'))
        network.add(Dropout(0.2))

    # Single sigmoid unit for the binary output.
    network.add(Dense(1))
    network.add(Activation('sigmoid'))

    network.compile(loss='binary_crossentropy', optimizer='adam',
                    metrics=['accuracy'])
    return network

In [0]:
# Build the network with an input width equal to the Word2Vec vector size.
w2v_dnn = construct_deepnn_architecture(num_input_features=w2v_num_features)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [0]:
# Print the layer-by-layer structure and parameter counts of the network.
w2v_dnn.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 512)               154112    
_________________________________________________________________
activation (Activation)      (None, 512)               0         
_________________________________________________________________
dropout (Dropout)            (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               131328    
_________________________________________________________________
activation_1 (Activation)    (None, 256)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 256)               6

In [0]:
# Train for 10 epochs in mini-batches of 100 samples; validation_split=0.1
# holds out a tenth of the training rows for validation (1483 of 14825 in the
# recorded run).
batch_size = 100
w2v_dnn.fit(avg_wv_train_features, y_train, epochs=10, batch_size=batch_size, 
            shuffle=True, validation_split=0.1, verbose=1)

Train on 13342 samples, validate on 1483 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f7585519470>

In [0]:
# Sequential.predict_classes is not available in newer TensorFlow/Keras
# releases, so threshold the sigmoid probabilities at 0.5 directly; for a
# single-unit sigmoid output this yields the same class labels.
y_prob = w2v_dnn.predict(avg_wv_test_features)

# predict() returns one column per output unit, i.e. shape (n_samples, 1).
# ravel() flattens it to the 1-D vector that inverse_transform expects, which
# also avoids the column-vector warning the 2-D input triggered.
y_pred = (y_prob > 0.5).astype('int32').ravel()

# Map the encoded 0/1 predictions back to the original sentiment labels.
predictions = le.inverse_transform(y_pred)

  y = column_or_1d(y, warn=True)


In [0]:

# Text report and confusion-matrix table for the Word2Vec + dense network model.
w2v_dnn_confusion = confusion_matrix(test_sentiments, predictions)
print(classification_report(test_sentiments, predictions))
pd.DataFrame(w2v_dnn_confusion)

              precision    recall  f1-score   support

           0       0.63      0.54      0.58       592
           1       0.92      0.94      0.93      3115

    accuracy                           0.88      3707
   macro avg       0.77      0.74      0.76      3707
weighted avg       0.87      0.88      0.87      3707



Unnamed: 0,0,1
0,321,271
1,185,2930
