# TF-IDF 
### Mobile Review

### Introduction

### Import required packages

In [2]:
import numpy as np
import re
import pandas as pd
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report


In [None]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

**Load dataset**

In [3]:
# Load dataset
_df = pd.read_csv('../data/amazon_reviews_processed.csv').dropna()
_df.drop(_df.columns[_df.columns.str.contains('unnamed',case = False)],axis = 1, inplace = True)
_df.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes,Sentiment
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,"feel lucky found used phone us used hard all, ...",1.0,POSITIVE
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"nice phone, nice grade pantach revue. clean se...",0.0,POSITIVE
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,pleased,0.0,POSITIVE
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,works good goes slow sometimes good phone love,0.0,POSITIVE
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,great phone replace lost phone. thing volume b...,0.0,POSITIVE


In [4]:
_df2 = pd.read_csv('../data/flipkar_reviews_processed.csv').dropna()
_df2.drop(_df2.columns[_df2.columns.str.contains('unnamed',case = False)],axis = 1, inplace = True)
_df2.head()

Unnamed: 0,Product Name,Reviews,Rating,Sentiment
0,"SAMSUNG Galaxy M01 (Black, 32 GB)",recently gifted cell dad... obvious choice 1. ...,4,POSITIVE
1,"SAMSUNG Galaxy M01 (Black, 32 GB)",truly satisfied performance phone.first budget...,5,POSITIVE
2,"SAMSUNG Galaxy M01 (Black, 32 GB)",gifted mom... good normal user... heavy user g...,4,POSITIVE
3,"SAMSUNG Galaxy M01 (Black, 32 GB)",good phone extremely liked good perfomance sup...,5,POSITIVE
4,"SAMSUNG Galaxy M01 (Black, 32 GB)",phone good simple purpose still value money ma...,3,NEUTRAL


In [None]:
# Load dataset
_df = pd.read_csv('../data/mobile_reviews.csv').dropna()
_df.drop(_df.columns[_df.columns.str.contains('unnamed',case = False)],axis = 1, inplace = True)
_df.head()

In [None]:
_df["Product Name"].value_counts()

In [4]:
_df["Rating"].value_counts()

5    347280
4    108560
1     91000
3     50200
2     32399
Name: Rating, dtype: int64

In [5]:
_df["Sentiment"].value_counts()

POSITIVE    455840
NEGATIVE    123399
NEUTRAL      50200
Name: Sentiment, dtype: int64

**TF-IDF**

In [5]:
def contractions(s):
 s = re.sub(r"won’t", "will not",s)
 s = re.sub(r"can\’t", "can not",s)
 s = re.sub(r"n\’t", " not", s)
 s= re.sub(r"\’re", " are", s)
 s = re.sub(r"\’s", " is", s)
 s = re.sub(r"\’ll", " will", s)
 s = re.sub(r"\’t", " not", s)
 s = re.sub(r"\’ve", " have", s)
 s = re.sub(r"\’m", " am", s)
 return s

In [6]:
def preprocess(df):
    #lower case word by word
    df["pre_process"] = df["Reviews"].apply(lambda x: "".join(x.lower() for x in str(x).split()))
    #Beautify Text
    df["pre_process"] = df["pre_process"].apply(lambda x: BeautifulSoup(x).get_text())
    #Remove URL
    df["pre_process"] = df["pre_process"].apply(lambda x: re.sub(r"http\S+", "", x))
    #Expand contracted word
    df["pre_process"] = df["pre_process"].apply(lambda x:contractions(x))
    #Remove non-alpha characters
    df["pre_process"] = df["pre_process"].apply(lambda x: " ".join([re.sub("[^A-Za-z]+","", x) for x in nltk.word_tokenize(x)]))
    #Remove the extra spaces between the word_s
    df["pre_process"] = df["pre_process"].apply(lambda x: re.sub(" +", " ", x))
    stop = stopwords.words('english')
    df["pre_process"]= df["pre_process"].apply(lambda x: " ".join([x for x in x.split() if x not in stop]))
    lemmatizer = WordNetLemmatizer()
    df["pre_process"]= df["pre_process"].apply(lambda x: " ".join([lemmatizer.lemmatize(w) for w in nltk.word_tokenize(x)]))
    return df

In [7]:
_df = preprocess(_df)

In [8]:
_df2 = preprocess(_df2)

In [27]:
_df['Sentiment'].value_counts()

POSITIVE    80184
NEGATIVE    35903
NEUTRAL     11787
Name: Sentiment, dtype: int64

In [9]:
#X_train,X_test,Y_train, Y_test = train_test_split(_df["pre_process"], _df["Sentiment"], test_size=0.25, random_state=30)
X_train = _df['pre_process']
Y_train = _df['Sentiment']
X_test = _df2['pre_process']
Y_test = _df2['Sentiment']

print("Train: ",X_train.shape,Y_train.shape," Test: ",(X_test.shape,Y_test.shape))

Train:  (127874,) (127874,)  Test:  ((125852,), (125852,))


In [11]:
vectorizer= TfidfVectorizer()
tf_x_train = vectorizer.fit_transform(X_train)
tf_x_test = vectorizer.transform(X_test)

In [12]:
tf_x_train.shape

(127874, 324316)

In [13]:
tf_x_test.shape

(125852, 324316)

In [15]:
#SVM
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.utils import class_weight
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')

In [47]:
SVMModel = Pipeline(steps=[('SVC', SVC(gamma='scale'))])
param_grid= {'SVC__C': [0.1, 1],
             'SVC__degree': [1]}

BestSVMModel = GridSearchCV(SVMModel, param_grid, cv=5).fit(tf_x_train, Y_train)
print(f'Best Parameters: {BestSVMModel.best_params_}')

In [18]:
model2 = SVC(gamma='scale', C=0.1, degree=1).fit(tf_x_train, Y_train)
y_test_pred = model2.predict(tf_x_test)
print(classification_report(Y_test, y_test_pred))

In [None]:
y_test_pred=BestSVMModel.predict(tf_x_test)
print(classification_report(Y_test, y_test_pred))

In [None]:
LRModel = Pipeline(steps=[('logistic', LogisticRegression(random_state=0))])
class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(Y_train), y=Y_train)

In [42]:
class_weights

array([1.18721741, 3.61624388, 0.53158569])

In [43]:
param_grid= {'logistic__C': [0.1, 1, 10]}
BestLRModel2 = GridSearchCV(LRModel, param_grid, cv=5).fit(tf_x_train, Y_train)
print(f'Best Parameters: {BestLRModel2.best_params_}')

Best Parameters: {'logistic__C': 10}


In [44]:
y_test_pred = BestLRModel2.predict(tf_x_test)
print(classification_report(Y_test, y_test_pred))

              precision    recall  f1-score   support

    NEGATIVE       0.54      0.08      0.13     21092
     NEUTRAL       0.17      0.03      0.04     11063
    POSITIVE       0.76      0.98      0.85     93697

    accuracy                           0.74    125852
   macro avg       0.49      0.36      0.34    125852
weighted avg       0.67      0.74      0.66    125852



**N-gram**

In [None]:
for gram in range(2,5):
  ng_vectorizer = CountVectorizer(ngram_range=(1, gram))
  X_train_ng = ng_vectorizer.fit_transform(X_train)
  X_test_ng = ng_vectorizer.transform(X_test)
  clf_ng = MultinomialNB()
  clf_ng.fit(X_train_ng, Y_train)

  accuracy = clf_ng.score(X_test_ng, Y_test)
  y_pred = clf_ng.predict(X_test_ng)
  print("Number of features is %s, Accuracy for %s-gram is %s" %(X_train_ng.shape[1], gram, accuracy))
  print(classification_report(Y_test, y_pred))

In [12]:
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense, Dropout
from tensorflow.keras.layers import SpatialDropout1D
from tensorflow.keras.layers import Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
#import wordcloud

In [21]:
tweet_df = _df[_df['Sentiment'] != 'NEUTRAL']
tweet_df2 = _df2[_df2['Sentiment'] != 'NEUTRAL']

In [42]:
max_fatures = 2000
embedding_vector_length = 32

tokenizer = Tokenizer(num_words=max_fatures, split=' ')

tokenizer.fit_on_texts(tweet_df['pre_process'].values)
encoded_docs  = tokenizer.texts_to_sequences(tweet_df['pre_process'].values)
X = pad_sequences(encoded_docs , maxlen=200)

tokenizer.fit_on_texts(tweet_df2['pre_process'].values)
encoded_docs2  = tokenizer.texts_to_sequences(tweet_df2['pre_process'].values)
X2 = pad_sequences(encoded_docs2 , maxlen=200)

vocab_size = len(tokenizer.word_index) + 1

In [26]:
sentiment_label = tweet_df.Sentiment.factorize()
sentiment_label2 = tweet_df2.Sentiment.factorize()

In [None]:
model = Sequential()
model.add(Embedding(vocab_size, embedding_vector_length, input_length=200) )
model.add(SpatialDropout1D(0.25))
model.add(LSTM(50, dropout=0.5, recurrent_dropout=0.5))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])
print(model.summary())

In [39]:
#X_train2, X_test2, Y_train2, Y_test2 = train_test_split(X,sentiment_label, test_size = 0.33, random_state = 42)
X_train2 = X
Y_train2 = sentiment_label[0]

X_test2 = X2
Y_test2 = sentiment_label2[0]
print("Train: ", X_train2.shape,len(Y_train2)," Test: ", X_test2.shape,len(Y_test2))

callback = EarlyStopping(monitor='loss', patience=3)

Train:  (116087,) 116087  Test:  (114789,) 114789


In [45]:
model.fit(X, sentiment_label[0], epochs = 1, batch_size=32, callbacks=[callback])



<keras.callbacks.History at 0x1f15f47a8c0>

In [46]:
model.save('my_model.h5')

In [65]:
model.evaluate(X2,sentiment_label2[0])



[0.5439526438713074, 0.8010436296463013]

In [66]:
from tensorflow import keras
model = keras.models.load_model('my_model.h5')

In [47]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 200, 32)           9308640   
                                                                 
 spatial_dropout1d_1 (Spatia  (None, 200, 32)          0         
 lDropout1D)                                                     
                                                                 
 lstm_1 (LSTM)               (None, 50)                16600     
                                                                 
 dropout_1 (Dropout)         (None, 50)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 51        
                                                                 
Total params: 9,325,291
Trainable params: 9,325,291
Non-trainable params: 0
____________________________________________

In [48]:
y_pred = model.predict(X2)
y_pred

array([[0.36847445],
       [0.36847445],
       [0.369689  ],
       ...,
       [0.15601194],
       [0.36847445],
       [0.36847445]], dtype=float32)

In [49]:
sentiment_label2[0]

array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

In [59]:
y_pred

array([[0.36847445],
       [0.36847445],
       [0.369689  ],
       ...,
       [0.15601194],
       [0.36847445],
       [0.36847445]], dtype=float32)

In [63]:
some=np.round(y_pred).astype(int)

In [179]:
#predict_class = np.argmax(y_pred, axis=1)
#predict_class = np.argmax(y_pred, axis=1)
#predict_class = predict_class.tolist()
#np.argmax(Y_test2, axis=1)
#predict_class

array([2, 2, 2, ..., 2, 2, 2], dtype=int64)

In [64]:

print(classification_report(some, sentiment_label2[0]))

              precision    recall  f1-score   support

           0       0.98      0.82      0.89    112417
           1       0.01      0.13      0.03      2372

    accuracy                           0.80    114789
   macro avg       0.50      0.47      0.46    114789
weighted avg       0.96      0.80      0.87    114789

