# This notebook shows how to implement different machine learning algorithms for sentiment analysis.

# Please Upvote this notebook if you find it useful

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(folder, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data loading

In [None]:
# Load the zipped, tab-separated competition files into DataFrames.
train_df = pd.read_csv(
    '../input/movie-review-sentiment-analysis-kernels-only/train.tsv.zip',
    sep='\t',
)
test_df = pd.read_csv(
    '../input/movie-review-sentiment-analysis-kernels-only/test.tsv.zip',
    sep='\t',
)

# Data Analysis

In [None]:
# Column names, dtypes and non-null counts of the training frame.
train_df.info()

In [None]:
# (rows, columns) of the training frame.
train_df.shape

In [None]:
# Column names, dtypes and non-null counts of the test frame.
test_df.info()

In [None]:
# Preview the first rows of the training frame.
train_df.head()


In [None]:
# Phrase text stored under index label 0 of the training frame.
train_df.Phrase[0]

In [None]:
# Phrase text stored under index label 1 of the training frame.
train_df.Phrase[1]

In [None]:
# Distinct SentenceId values present in the training frame.
train_df['SentenceId'].unique()


In [None]:
# Preview the first rows of the test frame.
test_df.head()

In [None]:
# Phrase text stored under index label 0 of the test frame.
test_df.Phrase[0]

In [None]:
# Phrase text stored under index label 1 of the test frame.
test_df.Phrase[1]

In [None]:
# Number of missing values in each column of the training frame.
train_df.isnull().sum()

# Data visualization 

In [None]:
from collections import Counter
import matplotlib.pyplot as plt 

# Count how many training phrases carry each Sentiment label and draw the counts as bars.
target_cnt = Counter(train_df['Sentiment'])

plt.figure(figsize=(16, 8))
plt.bar(x=list(target_cnt.keys()), height=list(target_cnt.values()))
plt.title("Dataset labels distribuition")
plt.show()

In [None]:
# Number of training phrases per SentenceId, drawn as one bar per sentence.
target_Sen = Counter(train_df['SentenceId'])

plt.figure(figsize=(16, 8))
plt.bar(x=list(target_Sen.keys()), height=list(target_Sen.values()))
plt.title("Dataset SentenceId distribuition")
plt.show()

In [None]:
# Number of training rows per PhraseId, drawn as one bar per id.
# NOTE(review): PhraseId looks like a per-row identifier; if so every bar has
# height 1 and this cell draws one bar per row — confirm the plot is worth keeping.
target_Sen = Counter(train_df['PhraseId'])

plt.figure(figsize=(16, 8))
plt.bar(x=list(target_Sen.keys()), height=list(target_Sen.values()))
plt.title("Dataset PhraseId distribuition")
plt.show()

In [None]:
# Keep only the training rows whose Sentiment label is 0.
df_EDA1 = train_df.loc[train_df['Sentiment'] == 0]

In [None]:
# Display the subset of training rows labelled 0.
df_EDA1

In [None]:
# Number of label-0 phrases contributed by each SentenceId.
target_Sen = Counter(df_EDA1['SentenceId'])

plt.figure(figsize=(16, 8))
plt.bar(x=list(target_Sen.keys()), height=list(target_Sen.values()))
plt.title("Dataset SentenceId distribuition")
plt.show()

In [None]:
# Distinct sentences containing at least one label-0 phrase, distinct sentences
# overall, and the ratio between the two.
label0_sentence_counts = df_EDA1.SentenceId.value_counts()
all_sentence_counts = train_df.SentenceId.value_counts()
print(label0_sentence_counts.shape)
print(all_sentence_counts.shape)
print(label0_sentence_counts.shape[0] / all_sentence_counts.shape[0])

In [None]:
# Number of label-0 phrases per SentenceId.
df_EDA1.SentenceId.value_counts()

In [None]:
# Isolate every phrase that belongs to sentence 3189 and display the subset.
df_EDA2 = train_df.loc[train_df['SentenceId'] == 3189]
df_EDA2

In [None]:
# Sentiment label counts restricted to the phrases of sentence 3189.
target_cnt = Counter(df_EDA2['Sentiment'])

plt.figure(figsize=(16, 8))
plt.bar(x=list(target_cnt.keys()), height=list(target_cnt.values()))
plt.title("Dataset labels distribuition of SentenceId 3189")
plt.show()

# The graph shows that most of the phrases in SentenceId 3189 are negative, whereas most of the phrases in the full data set are neutral,
so the SentenceId can be used as a feature to improve the results.

# Let's combine the SentenceId and Phrase data.

# EDA

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
# NLTK's English stop-word list, passed to WordCloud in the cells below.
stop_words = stopwords.words('english')

# **wordcloud of Very negative review**

In [None]:
# Word cloud built from all training phrases labelled Sentiment == 0.
plt.figure(figsize=(20, 20))
wc = WordCloud(
    max_words=500,
    width=1600,
    height=800,
    stopwords=stop_words,
)
wc = wc.generate(" ".join(train_df.loc[train_df['Sentiment'] == 0, 'Phrase']))
plt.imshow(wc, interpolation='bilinear')
plt.show()

# wordcloud of negative review

In [None]:
# Word cloud built from all training phrases labelled Sentiment == 1.
plt.figure(figsize=(20, 20))
wc = WordCloud(
    max_words=500,
    width=1600,
    height=800,
    stopwords=stop_words,
)
wc = wc.generate(" ".join(train_df.loc[train_df['Sentiment'] == 1, 'Phrase']))
plt.imshow(wc, interpolation='bilinear')
plt.show()

# Word cloud of neutral reviews (Sentiment == 2)

In [None]:
# Word cloud built from all training phrases labelled Sentiment == 2.
plt.figure(figsize=(20, 20))
wc = WordCloud(
    max_words=500,
    width=1600,
    height=800,
    stopwords=stop_words,
)
wc = wc.generate(" ".join(train_df.loc[train_df['Sentiment'] == 2, 'Phrase']))
plt.imshow(wc, interpolation='bilinear')
plt.show()

# Word cloud of positive reviews (Sentiment == 3)

In [None]:
# Word cloud built from all training phrases labelled Sentiment == 3.
plt.figure(figsize=(20, 20))
wc = WordCloud(
    max_words=500,
    width=1600,
    height=800,
    stopwords=stop_words,
)
wc = wc.generate(" ".join(train_df.loc[train_df['Sentiment'] == 3, 'Phrase']))
plt.imshow(wc, interpolation='bilinear')
plt.show()

# Word cloud of very positive reviews (Sentiment == 4)

In [None]:
# Word cloud built from all training phrases labelled Sentiment == 4.
plt.figure(figsize=(20, 20))
wc = WordCloud(
    max_words=500,
    width=1600,
    height=800,
    stopwords=stop_words,
)
wc = wc.generate(" ".join(train_df.loc[train_df['Sentiment'] == 4, 'Phrase']))
plt.imshow(wc, interpolation='bilinear')
plt.show()

# EDA

# Baseline for a model

In [None]:
# Share of training rows labelled 2, i.e. the accuracy of always predicting label 2.
is_label_2 = train_df['Sentiment'] == 2
base_line = int(is_label_2.sum()) / len(train_df)
base_line

# Naive Bayes model without data preprocessing

In [None]:
#importing 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
# Hold out 20% of the raw phrases for evaluation (fixed seed for repeatability).
x_train1, x_test1, y_train1, y_test1 = train_test_split(
    train_df['Phrase'],
    train_df['Sentiment'],
    test_size=0.2,
    random_state=42,
)

# Bag-of-words features: the vocabulary is fitted on the training split only.
cv = CountVectorizer()
X_train1 = cv.fit_transform(x_train1.values)
test_count1 = cv.transform(x_test1.values)

# Fit multinomial Naive Bayes, then predict and score on the held-out split.
model1 = MultinomialNB().fit(X_train1, y_train1)
pred1 = model1.predict(test_count1)
Model1_score = model1.score(test_count1, y_test1)

print('Model1 score:', Model1_score)

Naive Bayes is one of the simplest machine learning models that can be used for classification. Even without pre-processing the text, Naive Bayes reached an accuracy of 61%.

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
from matplotlib import pyplot as plt
from matplotlib import ticker
import seaborn as sns
# Confusion matrix of the Naive Bayes predictions on the held-out split,
# rendered as an annotated heatmap with the five class labels on both axes.
conf = confusion_matrix(y_test1, pred1)

class_names = ['0', '1', '2', '3', '4']
cm = pd.DataFrame(conf, index=class_names, columns=class_names)

plt.figure(figsize=(12, 7))
sns.heatmap(cm, annot=True, fmt="d")
plt.show()

# Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient-boosted trees on the same bag-of-words features, scored on the held-out split.
gbrt = GradientBoostingClassifier(random_state=0).fit(X_train1, y_train1)
Model2_score = gbrt.score(test_count1, y_test1)

In [None]:
# Accuracy of the gradient boosting model on the data it was trained on.
Model2_Accscore =gbrt.score(X_train1,y_train1)

In [None]:
# Display the training-set accuracy of the gradient boosting model.
Model2_Accscore

In [None]:
# Display the held-out accuracy of the gradient boosting model.
Model2_score

In [None]:
# Held-out predictions of the gradient boosting model, used for the confusion
# matrix in the next cell. The bare `Model2_score` expression that used to sit
# here was never displayed (only a cell's last expression is), so it is removed.
pred2 = gbrt.predict(test_count1)

In [None]:
# Confusion matrix of the gradient boosting predictions on the held-out split,
# rendered as an annotated heatmap with the five class labels on both axes.
conf = confusion_matrix(y_test1, pred2)

class_names = ['0', '1', '2', '3', '4']
cm = pd.DataFrame(conf, index=class_names, columns=class_names)

plt.figure(figsize=(12, 7))
sns.heatmap(cm, annot=True, fmt="d")
plt.show()

# SVM

In [None]:
from sklearn.svm import LinearSVC
# Linear SVM (C=100) on the bag-of-words features; score and predict on the held-out split.
linear_svm = LinearSVC(C=100).fit(X_train1,y_train1)
Model3_score =linear_svm.score(test_count1,y_test1)
pred3 = linear_svm.predict(test_count1)

# Label corrected: this is model 3 (the printed text used to read 'Model13').
print('Model3 score:' ,Model3_score)


Model training accuracy

In [None]:
# Accuracy of the linear SVM on the data it was trained on, displayed as the cell output.
Model3_Accscore =linear_svm.score(X_train1,y_train1)
Model3_Accscore

In [None]:
# Confusion matrix of the linear SVM predictions on the held-out split,
# rendered as an annotated heatmap with the five class labels on both axes.
conf = confusion_matrix(y_test1, pred3)

class_names = ['0', '1', '2', '3', '4']
cm = pd.DataFrame(conf, index=class_names, columns=class_names)

plt.figure(figsize=(12, 7))
sns.heatmap(cm, annot=True, fmt="d")
plt.show()

In [None]:
from sklearn.svm import SVC
# Kernel SVM with scikit-learn's default settings on the same bag-of-words features.
svm = SVC().fit(X_train1,y_train1)
Model4_score =svm.score(test_count1,y_test1)
# Predict with the model fitted in this cell. This line previously called
# linear_svm.predict, so pred4 silently duplicated pred3.
pred4 = svm.predict(test_count1)

# Label corrected: this is model 4 (the printed text used to read 'Model14').
print('Model4 score:' ,Model4_score)

# TensorFlow Model

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds 

In [None]:
# One-hot encode the training labels into 5 columns to match the softmax output.
depth = 5
indices = y_train1
y_train = tf.one_hot(indices=indices, depth=depth)

In [None]:
# Universal Sentence Encoder v4 loaded from TF Hub as a Keras layer that takes
# raw strings (dtype=tf.string, scalar input_shape) and is trained together with
# the classifier head (trainable=True).
# NOTE(review): output_shape=[512,16] — the encoder presumably emits a single
# 512-d vector per input; confirm the extra 16 is intended.
hub_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4", input_shape=[], output_shape=[512,16], 
  dtype=tf.string,trainable= True)

In [None]:
# Classifier head on top of the sentence encoder: two ReLU layers followed by
# a 5-way softmax.
model = tf.keras.models.Sequential([
    hub_layer,
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(5, activation='softmax'),
])
model.summary()
# Targets are one-hot encoded, hence categorical cross-entropy.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['acc', 'MAE'],
)

In [None]:
#tf.config.experimental_run_functions_eagerly(True)

In [None]:
# Train on the raw phrases for 10 epochs in batches of 64, with 20% of the
# training split held out for validation.
model.fit(x_train1, y_train, batch_size= 64, validation_split = 0.2, epochs= 10)

In [None]:
# Sequential.predict_classes was removed in TensorFlow 2.6. For a softmax
# output the equivalent is the argmax over the predicted class probabilities.
pred4 = np.argmax(model.predict(x_test1), axis=-1)

In [None]:
# Confusion matrix of the `pred4` predictions on the held-out split,
# rendered as an annotated heatmap with the five class labels on both axes.
conf = confusion_matrix(y_test1, pred4)

class_names = ['0', '1', '2', '3', '4']
cm = pd.DataFrame(conf, index=class_names, columns=class_names)

plt.figure(figsize=(12, 7))
sns.heatmap(cm, annot=True, fmt="d")
plt.show()

**This notebook shows how to implement machine learning and deep learning models for sentiment analysis. Even without any data preprocessing or hyperparameter tuning, the accuracies of the models are reasonably good.**


# Let's see model performance after cleaning the data

In [None]:
from nltk.corpus import stopwords
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

In [None]:
# NLTK's English stop words plus a few extra tokens to drop during cleaning.
stop_words = stopwords.words('english') + ['from', 'subject', 're', 'edu', 'use']

In [None]:
def preprocess(text):
    """Tokenise ``text`` and drop stop words and very short tokens.

    Tokens come from ``gensim.utils.simple_preprocess``. A token is kept only
    if it is longer than two characters and appears in neither gensim's
    ``STOPWORDS`` nor the notebook-level ``stop_words`` list.

    Returns the surviving tokens as a list, in their original order.
    """
    return [
        token
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 2
        and token not in gensim.parsing.preprocessing.STOPWORDS
        and token not in stop_words
    ]

In [None]:
# Build a cleaned version of every phrase: lower-case it, collapse each run of
# non-alphanumeric characters into a single space, tokenise and filter it with
# preprocess(), then join the surviving tokens back into one string.
train_df['Phrase_lower'] = train_df['Phrase'].str.lower()
# regex=True is required: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, which would leave the text unchanged.
train_df['Phrase_text_new'] = train_df['Phrase_lower'].str.replace(r'[^A-Za-z0-9]+', ' ', regex=True)
train_df['cleen_Phrase'] = (
    train_df['Phrase_text_new']
    .apply(preprocess)
    .apply(lambda tokens: " ".join(tokens))
)

train_df.head()

In [None]:
# Cleaned phrase stored under index label 2, as a spot check of the preprocessing.
train_df['cleen_Phrase'][2]

In [None]:
# Same 80/20 split and seed as the raw-phrase split, now on the cleaned phrases.
x_train2, x_test2, y_train2, y_test2 = train_test_split(
    train_df['cleen_Phrase'],
    train_df['Sentiment'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Deeper classifier head for the cleaned phrases: two LeakyReLU layers with
# dropout, three narrowing ReLU layers, then a 5-way softmax.
# Note: `hub_layer` is the same layer object already used by `model`, so the
# encoder weights are shared between the two networks. This assignment also
# rebinds `model1`, which earlier held the Naive Bayes classifier.
model1 = tf.keras.models.Sequential([
    hub_layer,
    tf.keras.layers.Dense(256, activation=tf.keras.layers.LeakyReLU(alpha=0.3)),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(64, activation=tf.keras.layers.LeakyReLU(alpha=0.3)),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(8, activation='relu'),
    tf.keras.layers.Dense(5, activation='softmax'),
])
model1.summary()
# Targets are one-hot encoded, hence categorical cross-entropy.
model1.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['acc', 'MAE'],
)

In [None]:
# One-hot encode the labels of the cleaned-phrase training split into 5 columns.
depth = 5
indices = y_train2
Y_train = tf.one_hot(indices=indices, depth=depth)

In [None]:
# Train on the cleaned phrases for 5 epochs in batches of 64, with 20% of the
# training split held out for validation.
model1.fit(x_train2, Y_train, batch_size= 64, validation_split = 0.2, epochs= 5)

In [None]:
# Sequential.predict_classes was removed in TensorFlow 2.6. For a softmax
# output the equivalent is the argmax over the predicted class probabilities.
pred5 = np.argmax(model1.predict(x_test2), axis=-1)

In [None]:
# Confusion matrix of the cleaned-phrase model's predictions on its held-out split,
# rendered as an annotated heatmap with the five class labels on both axes.
conf = confusion_matrix(y_test2, pred5)

class_names = ['0', '1', '2', '3', '4']
cm = pd.DataFrame(conf, index=class_names, columns=class_names)

plt.figure(figsize=(12, 7))
sns.heatmap(cm, annot=True, fmt="d")
plt.show()

# I hope you find this notebook useful. Please upvote the notebook and leave a comment.