In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Importing the libraries**

In [None]:
#GPU Libraries
import cudf as pd
import cupy as cp
import cuml
from cuml.linear_model import LogisticRegression
from cuml.ensemble import RandomForestClassifier as cuRFC
from cuml.naive_bayes import MultinomialNB
from cuml.svm import SVC
from cuml.linear_model import LogisticRegression
from cuml.multiclass import MulticlassClassifier
from cuml.multiclass import OneVsRestClassifier


#Graph Plotting and Visualisation Library
import matplotlib.pyplot as plt
import seaborn as sns


#Text Preprocessing Libraries
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
import re
import string

#Deep Learning Libraries
import keras
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Activation
from keras.layers import Embedding
from keras.layers import Bidirectional
from tensorflow.keras.optimizers import Adam
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense,Dropout,Embedding,LSTM
from keras.callbacks import EarlyStopping
from keras.losses import categorical_crossentropy
from tensorflow.keras.optimizers import Adam
from keras.models import Sequential




**Reading the dataset**

In [None]:
# NOTE: `pd` is cuDF at this point (the imports cell rebinds it with `import cudf as pd`),
# so both frames are GPU DataFrames; later cells call `.to_pandas()` where pandas is needed.
# The competition files are tab-separated, hence sep='\t'.
train_df=pd.read_csv('/kaggle/input/sentiment-analysis-on-movie-reviews/train.tsv.zip',sep='\t')
test_df=pd.read_csv('/kaggle/input/sentiment-analysis-on-movie-reviews/test.tsv.zip',sep='\t')

In [None]:
train_df.head()

In [None]:
train_df.info()

In [None]:
train_df.describe()

In [None]:
train_df.shape

In [None]:
test_df.head()

In [None]:
test_df.info()

In [None]:
test_df.describe()

In [None]:
test_df.shape

**DATA PREPROCESSING**

In [None]:
#Checking the sum of null values in the train and test dataset
train_df.isnull().sum()

In [None]:
test_df.isnull().sum()

In [None]:
train_df.isnull().any().any()

In [None]:
test_df.isnull().any().any()

There are no null values in the dataset

The sentiment labels are:

0 - negative

1 - somewhat negative

2 - neutral

3 - somewhat positive

4 - positive

In [None]:
train_df['Sentiment'].value_counts()

We see that label 2, the neutral label, has the highest number of examples.

In [None]:
# Convert the cuDF training frame to a pandas DataFrame; `train` is the frame that all
# of the text-preprocessing cells below operate on.
train=train_df.to_pandas()
# Bar chart of the number of training phrases per sentiment label.
sns.countplot(x ='Sentiment', data = train)

In [None]:
train_df['Phrase'][0]

**Text Preprocessing**

Steps-

1) Remove Punctuations

2) Remove HTML Tags

3) Convert text to lowercase

4) Remove all special characters 

5) Remove stopwords

6) Perform Stemming

7) Join back after stemming

In [None]:
sns.heatmap(train.isnull(),cmap='plasma')

In [None]:
test=test_df.to_pandas()

In [None]:
sns.heatmap(test.isnull(),cmap='plasma')

In [None]:
import string
import re
string.punctuation

In [None]:
#removing html tags and punctuations, then lowercasing
# HTML tags must be stripped BEFORE punctuation: string.punctuation contains '<' and '>',
# so once punctuation is deleted a pattern like '<.*?>' can no longer match anything and
# the tag names would be left behind as ordinary words.
_tag_pattern = re.compile('<.*?>')
# Build the punctuation-deleting translation table once instead of once per row.
_punctuation_table = str.maketrans('', '', string.punctuation)
train['Phrase']=train['Phrase'].apply(lambda x: _tag_pattern.sub('', x).translate(_punctuation_table).lower())
test['Phrase']=test['Phrase'].apply(lambda x: _tag_pattern.sub('', x).translate(_punctuation_table).lower())

In [None]:
#remove html tags
def clean_html(text):
    """Return ``text`` with every ``<...>`` span deleted.

    The pattern is non-greedy, so each match runs from a ``<`` to the nearest
    following ``>``; everything outside those spans is kept as-is.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', text)

In [None]:
train['Phrase']=train['Phrase'].apply(clean_html)
test['Phrase']=test['Phrase'].apply(clean_html)

In [None]:
#convert text to lowercase
def convert_lower(text):
    """Return a lowercase copy of ``text``."""
    lowered = text.lower()
    return lowered

In [None]:
train['Phrase']=train['Phrase'].apply(convert_lower)
test['Phrase']=test['Phrase'].apply(convert_lower)

In [None]:
#remove special characters
def remove_special(text):
    """Replace every non-alphanumeric character of ``text`` with a single space.

    Alphanumeric characters (as judged by ``str.isalnum``) are kept unchanged;
    the result therefore has the same length as the input.
    """
    return ''.join(ch if ch.isalnum() else ' ' for ch in text)

In [None]:
train['Phrase']=train['Phrase'].apply(remove_special)
test['Phrase']=test['Phrase'].apply(remove_special)

In [None]:
stopwords.words('english')

In [None]:
#remove stopwords
def remove_stopwords(text, stop_words=None):
    """Split ``text`` on whitespace and drop the tokens that are stop words.

    Parameters
    ----------
    text : str
        Phrase to filter.
    stop_words : collection of str, optional
        Words to discard. When omitted, the NLTK English stop-word list is
        used; it is loaded once and cached on the function so that it is not
        re-read for every token of every phrase.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    if stop_words is None:
        cached = getattr(remove_stopwords, "_english_stop_words", None)
        if cached is None:
            # A frozenset gives constant-time membership tests; the original
            # rebuilt and linearly scanned the list for each token.
            cached = frozenset(stopwords.words('english'))
            remove_stopwords._english_stop_words = cached
        stop_words = cached

    return [token for token in text.split() if token not in stop_words]

In [None]:
train['Phrase']=train['Phrase'].apply(remove_stopwords)
test['Phrase']=test['Phrase'].apply(remove_stopwords)

In [None]:
ps=PorterStemmer()

In [None]:
y=[]
#stem the words
def stem_words(text, stemmer=None):
    """Return the stem of every token in ``text``.

    Parameters
    ----------
    text : iterable of str
        Tokens to stem (the list produced by ``remove_stopwords``).
    stemmer : object with a ``stem(word)`` method, optional
        Stemmer to use. Defaults to the notebook-level PorterStemmer ``ps``.

    Returns
    -------
    list of str
        A new list of stems, in the same order as the input tokens.
    """
    # Resolve the default lazily so the notebook-level `ps` is only needed
    # when no stemmer is supplied.
    active_stemmer = ps if stemmer is None else stemmer
    # Build a fresh local list on each call rather than filling and clearing a
    # shared module-level list, so a failure part-way through cannot leak
    # tokens into the next call.
    return [active_stemmer.stem(token) for token in text]

In [None]:
train['Phrase']=train['Phrase'].apply(stem_words)
test['Phrase']=test['Phrase'].apply(stem_words)

In [None]:
#join the stem words together
def join_back(list_input):
    """Concatenate the tokens in ``list_input`` into one space-separated string."""
    separator = " "
    return separator.join(list_input)

In [None]:
train['Phrase']=train['Phrase'].apply(join_back)
test['Phrase']=test['Phrase'].apply(join_back)

In [None]:
# Features are the preprocessed phrases; targets are the integer sentiment labels.
X_train =train['Phrase']
y_train = train['Sentiment']
# The tokenizer's vocabulary is fitted on the training phrases only.
tokenize = Tokenizer()
tokenize.fit_on_texts(X_train.values)

In [None]:
# Encode both splits with the tokenizer fitted on the training phrases.
# NOTE(review): X_train is rebound from text to encoded sequences here, so this cell is
# not idempotent — re-run the previous cell before re-running this one.
X_test = test['Phrase']
X_train = tokenize.texts_to_sequences(X_train)
X_test = tokenize.texts_to_sequences(X_test)

In [None]:
#add padding using pre-method
# The common sequence length is the word count of the longest preprocessed training phrase.
max_sequence_len = max([len(s.split()) for s in train['Phrase']])
# padding='pre' puts the padding at the start of each sequence; train and test are
# padded to the same length so they can be fed to the same model.
X_train = pad_sequences(X_train, max_sequence_len,padding='pre')
X_test = pad_sequences(X_test, max_sequence_len,padding='pre')

In [None]:
X_train

In [None]:
print(X_train.shape)
print(X_test.shape)

# DEEP LEARNING MODELS

**1)LSTM MODEL**

In [None]:
# #LSTM Model
# embedding_dimension = 100
# input_val = len(tokenize.word_index)+1
# model_lstm = Sequential()
# model_lstm.add(Embedding(input_val, embedding_dimension, input_length=max_sequence_len))
# model_lstm.add(LSTM(units=256, dropout=0.3, recurrent_dropout=0.2 , return_sequences=True))
# model_lstm.add(LSTM(units=256, dropout=0.3, recurrent_dropout=0.2 ))
# model_lstm.add(Dense(5, activation='softmax'))# 5 as we are required to predict 5 labels-0,1,2,3,4


In [None]:
# model_lstm.summary()

In [None]:
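# # NOTE: as written this callback cannot trigger: the models are compiled with the metric
# # 'accuracy' (so the validation key would be 'val_accuracy', not 'val_acc'), and the fit()
# # calls below pass no validation data, so no validation metric is produced at all.
# early_stopping = EarlyStopping(min_delta = 0.001,
#                                mode = 'max',
#                                monitor = 'val_acc',
#                                patience = 2)
# callback = [early_stopping]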

In [None]:
# #Compiling the model
# model_lstm.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# #Fit the model
# history_lstm=model_lstm.fit(X_train, y_train,batch_size=128, epochs=10, verbose=1,callbacks=callback)

In [None]:
# #Saving the model
# model_lstm.save('LSTM_Model.h5')

In [None]:
# loss_train = history_lstm.history['loss']
# epochs = range(1,11)
# plt.plot(epochs, loss_train, 'g', label='Training loss')
# plt.title('Training loss')
# plt.xlabel('Epochs')
# plt.ylabel('Loss')
# plt.legend()
# plt.show()

In [None]:
# accuracy_train = history_lstm.history['accuracy']
# epochs = range(1,11)
# plt.plot(epochs, accuracy_train, 'g', label='Training Accuracy')
# plt.title('Training Accuracy')
# plt.xlabel('Epochs')
# plt.ylabel('Accuracy')
# plt.legend()
# plt.show()

In [None]:
# #Predictions on the model
# predict_x=model_lstm.predict(X_test) 
# classes_x=np.argmax(predict_x,axis=1)

In [None]:
# classes_x

In [None]:
# #Submission of all the predictions
# submission_file =pd.read_csv('/kaggle/input/sentiment-analysis-on-movie-reviews/sampleSubmission.csv',sep=',')
# submission_file['Sentiment']=classes_x
# submission_file.to_csv('Submission_LSTM.csv',index=False)

**2)ANN MODEL**

In [None]:
# import tensorflow as tf
# embedding_dimension = 100
# input_val = len(tokenize.word_index)+1
# model_ANN = tf.keras.Sequential([
#     tf.keras.layers.Embedding(input_val, embedding_dimension, input_length=max_sequence_len),
#     tf.keras.layers.Flatten(),
#     tf.keras.layers.Dense(256, activation='relu'),
#     tf.keras.layers.Dense(5, activation='softmax')
# ])

In [None]:
# model_ANN.summary()

In [None]:
# model_ANN.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# history_ANN=model_ANN.fit(X_train, y_train,batch_size=512, epochs=100, verbose=1,callbacks=callback)

In [None]:
# model_ANN.save('ANN_Model.h5')

In [None]:
# loss_train_ANN = history_ANN.history['loss']
# epochs = range(1,101)
# plt.plot(epochs, loss_train_ANN, 'g', label='Training loss')
# plt.title('Training loss')
# plt.xlabel('Epochs')
# plt.ylabel('Loss')
# plt.legend()
# plt.show()

In [None]:
# accuracy_train_ANN = history_ANN.history['accuracy']
# epochs = range(1,101)
# plt.plot(epochs, accuracy_train_ANN, 'g', label='Training Accuracy')
# plt.title('Training Accuracy')
# plt.xlabel('Epochs')
# plt.ylabel('Accuracy')
# plt.legend()
# plt.show()

In [None]:
# predict_x=model_ANN.predict(X_test) 
# classes_x_ANN=np.argmax(predict_x,axis=1)

In [None]:
# classes_x_ANN

In [None]:
# submission_file =pd.read_csv('/kaggle/input/sentiment-analysis-on-movie-reviews/sampleSubmission.csv',sep=',')
# submission_file['Sentiment']=classes_x_ANN
# submission_file.to_csv('Submission_ANN.csv',index=False)

**3)CNN MODEL**

In [None]:
# import tensorflow as tf
# embedding_dimension = 100
# input_val = len(tokenize.word_index)+1
# model_CNN = tf.keras.Sequential([
#     tf.keras.layers.Embedding(input_val, embedding_dimension, input_length=max_sequence_len),
#     tf.keras.layers.Conv1D(128, 2, padding='same',activation='relu'),
#     tf.keras.layers.Dropout(0.2),
#     tf.keras.layers.Conv1D(64, 2, padding='same',activation='relu'),
#     tf.keras.layers.Dropout(0.2),
#     tf.keras.layers.MaxPooling1D(pool_size=2),
#     tf.keras.layers.Flatten(),
#     tf.keras.layers.Dense(256, activation='relu'),
#     tf.keras.layers.Dropout(0.3),
#     tf.keras.layers.Dense(128, activation='relu'),
#     tf.keras.layers.Dropout(0.3),
#     tf.keras.layers.Dense(5, activation='softmax')
# ])

In [None]:
# model_CNN.summary()

In [None]:
# model_CNN.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# history_CNN=model_CNN.fit(X_train, y_train,batch_size=512, epochs=100, verbose=1,callbacks=callback)

In [None]:
# model_CNN.save('CNN_Model.h5')

In [None]:
# loss_train_CNN = history_CNN.history['loss']
# epochs = range(1,101)
# plt.plot(epochs, loss_train_CNN, 'g', label='Training loss')
# plt.title('Training loss')
# plt.xlabel('Epochs')
# plt.ylabel('Loss')
# plt.legend()
# plt.show()

In [None]:
# accuracy_train_CNN = history_CNN.history['accuracy']
# epochs = range(1,101)
# plt.plot(epochs, accuracy_train_CNN, 'g', label='Training Accuracy')
# plt.title('Training Accuracy')
# plt.xlabel('Epochs')
# plt.ylabel('Accuracy')
# plt.legend()
# plt.show()

In [None]:
# predict_x_CNN=model_CNN.predict(X_test) 
# classes_x_CNN=np.argmax(predict_x_CNN,axis=1)

In [None]:
# cp.unique(classes_x_CNN)

In [None]:
# classes_x_CNN

In [None]:
# submission_file =pd.read_csv('/kaggle/input/sentiment-analysis-on-movie-reviews/sampleSubmission.csv',sep=',')
# submission_file['Sentiment']=classes_x_CNN
# submission_file.to_csv('Submission_CNN.csv',index=False)

**4)BI-LSTM MODEL**

In [None]:
# import tensorflow as tf
# embedding_dimension = 100
# input_val = len(tokenize.word_index)+1
# model_biLSTM = tf.keras.Sequential([
#     tf.keras.layers.Embedding(input_val, embedding_dimension, input_length=max_sequence_len),
#      tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
#     tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
#     tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
#     tf.keras.layers.Dropout(0.5),
#     tf.keras.layers.Dense(256, activation='relu'),
#     tf.keras.layers.Dropout(0.5),
#     tf.keras.layers.Dense(5, activation='softmax')
# ])

In [None]:
# model_biLSTM.summary()

In [None]:
# model_biLSTM.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# history_biLSTM=model_biLSTM.fit(X_train, y_train,batch_size=512, epochs=50, verbose=1,callbacks=callback)

In [None]:
# model_biLSTM.save('biLSTM_Model.h5')

In [None]:
# loss_train_biLSTM = history_biLSTM.history['loss']
# epochs = range(1,51)
# plt.plot(epochs, loss_train_biLSTM, 'g', label='Training loss')
# plt.title('Training loss')
# plt.xlabel('Epochs')
# plt.ylabel('Loss')
# plt.legend()
# plt.show()

In [None]:
# accuracy_train_biLSTM = history_biLSTM.history['accuracy']
# epochs = range(1,51)
# plt.plot(epochs, accuracy_train_biLSTM, 'g', label='Training Accuracy')
# plt.title('Training Accuracy')
# plt.xlabel('Epochs')
# plt.ylabel('Accuracy')
# plt.legend()
# plt.show()

In [None]:
# predict_x_biLSTM=model_biLSTM.predict(X_test) 
# classes_x_biLSTM=np.argmax(predict_x_biLSTM,axis=1)

In [None]:
# classes_x_biLSTM

In [None]:
# submission_file =pd.read_csv('/kaggle/input/sentiment-analysis-on-movie-reviews/sampleSubmission.csv',sep=',')
# submission_file['Sentiment']=classes_x_biLSTM
# submission_file.to_csv('Submission_biLSTM.csv',index=False)

# MACHINE LEARNING MODELS-CUML

**1) Logistic Regression**

In [None]:
# #Calling the model
# log_reg = OneVsRestClassifier(LogisticRegression())

In [None]:
# #Fitting the model
# log_reg.fit(X_train.astype('float32'),y_train.astype('float32'))

In [None]:
# #Making Predictions
# y_pred_log_reg = log_reg.predict(X_test.astype('float32'))

In [None]:
# y_pred_log_reg=y_pred_log_reg.astype('int32')

In [None]:
# y_pred_log_reg

In [None]:
# cp.unique(y_pred_log_reg)

In [None]:
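# #Accuracy score
# # NOTE: y_pred_log_reg holds predictions for X_test, so it cannot be scored against y_train;
# # score predictions made on X_train (or on a held-out validation split) instead.
# cu_score_log_reg = cuml.metrics.accuracy_score(y_train, y_pred_log_reg )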

In [None]:
# cu_score_log_reg

In [None]:
# #Creating a submission
# submission_file =pd.read_csv('/kaggle/input/sentiment-analysis-on-movie-reviews/sampleSubmission.csv',sep=',')
# submission_file['Sentiment']=y_pred_log_reg
# submission_file.to_csv('Submission_LogisticRegression.csv',index=False)

**2)Random Forest**

In [None]:
# cuml_model = cuRFC(max_features=1.0,
#                    n_bins=8,
#                    n_estimators=40)

In [None]:
# cuml_model.fit(X_train.astype('float32'),y_train.astype('float32'))

In [None]:
# y_pred_random_forest_classifier = cuml_model.predict(X_test.astype('float32'))

In [None]:
# y_pred_random_forest_classifier=y_pred_random_forest_classifier.astype('int32')

In [None]:
# y_pred_random_forest_classifier

In [None]:
# cp.unique(y_pred_random_forest_classifier)

In [None]:
# cu_score_random_forest = cuml.metrics.accuracy_score(y_train, y_pred_random_forest_classifier )

In [None]:
# cu_score_random_forest

In [None]:
# submission_file =pd.read_csv('/kaggle/input/sentiment-analysis-on-movie-reviews/sampleSubmission.csv',sep=',')
# submission_file['Sentiment']=y_pred_random_forest_classifier
# submission_file.to_csv('Submission_RandomForest.csv',index=False)

**3)Naive Bayes**

In [None]:
# nb = MultinomialNB()
# nb.fit(X_train, y_train)

In [None]:
# nb_predict=nb.predict(X_test)

In [None]:
# nb_predict

In [None]:
# cp.unique(nb_predict)

In [None]:
# cu_score_naive_bayes = cuml.metrics.accuracy_score(y_train, nb_predict )

In [None]:
# cu_score_naive_bayes

In [None]:
# submission_file =pd.read_csv('/kaggle/input/sentiment-analysis-on-movie-reviews/sampleSubmission.csv',sep=',')
# submission_file['Sentiment']=nb_predict
# submission_file.to_csv('Submission_NaiveBayes.csv',index=False)

**4)KNeighbors Classifier**

The cuML KNN classifier raised a runtime error, so the cells below are commented out.

In [None]:
# from cuml.neighbors import KNeighborsClassifier
# knn = KNeighborsClassifier(n_neighbors=3)
# knn.fit(X_train,y_train)

In [None]:

# knn_predict=knn.predict(X_test)

In [None]:
# knn_predict

In [None]:
# cp.unique(knn_predict)

In [None]:
# cu_score_knn = cuml.metrics.accuracy_score(y_train, knn_predict )

In [None]:
# cu_score_knn

In [None]:
# submission_file =pd.read_csv('/kaggle/input/sentiment-analysis-on-movie-reviews/sampleSubmission.csv',sep=',')
# submission_file['Sentiment']=knn_predict
# submission_file.to_csv('Submission_KNN.csv',index=False)

**5) SVM Classifier**

SVM took a very long time to run and consumed all of the available RAM, so the cells below are commented out.

In [None]:
# svc_gpu = SVC(kernel='poly', degree=2, gamma='auto', C=1)
# svc_gpu.fit(X_train, y_train)

In [None]:
# svc_gpu_predict=svc_gpu.predict(X_test)

In [None]:
# submission_file =pd.read_csv('/kaggle/input/sentiment-analysis-on-movie-reviews/sampleSubmission.csv',sep=',')
# submission_file['Sentiment']=svc_gpu_predict
# submission_file.to_csv('Submission_SVM.csv',index=False)

# MACHINE LEARNING MODELS-SKLEARN

**1)Logistic Regression**

****

In [None]:
#Machine Learning-sklearn libraries
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from xgboost import XGBClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score,roc_auc_score,roc_curve,auc,f1_score

In [None]:
# lr_sklearn = LogisticRegression(random_state=0)

In [None]:
# lr_sklearn.fit(X_train,y_train)

In [None]:
# pred_lr_sklearn=lr_sklearn.predict(X_test)

In [None]:
# pred_lr_sklearn

In [None]:
# np.unique(pred_lr_sklearn)

In [None]:
# submission_file =pd.read_csv('/kaggle/input/sentiment-analysis-on-movie-reviews/sampleSubmission.csv',sep=',')
# submission_file['Sentiment']=pred_lr_sklearn
# submission_file.to_csv('Submission_LogisticRegression_sklearn.csv',index=False)

**2)Random Forest**

In [None]:
rf_sklearn = RandomForestClassifier(n_estimators=1000,random_state=42,criterion='gini', n_jobs = -1)

In [None]:
rf_sklearn.fit(X_train,y_train)

In [None]:
rf_sklearn_predict=rf_sklearn.predict(X_test)

In [None]:
rf_sklearn_predict

In [None]:
np.unique(rf_sklearn_predict)

In [None]:
# Load the sample submission as a template (`pd` is cuDF here, see the imports cell),
# overwrite its Sentiment column with the random-forest predictions for the test set,
# and write the result to the working directory.
submission_file =pd.read_csv('/kaggle/input/sentiment-analysis-on-movie-reviews/sampleSubmission.csv',sep=',')
submission_file['Sentiment']=rf_sklearn_predict
submission_file.to_csv('Submission_RandomForest_sklearn.csv',index=False)

**3)Naive Bayes**

In [None]:
# nb_sklearn=MultinomialNB()

In [None]:
# nb_sklearn.fit(X_train,y_train)

In [None]:
# nb_sklearn_predict=nb_sklearn.predict(X_test)

In [None]:
# nb_sklearn_predict

In [None]:
# submission_file =pd.read_csv('/kaggle/input/sentiment-analysis-on-movie-reviews/sampleSubmission.csv',sep=',')
# submission_file['Sentiment']=nb_sklearn_predict
# submission_file.to_csv('Submission_NaiveBayes_sklearn.csv',index=False)

**4)KNeighbors Classifier**

In [None]:
# knn_sklearn = KNeighborsClassifier(n_neighbors=3)

In [None]:
# knn_sklearn.fit(X_train,y_train)

In [None]:
# knn_sklearn_predict=knn_sklearn.predict(X_test)

In [None]:
# knn_sklearn_predict

In [None]:
# submission_file =pd.read_csv('/kaggle/input/sentiment-analysis-on-movie-reviews/sampleSubmission.csv',sep=',')
# submission_file['Sentiment']=knn_sklearn_predict
# submission_file.to_csv('Submission_KNN_sklearn.csv',index=False)

**5)SVM CLASSIFIER**

SVM took a very long time to run and consumed all of the available RAM, so the cells below are commented out.

In [None]:
# svc = svm.SVC(decision_function_shape='ovo')
# svc.fit(X_train, y_train)

In [None]:
# svm_pred=svc.predict(X_test)

In [None]:
# submission_file =pd.read_csv('/kaggle/input/sentiment-analysis-on-movie-reviews/sampleSubmission.csv',sep=',')
# submission_file['Sentiment']=svm_pred
# submission_file.to_csv('Submission_SVM_sklearn.csv',index=False)

**6)XGBOOST CLASSIFIER**

In [None]:
# xgb = XGBClassifier(silent=False, 
#                       scale_pos_weight=1,
#                       learning_rate=0.01,  
#                       colsample_bytree = 0.4,
#                       subsample = 0.8,
#                       n_estimators=1000, 
#                       reg_alpha = 0.3,
#                       max_depth=4, 
#                       gamma=10)

In [None]:
# xgb.fit(X_train,y_train)

In [None]:
# xgb_predict=xgb.predict(X_test)

In [None]:
# xgb_predict

In [None]:
# np.unique(xgb_predict)

In [None]:
# submission_file =pd.read_csv('/kaggle/input/sentiment-analysis-on-movie-reviews/sampleSubmission.csv',sep=',')
# submission_file['Sentiment']=xgb_predict
# submission_file.to_csv('Submission_XGB.csv',index=False)