In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [None]:
# List the files shipped in the competition input directory.
input_files = os.listdir('../input/movie-review-sentiment-analysis-kernels-only/')
print(input_files)

In [None]:
# Both files are zipped, tab-separated tables; pandas reads them directly.
train_dir = pd.read_csv(
    '../input/movie-review-sentiment-analysis-kernels-only/train.tsv.zip',
    delimiter='\t',
)
test_dir = pd.read_csv(
    '../input/movie-review-sentiment-analysis-kernels-only/test.tsv.zip',
    delimiter='\t',
)

In [None]:
# Peek at the first five training rows.
train_dir.head(n=5)

In [None]:
# Distinct sentiment labels present in the training data.
pd.unique(train_dir['Sentiment'])

In [None]:
# Class balance of the training labels.
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8*' and
# later dropped the old names, so use whichever name this install provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)

fig, ax = plt.subplots(figsize=(7, 5))
sns.countplot(data=train_dir, x='Sentiment', ax=ax)
ax.set_title('Training phrases per sentiment class')
plt.show()

In [None]:
# Row counts: training frame first, then test frame.
for frame in (train_dir, test_dir):
    print(len(frame))

In [None]:
# Peek at the first five test rows.
test_dir.head(n=5)

In [None]:
# Display the training frame as loaded.
train_dir

In [None]:
# Missing-value count per column of the training frame.
train_dir.isnull().sum()

In [None]:
# Feature frame: every training column except the target.
X = train_dir.drop(columns='Sentiment')
X

In [None]:
# Target column: the sentiment label of each phrase.
y = train_dir.loc[:, 'Sentiment']
y

In [None]:
import nltk
import re
from nltk.stem import WordNetLemmatizer

In [None]:
# Single lemmatizer instance reused by the train and test cleaning loops below.
lemmatizer = WordNetLemmatizer()

In [None]:

# Clean every training phrase: keep letters only, lowercase, drop English
# stop words, lemmatize the remaining tokens and re-join them with spaces.

# Build the stop-word set once; membership tests on a set are O(1), whereas
# calling stopwords.words() inside the loop rebuilds a list for every token.
stop_words = set(nltk.corpus.stopwords.words('english'))

corpus = []
# Iterate the column directly so the loop does not depend on index labels.
for phrase in X['Phrase']:
    tokens = re.sub('[^a-zA-Z]', ' ', phrase).lower().split()
    kept = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    corpus.append(' '.join(kept))

In [None]:
# Preview only the first few cleaned phrases; echoing the whole list would
# print one entry per training row.
corpus[:5]

In [None]:
import keras 
from keras.utils import to_categorical

In [None]:
# One-hot encode the labels straight from the source column so that
# re-running this cell does not re-encode an already encoded array.
# num_classes matches the 5-unit softmax output layer of the model.
y = to_categorical(train_dir['Sentiment'], num_classes=5)
y

In [None]:
# Drop the id column from the test frame. drop(..., inplace=True) returns
# None, so binding its result was meaningless; rebind the returned frame
# instead. errors='ignore' keeps the cell safe to re-run.
test_dir = test_dir.drop(columns='PhraseId', errors='ignore')

In [None]:
# Display the test frame after the PhraseId column has been dropped.
test_dir

In [None]:
# Clean every test phrase with the same steps used for the training corpus:
# letters only, lowercase, stop words removed, tokens lemmatized.

# Build the stop-word set once; membership tests on a set are O(1), whereas
# calling stopwords.words() inside the loop rebuilds a list for every token.
stop_words = set(nltk.corpus.stopwords.words('english'))

test_corpus = []
# Iterate the column directly so the loop does not depend on index labels.
for phrase in test_dir['Phrase']:
    tokens = re.sub('[^a-zA-Z]', ' ', phrase).lower().split()
    kept = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    test_corpus.append(' '.join(kept))

In [None]:
# Preview only the first few cleaned test phrases; echoing the whole list
# would print one entry per test row.
test_corpus[:5]

In [None]:
# Frequency table: token -> number of occurrences across the cleaned corpus.
word2count = {}

for sentence in corpus:
    for token in nltk.word_tokenize(sentence):
        word2count[token] = word2count.get(token, 0) + 1

In [None]:
# Number of distinct tokens found in the training corpus.
len(word2count)

In [None]:
import heapq

In [None]:
# Keep the 5000 most frequent tokens, most frequent first.
top_n = 5000
word_freq = heapq.nlargest(top_n, word2count, key=lambda token: word2count[token])

In [None]:
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding,Dense,LSTM,Dropout
from keras.preprocessing.text import one_hot

In [None]:
# Encode each cleaned training phrase as a list of integer word ids,
# with the id space sized by the retained vocabulary.
vocab_size = len(word_freq)
one_hot_train = [one_hot(sentence, vocab_size) for sentence in corpus]

In [None]:
# Preview the first five encoded training phrases.
one_hot_train[:5]

In [None]:
# Encode each cleaned test phrase with the same id space as the training data.
one_hot_test = [one_hot(sentence, vocab_size) for sentence in test_corpus]

In [None]:
# Preview the first two encoded test phrases.
one_hot_test[:2]

In [None]:
# Left-pad every encoded training phrase to a fixed length of 20 ids.
length = 20
train_embedded_sents = pad_sequences(
    one_hot_train,
    maxlen=length,
    padding='pre',
)

In [None]:
# Left-pad the encoded test phrases to the same fixed length as the training set.
test_embedded_sents = pad_sequences(
    one_hot_test,
    maxlen=length,
    padding='pre',
)

In [None]:
# Preview the first two padded training sequences.
train_embedded_sents[:2]

In [None]:
# Preview the first two padded test sequences.
test_embedded_sents[:2]

In [None]:
# Embedding -> LSTM classifier with a 5-way softmax output.
embedding_feature_vectors = 40
model = Sequential([
    Embedding(vocab_size, embedding_feature_vectors, input_length=length),
    Dropout(0.2),
    LSTM(100),
    Dropout(0.3),
    Dense(5, activation='softmax'),
])

In [None]:
# Categorical cross-entropy pairs with the one-hot encoded targets.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Final model inputs and targets as NumPy arrays.
X_final, y_final = np.asarray(train_embedded_sents), np.asarray(y)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the training data for validation. Fixing random_state makes
# the split, and therefore the reported validation metrics, reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_final,
    y_final,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Number of samples in the training split.
X_train.shape[0]

In [None]:
# Number of samples in the validation split.
X_valid.shape[0]

In [None]:
# Train for 20 epochs, evaluating on the validation split after each epoch.
history = model.fit(
    X_train,
    y_train,
    batch_size=128,
    epochs=20,
    validation_data=(X_valid, y_valid),
)

In [None]:
# Training and validation curves, one panel per recorded metric.
plt.style.use('dark_background')
fig, ax = plt.subplots(2, 2, figsize=(10, 10))

# Take the x-axis from the recorded history instead of hard-coding the epoch
# count, so the plot stays valid if the number of epochs changes.
epoch_axis = np.arange(1, len(history.history['loss']) + 1)

panels = [
    ('loss', 'Training Loss vs Epochs', ax[0, 0]),
    ('accuracy', 'Training Accuracy vs Epochs', ax[0, 1]),
    ('val_loss', 'Validation Loss vs Epochs', ax[1, 0]),
    ('val_accuracy', 'Validation Accuracy vs Epochs', ax[1, 1]),
]
for metric, title, panel in panels:
    sns.lineplot(x=epoch_axis, y=history.history.get(metric), ax=panel)
    panel.set_title(title)
plt.show()

In [None]:
# Padded test sequences as a NumPy array, used as the prediction input.
test = np.asarray(test_embedded_sents)
test

In [None]:
# Load the sample submission file as the template for the predictions.
sample_submission_path = '../input/movie-review-sentiment-analysis-kernels-only/sampleSubmission.csv'
sub = pd.read_csv(sample_submission_path)

In [None]:
# Predicted class = index of the largest softmax probability.
# Sequential.predict_classes is removed in newer TensorFlow/Keras releases;
# argmax over predict() gives the same labels and works on old and new versions.
test_probabilities = model.predict(test, batch_size=128)
sub['labels'] = np.argmax(test_probabilities, axis=-1)
sub

In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix

# Measure accuracy on the held-out validation split, whose true labels are known.
# NOTE(review): the 'Sentiment' column of sampleSubmission.csv is presumably a
# placeholder rather than ground truth, so scoring test predictions against it
# says nothing about model quality - confirm against the competition data.
valid_true = np.argmax(y_valid, axis=1)
valid_pred = np.argmax(model.predict(X_valid, batch_size=128), axis=1)
score = accuracy_score(valid_true, valid_pred)
print(score)

# Per-class breakdown: rows are true labels, columns are predicted labels.
confusion_matrix(valid_true, valid_pred)