In [None]:
# Install to have latest seaborn version to use histplot
!pip install seaborn==0.11.0

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords,wordnet
from nltk.tokenize import word_tokenize
import string
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file below the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# NLTK resources used further down in this notebook:
#   wordnet                     -> WordNetLemmatizer
#   stopwords                   -> stopwords.words('english')
#   punkt                       -> word_tokenize
#   averaged_perceptron_tagger  -> nltk.pos_tag
# nltk.download() skips packages that are already up to date, so this cell is safe to re-run.
# NOTE(review): newer NLTK releases may look for differently named tokenizer/tagger
# packages — confirm against the installed version.
for resource in ('wordnet', 'stopwords', 'punkt', 'averaged_perceptron_tagger'):
    nltk.download(resource)

### Functions Defined

## 1) Load Data

In [None]:
def load_data(path, file):
    """Read a CSV file into a DataFrame.

    Parameters
    ----------
    path : str
        Directory that contains the file. A trailing separator is optional.
    file : str
        Name of the CSV file inside ``path``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    # os.path.join inserts the separator only when it is missing, so both
    # '/data/' and '/data' work (plain string concatenation broke the latter).
    return pd.read_csv(os.path.join(path, file))

In [None]:
# All three splits live in the same Kaggle dataset directory.
train_path = valid_path = test_path = '/kaggle/input/imdb-dataset-sentiment-analysis-in-csv-format/'
train_file, valid_file, test_file = 'Train.csv', 'Valid.csv', 'Test.csv'

# Read each split into its own DataFrame.
train_data = load_data(train_path, train_file)
valid_data = load_data(valid_path, valid_file)
test_data = load_data(test_path, test_file)

In [None]:
# Peek at the first 10 rows of the training split.
train_data.head(10)

## 2) Data Exploration

In [None]:
# Statistics and details
print('Columns:',train_data.columns)
print('Shape:', train_data.shape)
print('Stats:',train_data.describe(include='object'))
print('Class Distribution:',train_data['label'].value_counts())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() emitted the report followed by a stray "Info: None".
print('Info:')
train_data.info()

In [None]:
# Bar chart of the number of reviews per label.
label_fig = plt.figure(figsize=(12, 6))
sns.countplot(data=train_data, x='label', ax=label_fig.gca())

In [None]:
# Number of whitespace-separated tokens per review.
# `.str.len()` applied directly to the text counted characters, not words;
# splitting first gives the word count the column name promises.
# (Lower-casing is dropped because case does not affect the count.)
train_data['word_count'] = train_data['text'].str.split().str.len()
train_data.head()

In [None]:
# Kernel density estimate of the per-review word count.
plt.figure(figsize=(12,6))
# `fill` replaces the deprecated `shade` keyword; it is available in the
# seaborn 0.11 release this notebook installs.
sns.kdeplot(train_data['word_count'],fill=True,color='r').set_title('Kernel Distribution of Number of words')

In [None]:
# Report the seaborn version in use, then draw the word-count histogram.
print(sns.__version__)
word_count_hist = sns.histplot(x="word_count", data=train_data)
word_count_hist.set_title('Word Count Distribution')

In [None]:
# Word-count density per sentiment class (positive in blue, negative in red).
# The two label filters were swapped: the word-cloud cell further down treats
# label 1 as positive and label 0 as negative, so the same mapping is used here.
# NOTE(review): assumes label 1 = positive — confirm against the dataset description.
positive_wordcnt = train_data['word_count'][train_data['label']==1]
# `fill` replaces the deprecated `shade` keyword.
pos_plot = sns.kdeplot(positive_wordcnt,color='b',fill=True)
negative_wordcnt = train_data['word_count'][train_data['label']==0]
neg_plot = sns.kdeplot(negative_wordcnt,color='r',fill=True)

In [None]:
# Show the punctuation characters defined by the standard library's `string` module.
print(string.punctuation)

In [None]:
# Text clean-up: lower-case, drop stop words, strip punctuation
def preprocess_text(data):
    """Normalise the ``text`` column and recompute ``word_count``.

    Steps, in order: lower-case the text, drop tokens that exactly match an
    NLTK English stop word, delete the punctuation characters listed in
    ``punct``, then count the remaining whitespace-separated tokens.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a string ``text`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with ``text`` cleaned and ``word_count`` set.
    """
    # A set gives constant-time membership tests; the list returned by NLTK
    # would otherwise be scanned linearly for every token.
    stop = set(stopwords.words('english'))
    # Raw string: the backslash is a literal character to strip, not the start
    # of an (invalid) escape sequence. '.' is not in this set, so periods stay.
    punct = r'''!"#$%&'()*+,-/:;<=>?@[\]^_`{|}~'''
    strip_punct = str.maketrans('', '', punct)

    # Work on a copy so the caller's frame is left untouched and the function
    # can be re-run on the same input.
    data = data.copy()
    cleaned = data['text'].str.lower()
    # NOTE(review): stop words are matched before punctuation is stripped, so
    # a token such as "the," is kept — confirm this ordering is intended.
    cleaned = cleaned.apply(lambda x: ' '.join(w for w in x.split() if w not in stop))
    cleaned = cleaned.str.translate(strip_punct)
    data['text'] = cleaned
    data['word_count'] = cleaned.str.split().str.len()
    return data
# Clean the training split and preview the first rows of the result.
train_data = preprocess_text(train_data)
train_data.head()

In [None]:
# Word-count density per sentiment class after text clean-up
# (positive in blue, negative in red).
# The two label filters were swapped: the word-cloud cell further down treats
# label 1 as positive and label 0 as negative, so the same mapping is used here.
# NOTE(review): assumes label 1 = positive — confirm against the dataset description.
positive_wordcnt = train_data['word_count'][train_data['label']==1]
# `fill` replaces the deprecated `shade` keyword.
pos_plot = sns.kdeplot(positive_wordcnt,color='b',fill=True)
negative_wordcnt = train_data['word_count'][train_data['label']==0]
neg_plot = sns.kdeplot(negative_wordcnt,color='r',fill=True)

In [None]:
def pos_tag(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag``; the first letter of
    the resulting tag selects adjective, noun, verb or adverb. Any other tag
    falls back to noun.
    """
    first_letter = nltk.pos_tag([word])[0][1][0].upper()
    if first_letter == "J":
        return wordnet.ADJ
    if first_letter == "V":
        return wordnet.VERB
    if first_letter == "R":
        return wordnet.ADV
    # "N" and every unmapped tag resolve to noun.
    return wordnet.NOUN

In [None]:
# Plain-text preview of the first rows, followed by the frame's shape.
print(train_data.head(), train_data.shape, sep='\n')

In [None]:
def lemmatize(data):
    """Lemmatize every token of the ``text`` column.

    Each text is split with ``word_tokenize``; every token is lemmatized using
    the WordNet part of speech returned by ``pos_tag`` and the results are
    joined back with single spaces.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a string ``text`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` whose ``text`` column holds the lemmatized text.
    """
    lemmatizer = WordNetLemmatizer()

    def lemmatize_text(text):
        # One lemma per token, looked up with that token's own POS tag.
        return ' '.join(lemmatizer.lemmatize(w, pos_tag(w)) for w in word_tokenize(text))

    # Work on a copy so the caller's frame is left untouched.
    data = data.copy()
    data['text'] = data['text'].apply(lemmatize_text)
    return data

In [None]:
## Normalization: lemmatize the cleaned training text.
## Author's note: lemmatizing without a part-of-speech tag left the words
## unchanged, hence `lemmatize` supplies one via `pos_tag`.
train_data = lemmatize(train_data)
train_data

In [None]:
# Largest per-review word count. Series.max() is vectorised and skips NaN,
# whereas the builtin max() iterates the Series element by element.
max_count = train_data['word_count'].max()
print(max_count)

In [None]:
# Show the lemmatized text of the row labelled 0.
train_data.loc[0, 'text']

In [None]:
## Plotting word cloud
def plot_cloud(wordcloud):
    """Render a word cloud as an image on a new 40x30 inch figure."""
    canvas_size = (40, 30)
    plt.figure(figsize=canvas_size)
    plt.imshow(wordcloud)

In [None]:
# Split the review texts by sentiment for the word clouds
# (label 1 -> positive, label 0 -> negative).
positive = train_data.loc[train_data['label'] == 1, 'text']
negative = train_data.loc[train_data['label'] == 0, 'text']

# English stop words plus a few extra words excluded from the clouds.
stop = set(stopwords.words('english'))
stop.update(["br", "href","film","movie","one"])
print(stop)

# One cloud per sentiment, each generated from all texts of that class joined by spaces.
pos_wordcloud = WordCloud(stopwords=stop).generate(' '.join(positive))
neg_wordcloud = WordCloud(stopwords=stop).generate(' '.join(negative))


In [None]:
# Word cloud built from the reviews selected as positive.
plot_cloud(pos_wordcloud)

In [None]:
# Word cloud built from the reviews selected as negative.
plot_cloud(neg_wordcloud)

In [None]:
# Keep only the text and label columns (this drops word_count).
train_data = train_data.loc[:, ['text', 'label']]
train_data

### 3) Do pre-processing for test and valid dataset

In [None]:
# Apply the training pipeline (clean-up, then lemmatization) to the other two splits.
valid_data = lemmatize(preprocess_text(valid_data))
test_data = lemmatize(preprocess_text(test_data))
test_data

In [None]:
# Features and labels are kept as one-column DataFrames.
x_valid_data, y_valid_data = valid_data[['text']], valid_data[['label']]
x_test_data, y_test_data = test_data[['text']], test_data[['label']]

### 4) Building model

In [None]:
# Unigram bag-of-words features weighted by TF-IDF.
# The vocabulary is capped at 1000 terms; terms in fewer than 5 documents or
# in more than 70% of them are ignored.
tfidf_converter = TfidfVectorizer(
    max_features=1000,
    min_df=5,
    max_df=0.7,
)
# Fit on the training text only; the result is converted to a dense array.
x = tfidf_converter.fit_transform(train_data['text']).toarray()
y = train_data['label']

In [None]:
# Bigram bag-of-words features weighted by TF-IDF.
# Same vocabulary limits as the unigram vectorizer, but only two-word
# sequences are extracted (ngram_range=(2, 2)).
ntfidf_converter = TfidfVectorizer(
    max_features=1000,
    min_df=5,
    max_df=0.7,
    ngram_range=(2, 2),
)
# Fit on the training text only; the result is converted to a dense array.
nx = ntfidf_converter.fit_transform(train_data['text']).toarray()
ny = train_data['label']

In [None]:
# Shape of the unigram feature matrix, followed by the training labels.
print(x.shape, y, sep='\n')

In [None]:
# Validation labels as a NumPy column vector of shape (n, 1).
y_valid = np.reshape(np.array(y_valid_data), (-1, 1))
print(y_valid)

In [None]:
# Model choice: Gaussian Naive Bayes.
# Bayes' rule: p(sent|word) = p(sent) * p(word|sent) / p(word)
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

# One classifier on the unigram features ...
gnb = GaussianNB().fit(x, y)

# ... and a second one on the bigram features.
ngram_gnb = GaussianNB()
ngram_gnb.fit(nx, ny)


In [None]:
#Save model
import pickle
# Serialise both fitted Naive Bayes models to in-memory byte strings.
gnb_model, ngram_gnb_model = pickle.dumps(gnb), pickle.dumps(ngram_gnb)

In [None]:
# Project the validation and test text onto the unigram TF-IDF vocabulary
# learned from the training split (transform only, no re-fitting).
x_val_data, x_tst_data = (
    tfidf_converter.transform(split['text']).toarray()
    for split in (x_valid_data, x_test_data)
)

In [None]:
# Bigram counterpart: project the validation and test text onto the bigram
# TF-IDF vocabulary learned from the training split.
nx_valid_data, nx_test_data = (
    ntfidf_converter.transform(split['text']).toarray()
    for split in (x_valid_data, x_test_data)
)

In [None]:
#Load the model
# The same names hold the pickled bytes before this cell and the restored
# models after it. Unpickle only while a name still refers to bytes, so the
# cell can be re-run without raising.
if isinstance(gnb_model, bytes):
    gnb_model = pickle.loads(gnb_model)

if isinstance(ngram_gnb_model, bytes):
    ngram_gnb_model = pickle.loads(ngram_gnb_model)

In [None]:
# Unigram Naive Bayes: accuracy on the validation split.
ypred_valid = gnb_model.predict(x_val_data)
print('Valid Accuracy:', accuracy_score(y_true=y_valid_data, y_pred=ypred_valid))

# Unigram Naive Bayes: accuracy, confusion matrix and per-class report on the test split.
ypred_test = gnb_model.predict(x_tst_data)
print('Test Accuracy:', accuracy_score(y_true=y_test_data, y_pred=ypred_test))
print(confusion_matrix(y_true=y_test_data, y_pred=ypred_test))
print('Classification Report:', classification_report(y_true=y_test_data, y_pred=ypred_test))

In [None]:
#ngram
# Predict valid
nypred_valid = ngram_gnb_model.predict(nx_valid_data)
print('Valid Accuracy:',accuracy_score(y_valid_data,nypred_valid))

#Predict Test
nypred_test = ngram_gnb_model.predict(nx_test_data)
print('Test Accuracy:',accuracy_score(y_test_data,nypred_test))
# The confusion matrix must use the bigram predictions; it previously reused
# `ypred_test`, i.e. the predictions of a different model.
print(confusion_matrix(y_test_data,nypred_test))
print('Classification Report:',classification_report(y_test_data,nypred_test))

In [None]:
# Spot-check the unigram Naive Bayes model on a few hand-written sentences.
text = tfidf_converter.transform([
    'that is too bad.but i can assure it can be made better',
    'not sure if the climax really did well',
    'as expected',
]).toarray()
print(gnb_model.predict(text))

In [None]:
# Spot-check the bigram Naive Bayes model on a few hand-written sentences.
text = ntfidf_converter.transform([
    'that is not bad.but i can assure it can be made better',
    'not sure if the climax really did well',
    'as expected',
]).toarray()
print(ngram_gnb_model.predict(text))

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with trees limited to depth 4 and a fixed seed, trained on the unigram features.
rf = RandomForestClassifier(random_state=0, max_depth=4)
rf.fit(X=x, y=y)

In [None]:
# Serialise the fitted forest to an in-memory byte string.
rf_model = pickle.dumps(rf)

# Unigram TF-IDF features of the validation and test text for the forest.
x_rf_valid_data, x_rf_test_data = (
    tfidf_converter.transform(split['text']).toarray()
    for split in (x_valid_data, x_test_data)
)


In [None]:
#Load the model
# `rf_model` holds pickled bytes until this cell has run once; unpickle only
# in that case so the cell can be re-run without raising.
if isinstance(rf_model, bytes):
    rf_model = pickle.loads(rf_model)

In [None]:
# Random forest: accuracy on the validation split.
ypred_valid = rf_model.predict(x_rf_valid_data)
print('RF Valid Accuracy:', accuracy_score(y_true=y_valid_data, y_pred=ypred_valid))

# Random forest: accuracy on the test split.
ypred_test = rf_model.predict(x_rf_test_data)
print('RF Test Accuracy:', accuracy_score(y_true=y_test_data, y_pred=ypred_test))

In [None]:
# Spot-check the random forest on two hand-written sentences.
text = tfidf_converter.transform([
    'that is too bad.but i can assure it can be made better',
    'not sure if the climax really diid well',
]).toarray()
print(rf_model.predict(text))

## Boosting

In [None]:
#Boosting
import sklearn
print(sklearn.__version__)
from sklearn.ensemble import AdaBoostClassifier
# Fixed seed, as for the random forest earlier in the notebook, so that
# re-running the notebook reproduces the same scores.
ada = AdaBoostClassifier(random_state=0)
ada.fit(x,y)

In [None]:
# Unigram TF-IDF features of the validation and test text for the boosted model.
x_val_data = tfidf_converter.transform(x_valid_data.loc[:, 'text']).toarray()
x_tst_data = tfidf_converter.transform(x_test_data.loc[:, 'text']).toarray()

In [None]:
#Save model
# Serialise the fitted AdaBoost classifier to an in-memory byte string.
ada_model = pickle.dumps(ada)


In [None]:
#Predict valid and test data
# `ada_model` holds pickled bytes until this cell has run once; unpickle only
# in that case so the cell can be re-run without raising.
if isinstance(ada_model, bytes):
    ada_model = pickle.loads(ada_model)
y_ada_val_pred = ada_model.predict(x_val_data)
print("Valid data Accuracy is :",accuracy_score(y_valid_data,y_ada_val_pred))

#Predict Test
y_ada_tst_pred = ada_model.predict(x_tst_data)
print('Test Accuracy:',accuracy_score(y_test_data,y_ada_tst_pred))
print(confusion_matrix(y_test_data,y_ada_tst_pred))
print('Classification Report:',classification_report(y_test_data,y_ada_tst_pred))

In [None]:
# Spot-check the AdaBoost model on a few short hand-written phrases.
text = tfidf_converter.transform([
    'not good',
    'worth watching',
    'bad',
    'grew up listening to this.awful',
]).toarray()
print(ada_model.predict(text))

In [None]:
# Spot-check the AdaBoost model on the sentences used for the Naive Bayes check.
text = tfidf_converter.transform([
    'that is too bad.but i can assure it can be made better',
    'not sure if the climax really did well',
    'as expected',
]).toarray()
print(ada_model.predict(text))