In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gensim
import nltk

In [None]:
# Directory holding the pre-trained GoogleNews word2vec binary; list it first
# so a missing or misplaced file is easy to spot.
embeddings_dir = "../input/embeddings/GoogleNews-vectors-negative300/"
print(os.listdir(embeddings_dir))

# Load the binary word2vec file as gensim KeyedVectors.
path = embeddings_dir + "GoogleNews-vectors-negative300.bin"
embeddings = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)

In [None]:
# Read the Hotstar reviews CSV straight from GitHub and preview the manually
# assigned sentiment labels.
url = 'https://raw.githubusercontent.com/skathirmani/datasets/master/hotstar.allreviews_Sentiments.csv'
df_hotstar = pd.read_csv(filepath_or_buffer=url)
df_hotstar.loc[:, 'Sentiment_Manual'].head()


In [None]:
# Preview the first rows of the full reviews table (all columns).
df_hotstar.head()

In [None]:
# Class balance: number of reviews per manually assigned sentiment label.
df_hotstar['Sentiment_Manual'].value_counts()

In [None]:
# NLTK resources used further down:
#   stopwords     -> nltk.corpus.stopwords.words('english')
#   vader_lexicon -> nltk.sentiment.SentimentIntensityAnalyzer
#   punkt         -> nltk.word_tokenize on older NLTK releases
#   punkt_tab     -> nltk.word_tokenize on newer NLTK releases, which load the
#                    tokenizer from this resource instead of 'punkt'
nltk.download('stopwords')
nltk.download('vader_lexicon')
nltk.download('punkt')
nltk.download('punkt_tab')

## Word Cloud

In [None]:
# One sub-frame per manual sentiment label, used for the per-class word clouds.
Neutral = df_hotstar.loc[df_hotstar['Sentiment_Manual'].eq('Neutral')]
Positive = df_hotstar.loc[df_hotstar['Sentiment_Manual'].eq('Positive')]
Negative = df_hotstar.loc[df_hotstar['Sentiment_Manual'].eq('Negative')]

In [None]:
# Lower-cased review text for each sentiment class.
Docs1 = Neutral['Lower_Case_Reviews']
Docs2 = Positive['Lower_Case_Reviews']
Docs3 = Negative['Lower_Case_Reviews']

# Number of reviews per class, one per line: Neutral, Positive, Negative.
print(len(Docs1), len(Docs2), len(Docs3), sep='\n')

In [None]:
# Use the %pip magic (not `!pip`) so the package is installed into the
# environment of the running kernel rather than whatever `pip` is on PATH.
%pip install wordcloud

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# English stop-word list from NLTK; passed to WordCloud below so these words
# are left out of the clouds.
StopWords = nltk.corpus.stopwords.words('english')

In [None]:
# Word cloud of the Neutral reviews.
# The reviews are joined with a space: joining with '' glues the last word of
# one review to the first word of the next and creates bogus tokens.
WC_Neutral = WordCloud(background_color='white', stopwords=StopWords).generate(' '.join(Docs1))
plt.imshow(WC_Neutral)

In [None]:
# Word cloud of the Positive reviews.
# The reviews are joined with a space: joining with '' glues the last word of
# one review to the first word of the next and creates bogus tokens.
WC_Positive = WordCloud(background_color='white', stopwords=StopWords).generate(' '.join(Docs2))
plt.imshow(WC_Positive)

In [None]:
# Word cloud of the Negative reviews.
# The reviews are joined with a space: joining with '' glues the last word of
# one review to the first word of the next and creates bogus tokens.
WC_Negative = WordCloud(background_color='white', stopwords=StopWords).generate(' '.join(Docs3))
plt.imshow(WC_Negative)

## Data Cleaning

In [None]:
# Normalise the review text: hyphens become spaces, everything is lower-cased,
# and any character other than a-z or a space is replaced by a space.
# `regex` is passed explicitly because the default of Series.str.replace is
# False in pandas >= 2.0, where '[^a-z ]' would otherwise be matched as a
# literal string and nothing would be cleaned.
Docs = df_hotstar['Lower_Case_Reviews']
Docs = (
    Docs.str.replace('-', ' ', regex=False)
        .str.lower()
        .str.replace('[^a-z ]', ' ', regex=True)
)

In [None]:
# Spot-check the normalised review text.
Docs.head()

In [None]:
StopWords = nltk.corpus.stopwords.words('english')
# Set copy of the stop-word list: membership is tested once per token, and a
# set lookup avoids scanning the whole list each time.
_stop_word_set = set(StopWords)


def clean_sentence(doc):
    """Drop English stop words from one review.

    The text is tokenised with ``nltk.word_tokenize``; tokens found in the
    NLTK English stop-word list are removed and the remaining tokens are
    joined back together with single spaces.
    """
    return ' '.join(word for word in nltk.word_tokenize(doc) if word not in _stop_word_set)


Docs_clean = Docs.apply(clean_sentence)
Docs_clean.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB , BernoulliNB
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

## Train-test split

In [None]:
# Hold out 20% of the cleaned reviews for testing; the fixed seed keeps the
# split identical between runs.
x_train, x_test, y_train, y_test = train_test_split(
    Docs_clean,
    df_hotstar['Sentiment_Manual'],
    test_size=0.2,
    random_state=100,
)

## Count Vectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Bag-of-words features. The vocabulary is learned on the training text only
# (terms must occur in at least 5 training documents) and then applied
# unchanged to the test text.
vec = CountVectorizer(min_df=5)
x_train = vec.fit_transform(x_train)
x_test = vec.transform(x_test)

## Multinomial Naive Bayes Classification

In [None]:
# Multinomial Naive Bayes: fit on the training features, then print the
# accuracy on the held-out set.
model_mnb = MultinomialNB()
model_mnb.fit(x_train, y_train)
test_pred_mnb = model_mnb.predict(x_test)
print(accuracy_score(y_test, test_pred_mnb))

## Ada Boost Count Vectorizer

In [None]:
# AdaBoost with 100 estimators and a fixed seed: fit on the training
# features, then print the accuracy on the held-out set.
model_ab = AdaBoostClassifier(n_estimators=100, random_state=99)
model_ab.fit(x_train, y_train)
test_pred_ab = model_ab.predict(x_test)
print(accuracy_score(y_test, test_pred_ab))

## Random Forest Count Vectorizer

In [None]:
# Random forest with 100 trees and a fixed seed: fit on the training
# features, then print the accuracy on the held-out set.
model_rf = RandomForestClassifier(n_estimators=100, random_state=99)
model_rf.fit(x_train, y_train)
test_pred_rf = model_rf.predict(x_test)
print(accuracy_score(y_test, test_pred_rf))

## Gradient Boost Count Vectorizer

In [None]:
# Gradient boosting with 100 estimators and a fixed seed: fit on the training
# features, then print the accuracy on the held-out set.
model_gb = GradientBoostingClassifier(n_estimators=100, random_state=99)
model_gb.fit(x_train, y_train)
test_pred_gb = model_gb.predict(x_test)
print(accuracy_score(y_test, test_pred_gb))

## TF-IDF Vectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Re-create the raw-text split (same seed as the count-vector section, so the
# same reviews land in train and test) because x_train / x_test were
# overwritten with count features above.
x_train, x_test, y_train, y_test = train_test_split(
    Docs_clean,
    df_hotstar['Sentiment_Manual'],
    test_size=0.2,
    random_state=100,
)

# TF-IDF features: vocabulary and IDF weights come from the training text only.
tfidf = TfidfVectorizer(min_df=5)
x_train = tfidf.fit_transform(x_train)
x_test = tfidf.transform(x_test)

## Multinomial Naive Bayes using TFIDF Vectorizer

In [None]:
# Multinomial Naive Bayes refit on the current x_train / y_train, with the
# accuracy on the held-out set printed afterwards.
model_mnb = MultinomialNB()
model_mnb.fit(x_train, y_train)
test_pred_mnb = model_mnb.predict(x_test)
print(accuracy_score(y_test, test_pred_mnb))

## Word2Vec

In [None]:
# Document embeddings: each cleaned review is represented by the mean of the
# word2vec vectors of its tokens that exist in the embedding vocabulary.
#
# The per-document means are collected in a list and turned into a DataFrame
# once. DataFrame.append was removed in pandas 2.0, and appending row by row
# copied the whole frame on every step.
# Out-of-vocabulary tokens are skipped with an explicit membership test
# instead of a bare `except`, so unrelated errors are no longer hidden.
doc_means = []
for doc in Docs_clean:
    word_vecs = [embeddings[word] for word in nltk.word_tokenize(doc) if word in embeddings]
    if word_vecs:
        doc_means.append(np.mean(word_vecs, axis=0, dtype=np.float64))
    else:
        # No token of this review has an embedding: keep an all-NaN row so
        # the review can be found and dropped in the cells below.
        doc_means.append(np.full(embeddings.vector_size, np.nan))

# One row per review, one column per embedding dimension; the index follows
# Docs_clean so rows stay aligned with df_hotstar.
docs_vectors = pd.DataFrame(doc_means, index=Docs_clean.index)
docs_vectors.shape

## Null vectors identification

In [None]:
# Number of missing embedding values per review, largest first, kept as a
# one-column frame (the column is labelled 0).
null_counts = docs_vectors.isnull().sum(axis=1)
null_vec = null_counts.sort_values(ascending=False).to_frame()

In [None]:
# Reviews with the most missing embedding values appear first.
null_vec.head()

In [None]:
# Index labels of reviews whose document vector is entirely null, i.e. the
# null count equals the number of embedding columns. The column count is read
# from docs_vectors instead of hard-coding 300.
nl = null_vec.index[null_vec[0] == docs_vectors.shape[1]].tolist()

In [None]:
# How many reviews have no usable document vector.
len(nl)

In [None]:
# Remove the reviews without a document vector from both the features and the
# labels so the two stay aligned row for row.
x = docs_vectors.drop(index=nl)
y = df_hotstar['Sentiment_Manual'].drop(index=nl)

In [None]:
# Sanity check: features and labels must have the same number of rows.
x.shape , y.shape

In [None]:
# 80/20 split of the document vectors and their labels, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=100,
)

## Random Forest Classifier Word2Vec

In [None]:
# Random forest on the current x_train / y_train. random_state is fixed (same
# value as the count-vector models) so the reported accuracy is reproducible
# instead of changing on every run.
model_rf = RandomForestClassifier(n_estimators=100, random_state=99).fit(x_train, y_train)
test_pred_rf = model_rf.predict(x_test)
print(accuracy_score(y_test, test_pred_rf))

## Ada Boost Classifier Word2Vec

In [None]:
# AdaBoost on the current x_train / y_train. random_state is fixed (same
# value as the count-vector models) so the reported accuracy is reproducible
# instead of changing on every run.
model_ab = AdaBoostClassifier(n_estimators=100, random_state=99).fit(x_train, y_train)
test_pred_ab = model_ab.predict(x_test)
print(accuracy_score(y_test, test_pred_ab))

## Gradient Boost Classifier Word2Vec

In [None]:
# Gradient boosting on the current x_train / y_train. random_state is fixed
# (same value as the count-vector models) so the reported accuracy is
# reproducible instead of changing on every run.
model_gb = GradientBoostingClassifier(n_estimators=100, random_state=99).fit(x_train, y_train)
test_pred_gb = model_gb.predict(x_test)
print(accuracy_score(y_test, test_pred_gb))

## Sentiment Prediction using VADER

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer

In [None]:
analyzer = SentimentIntensityAnalyzer()


def get_sentiment(sentence, analyzer=analyzer, threshold=0.1):
    """Label a sentence from its VADER compound score.

    Parameters
    ----------
    sentence : str
        Text to score.
    analyzer : object, optional
        Object whose ``polarity_scores(sentence)`` returns a mapping with a
        ``'compound'`` entry. Defaults to the module-level analyzer.
    threshold : float, optional
        Half-width of the neutral band around zero (default 0.1).

    Returns
    -------
    str
        ``'Positive'`` if compound > threshold, ``'Negative'`` if
        compound < -threshold, otherwise ``'Neutral'``.
    """
    compound = analyzer.polarity_scores(sentence)['compound']
    if compound > threshold:
        return 'Positive'
    # The lower bound must be the negated threshold. Comparing against
    # +threshold labelled everything below it Negative and left Neutral
    # reachable only at exactly the threshold value.
    if compound < -threshold:
        return 'Negative'
    return 'Neutral'

In [None]:
# Remove the 'Sentiment_Vader' column that comes with the loaded data so it
# can be recomputed below. errors='ignore' keeps the cell re-runnable: without
# it a second run raises KeyError because the column is already gone.
df_hotstar = df_hotstar.drop(columns=['Sentiment_Vader'], errors='ignore')

In [None]:
# Confirm the 'Sentiment_Vader' column is no longer present.
df_hotstar.head(2)

In [None]:
# Score every raw review with VADER and store the predicted label.
vader_labels = df_hotstar['Reviews'].map(get_sentiment)
df_hotstar['Sentiment_Vader'] = vader_labels

In [None]:
# Agreement between the VADER labels and the manual labels over all reviews.
accuracy_score(
    y_true=df_hotstar['Sentiment_Manual'],
    y_pred=df_hotstar['Sentiment_Vader'],
)

In [None]:
# Show the first two rows with the recomputed 'Sentiment_Vader' column.
df_hotstar.head(2)