# Load basic packages/modules

In [1]:
import os
import numpy as np
import pandas as pd

import nltk
from nltk import corpus, tokenize
from nltk.corpus import stopwords

import re

from nltk.stem import PorterStemmer, WordNetLemmatizer, porter
from wordcloud import WordCloud, STOPWORDS

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import string

from sklearn.model_selection import train_test_split

# ML
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# DNN/MLP/ANN model

# Load Review Dataset

In [25]:
hotstar = pd.read_csv("hotstar_reviews.csv")
hotstar.head()

Unnamed: 0,ID,UserName,Created_Date,Reviews,Lower_Case_Reviews,Sentiment_Manual_BP,Sentiment_Manual,Review_Length,DataSource,Year,Month,Date,Sentiment_Polarity
0,1,,08-10-2017,Hh,hh,Negative,Negative,2,Google_PlayStore,2017,8,10,Neutral
1,2,,08-11-2017,No,no,Negative,Negative,2,Google_PlayStore,2017,8,11,Neutral
2,3,asadynwa,08-12-2017,@hotstar_helps during paymnt for premium subsc...,@hotstar_helps during paymnt for premium subsc...,Help,Negative,140,Twitter,2017,8,12,Negative
3,4,jineshroxx,08-11-2017,@hotstartweets I am currently on Jio network a...,@hotstartweets i am currently on jio network a...,Help,Negative,140,Twitter,2017,8,11,Positive
4,5,YaminiSachar,08-05-2017,@hotstartweets the episodes of Sarabhai vs Sar...,@hotstartweets the episodes of sarabhai vs sar...,Help,Negative,140,Twitter,2017,8,5,Neutral


In [3]:
hotstar.shape

(5053, 13)

In [4]:
hotstar.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5053 entries, 0 to 5052
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ID                   5053 non-null   int64 
 1   UserName             4331 non-null   object
 2   Created_Date         5053 non-null   object
 3   Reviews              5053 non-null   object
 4   Lower_Case_Reviews   5053 non-null   object
 5   Sentiment_Manual_BP  5053 non-null   object
 6   Sentiment_Manual     5053 non-null   object
 7   Review_Length        5053 non-null   int64 
 8   DataSource           5053 non-null   object
 9   Year                 5053 non-null   int64 
 10  Month                5053 non-null   int64 
 11  Date                 5053 non-null   int64 
 12  Sentiment_Polarity   5053 non-null   object
dtypes: int64(5), object(8)
memory usage: 513.3+ KB


In [5]:
hotstar.Sentiment_Manual.value_counts()

Neutral     1738
Positive    1733
Negative    1582
Name: Sentiment_Manual, dtype: int64

In [7]:
hotstar.Sentiment_Manual.value_counts()/hotstar.Sentiment_Manual.size*100

Neutral     34.395409
Positive    34.296458
Negative    31.308134
Name: Sentiment_Manual, dtype: float64

In [8]:
hotstar.DataSource.value_counts()/hotstar.DataSource.size*100

Twitter             55.927172
Google_PlayStore    44.072828
Name: DataSource, dtype: float64

In [10]:
pd.pivot_table(hotstar, index='Sentiment_Manual', columns='DataSource',
              values='ID', aggfunc='count')/hotstar.DataSource.size*100

DataSource,Google_PlayStore,Twitter
Sentiment_Manual,Unnamed: 1_level_1,Unnamed: 2_level_1
Negative,12.923016,18.385118
Neutral,5.125668,29.269741
Positive,26.024144,8.272313


In [11]:
hotstar.columns

Index(['ID', 'UserName', 'Created_Date', 'Reviews', 'Lower_Case_Reviews',
       'Sentiment_Manual_BP', 'Sentiment_Manual', 'Review_Length',
       'DataSource', 'Year', 'Month', 'Date', 'Sentiment_Polarity'],
      dtype='object')

# Data Cleaning

In [26]:
review_data = hotstar[['Lower_Case_Reviews','DataSource','Sentiment_Manual']]
review_data.head()

Unnamed: 0,Lower_Case_Reviews,DataSource,Sentiment_Manual
0,hh,Google_PlayStore,Negative
1,no,Google_PlayStore,Negative
2,@hotstar_helps during paymnt for premium subsc...,Twitter,Negative
3,@hotstartweets i am currently on jio network a...,Twitter,Negative
4,@hotstartweets the episodes of sarabhai vs sar...,Twitter,Negative


In [27]:
review_data.columns = ['Reviews','Source','Sentiment']
review_data.head()

Unnamed: 0,Reviews,Source,Sentiment
0,hh,Google_PlayStore,Negative
1,no,Google_PlayStore,Negative
2,@hotstar_helps during paymnt for premium subsc...,Twitter,Negative
3,@hotstartweets i am currently on jio network a...,Twitter,Negative
4,@hotstartweets the episodes of sarabhai vs sar...,Twitter,Negative


In [14]:
review_data.describe(include='object')

Unnamed: 0,Reviews,Source,Sentiment
count,5053,5053,5053
unique,5053,2,3
top,hh,Twitter,Neutral
freq,1,2826,1738


In [15]:
review_data.Source.value_counts()

Twitter             2826
Google_PlayStore    2227
Name: Source, dtype: int64

In [28]:
review_data['Source'] = review_data['Source'].astype('category')
review_data['Source'] = review_data['Source'].cat.codes
review_data['Source'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  review_data['Source'] = review_data['Source'].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  review_data['Source'] = review_data['Source'].cat.codes


1    2826
0    2227
Name: Source, dtype: int64

In [29]:
review_data.head()

Unnamed: 0,Reviews,Source,Sentiment
0,hh,0,Negative
1,no,0,Negative
2,@hotstar_helps during paymnt for premium subsc...,1,Negative
3,@hotstartweets i am currently on jio network a...,1,Negative
4,@hotstartweets the episodes of sarabhai vs sar...,1,Negative


In [44]:
punctuation = list(string.punctuation)

stop_words = stopwords.words('english')

re_pattern = """@[a-zA-Z0-9_:]+|b['"]rt|[\d]+[a-zA-Z_+='?]+[\d]+[\d]+|[a-zA-Z_*+=]+[\d]+[a-zA-Z_*+-=]+|[\d]+"""

re_pattern = re_pattern + """|https:+[a-zA-Z0-9/._+-=]+|&amp;|rt"""

reviewText = [re.sub(pattern = re_pattern, string = text, repl="") 
               for text in review_data.Reviews.map(str).values]

In [46]:
re_pattern

'@[a-zA-Z0-9_:]+|b[\'"]rt|[\\d]+[a-zA-Z_+=\'?]+[\\d]+[\\d]+|[a-zA-Z_*+=]+[\\d]+[a-zA-Z_*+-=]+|[\\d]+|https:+[a-zA-Z0-9/._+-=]+|&amp;|rt'

In [48]:
print(reviewText[7])

 why today s epi of #lovekahaiintezaar nt available on available now it was in d morning now showing nt available due expiry


In [None]:
# re.sub('[^a-zA-Z]',' ',Review[i])

In [None]:
# @hotstar_helps during paymnt for premium subscription the transaction failed twice but i have not 
# received refund for one of the transaction

In [49]:
review_data_cleaned = []

for review in reviewText:
    stop_free = " ".join([txt for txt in review.lower().split() if txt not in stop_words])
    # stop_words - NLTK
    stop_free_1 = " ".join([txt for txt in stop_free.lower().split() if txt not in STOPWORDS])
    # STOPWORDS - WORDCLOUD
    puct_free = " ".join([txt for txt in stop_free_1.lower().split() if txt not in punctuation])
    review_data_cleaned.append(puct_free)

In [64]:
review_data_cleaned[7]

'today epi #lovekahaiintezaar nt available available morning showing nt available due expiry'

In [65]:
# remove hashtags
review_data_cleaned_final = []

for rdcf in review_data_cleaned:
    final_words = rdcf.replace("#",'')
    review_data_cleaned_final.append(final_words)


In [66]:
review_data_cleaned_final[7]

'today epi lovekahaiintezaar nt available available morning showing nt available due expiry'

In [68]:
review_data_cleaned_final[57]

'need buy rs premier league pack compulsory watch pl matches already premium pack activated'

In [52]:
len(review_data_cleaned)

5053

In [69]:
"""
corpus = []
for i in range(len(review_data_cleaned)):
    review = re.sub('[^a-zA-Z]',' ',review_data_cleaned[i])
    review = ''.join(review)
    corpus.append(review)
    
"""

"\ncorpus = []\nfor i in range(len(review_data_cleaned)):\n    review = re.sub('[^a-zA-Z]',' ',review_data_cleaned[i])\n    review = ''.join(review)\n    corpus.append(review)\n    \n"

In [70]:
# Lemmatization

wd = WordNetLemmatizer()

review_data_cleaned_final_output = []
for rdc in review_data_cleaned_final:
    clean_review = " ".join(wd.lemmatize(word) for word in rdc.split())
    review_data_cleaned_final_output.append(clean_review)

In [71]:
review_data_cleaned_final_output

['hh',
 '',
 'paymnt premium subscription transaction failed twice received refund one transaction',
 'currently jio network know whether able watch epl telecasted star spos select hd',
 'episode sarabhai v sarabhai season downloadable able watch offline please smthng',
 'able watch latest episode got app allow take screenshot error help resolve asap',
 'please allow rupay maestro payment gateway premium membership mean paytm work thru debit card great',
 'today epi lovekahaiintezaar nt available available morning showing nt available due expiry',
 'hotstarfraud paid subscription july havent received cashback specified hdfc card',
 'premium accnt hotstar showing tht premium member u pls chk',
 'seeing blank page term amp condition hdfc bank cashback offer hotstar premium membership please help',
 'sir please allow u download video ur app present option allow u dwnld mre video due ltd space',
 'hi pl tab spos homepage isl bundesliga search team name stream pls look',
 'unable watch star

In [72]:
review_data['clean_review'] = review_data_cleaned_final_output

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  review_data['clean_review'] = review_data_cleaned_final_output


In [73]:
review_data.head()

Unnamed: 0,Reviews,Source,Sentiment,clean_review
0,hh,0,Negative,hh
1,no,0,Negative,
2,@hotstar_helps during paymnt for premium subsc...,1,Negative,paymnt premium subscription transaction failed...
3,@hotstartweets i am currently on jio network a...,1,Negative,currently jio network know whether able watch ...
4,@hotstartweets the episodes of sarabhai vs sar...,1,Negative,episode sarabhai v sarabhai season downloadabl...


In [74]:
# split the data into training and test

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(review_data['clean_review'],
                                                review_data['Sentiment'],test_size=0.2,
                                                   random_state=42,
                                                   stratify=review_data['Sentiment'])

In [77]:
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(4042,) (1011,) (4042,) (1011,)
