# Hexaware Submission (Senti-Group)

### This Python notebook contains the code used to train our model and to collect the data behind every dashboard we built

## Install and Import necessary Packages

In [None]:
!pip install psycopg2-binary
!pip install praw

In [None]:
import pandas as pd

# Matplot
import matplotlib.pyplot as plt
%matplotlib inline

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Keras
import keras
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint

# Utility
import re
import numpy as np
import os
import time

# Praw
import praw
import datetime
import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, DateTime, Float
from sqlalchemy.orm import sessionmaker

# Tweepy
import tweepy

# NLTK (stop words)
import nltk
from nltk.corpus import stopwords
nltk.download("stopwords")
stop_words = set(stopwords.words('english'))

## Sentiment Analysis Model

Define Network Parameters

In [None]:
#NETWORK PARAMETERS
# Text removed before tokenizing: @mentions, URLs, and any run of characters
# that is not a letter or digit. Written as a raw string so `\S` reaches the
# regex engine as-is instead of being an invalid Python string escape.
TEXT_CLEANING_RE = r"@\S+|https?:\S+|http?:\S+|[^A-Za-z0-9]+"

# KERAS
SEQUENCE_LENGTH = 300  # maxlen handed to pad_sequences
EPOCHS = 10
BATCH_SIZE = 1024      # used for both model.fit and model.evaluate

# SENTIMENT
POSITIVE = "POSITIVE"
NEGATIVE = "NEGATIVE"
NEUTRAL = "NEUTRAL"
# (a, b): scores <= a are labelled NEGATIVE, scores >= b POSITIVE, the rest NEUTRAL.
SENTIMENT_THRESHOLDS = (0.4, 0.7)

Define Helper Functions

In [None]:
def map_sentiment(label):
  """Translate a numeric dataset label into its sentiment name.

  `label` is coerced with int(); 0 maps to "NEGATIVE" and 4 to "POSITIVE".
  Any other value raises KeyError.
  """
  names_by_code = {0: "NEGATIVE", 4: "POSITIVE"}
  code = int(label)
  return names_by_code[code]

def clean(text, stem=False):
  """Normalise a piece of text for the tokenizer.

  The input is converted to str, lower-cased, every match of TEXT_CLEANING_RE
  is replaced by a space, and the remaining tokens are re-joined with single
  spaces. `stem` is accepted but not used.
  """
  lowered = str(text).lower()
  stripped = re.sub(TEXT_CLEANING_RE, ' ', lowered).strip()
  # split() with no argument collapses the runs of spaces left by the substitution
  return " ".join(stripped.split())

def preprocess_data(path='training.1600000.processed.noemoticon.csv',
                    test_size=0.2, random_state=42):
  """Load the labelled tweet CSV and return (df_train, df_test).

  Args:
    path: headerless CSV with six columns, read as ISO-8859-1 and named
      target, ids, date, flag, user, text.
    test_size: fraction of rows placed in the test frame.
    random_state: seed passed to train_test_split so the split is repeatable.

  Returns:
    Two DataFrames with columns `target` (mapped through map_sentiment) and
    `text` (passed through clean).
  """
  raw = pd.read_csv(path,
                    encoding="ISO-8859-1",
                    names=["target", "ids", "date", "flag", "user", "text"]
                    )
  # Work on an explicit copy so the column assignments below never write
  # through to (or warn about) a slice of the raw frame.
  train = raw.loc[:, ["target", "text"]].copy()
  train["target"] = train["target"].apply(map_sentiment)
  train["text"] = train["text"].apply(clean)
  df_train, df_test = train_test_split(train, test_size=test_size,
                                       random_state=random_state)
  return df_train, df_test

def tokenize(df_train):
  """Fit a Keras Tokenizer on the `text` column of `df_train`.

  Returns (tokenizer, vocab_size), where vocab_size is the number of entries
  in tokenizer.word_index plus one.
  """
  fitted = Tokenizer()
  fitted.fit_on_texts(df_train.text)
  return fitted, len(fitted.word_index) + 1

def data2array(tokenizer, train_df=None, test_df=None):
  """Turn the train/test frames into padded sequences and encoded labels.

  Args:
    tokenizer: fitted tokenizer exposing texts_to_sequences.
    train_df: frame with `text` and `target` columns; when omitted the
      notebook-level `df_train` is used (the original behaviour).
    test_df: same for the test split; defaults to the notebook-level `df_test`.

  Returns:
    (x_train, x_test, y_train, y_test). The x arrays are padded to
    SEQUENCE_LENGTH; the y arrays are LabelEncoder codes reshaped to (-1, 1).
  """
  # Fall back to the globals only when the caller did not pass frames explicitly.
  if train_df is None:
    train_df = df_train
  if test_df is None:
    test_df = df_test

  x_train = pad_sequences(tokenizer.texts_to_sequences(train_df.text),
                          maxlen=SEQUENCE_LENGTH)
  x_test = pad_sequences(tokenizer.texts_to_sequences(test_df.text),
                         maxlen=SEQUENCE_LENGTH)

  # The encoder is fitted on the training labels only and reused for the test labels.
  encoder = LabelEncoder()
  encoder.fit(train_df.target.tolist())
  y_train = encoder.transform(train_df.target.tolist()).reshape(-1, 1)
  y_test = encoder.transform(test_df.target.tolist()).reshape(-1, 1)
  return x_train, x_test, y_train, y_test

def get_callbacks(path):
  """Build the list of training callbacks.

  Returns, in order: a ReduceLROnPlateau watching val_loss, an EarlyStopping
  watching val_accuracy, and a ModelCheckpoint that writes to `path` only when
  val_loss improves.
  """
  lr_schedule = ReduceLROnPlateau(monitor='val_loss', patience=3, cooldown=0)
  early_stop = EarlyStopping(monitor='val_accuracy', min_delta=1e-4, patience=5)
  checkpoint = ModelCheckpoint(path, monitor='val_loss', save_best_only=True)
  return [lr_schedule, early_stop, checkpoint]

def score(model,x_test,y_test):
  """Evaluate `model` on the test arrays and print its accuracy and loss.

  Element 0 of the evaluate() result is printed as the loss and element 1 as
  the accuracy. Nothing is returned.
  """
  results = model.evaluate(x_test, y_test, batch_size=BATCH_SIZE)
  print()
  print("ACCURACY:", results[1])
  print("LOSS:", results[0])

def plot_history(history):
  """Plot training vs. validation accuracy and loss per epoch.

  `history` must expose a `.history` mapping with the keys 'accuracy',
  'val_accuracy', 'loss' and 'val_loss'. Accuracy goes on the current figure,
  loss on a new one.
  """
  metrics = history.history
  train_acc, valid_acc = metrics['accuracy'], metrics['val_accuracy']
  train_loss, valid_loss = metrics['loss'], metrics['val_loss']
  epoch_axis = range(len(train_acc))

  # Accuracy curves
  plt.plot(epoch_axis, train_acc, 'b', label='Training accuracy')
  plt.plot(epoch_axis, valid_acc, 'r', label='Validation accuracy')
  plt.title('Training and Validation accuracy')
  plt.legend()

  # Loss curves on their own figure
  plt.figure()
  plt.plot(epoch_axis, train_loss, 'b', label='Training loss')
  plt.plot(epoch_axis, valid_loss, 'r', label='Validation loss')
  plt.title('Training and Validation loss')
  plt.legend()
  plt.show()

def decode_sentiment(score, include_neutral=True):
    """Map a model score to a sentiment label.

    Without the neutral class, scores below 0.5 are NEGATIVE and everything
    else POSITIVE. With it, scores at or below SENTIMENT_THRESHOLDS[0] are
    NEGATIVE, scores at or above SENTIMENT_THRESHOLDS[1] are POSITIVE, and the
    rest are NEUTRAL.
    """
    if not include_neutral:
        return NEGATIVE if score < 0.5 else POSITIVE

    if score <= SENTIMENT_THRESHOLDS[0]:
        return NEGATIVE
    if score >= SENTIMENT_THRESHOLDS[1]:
        return POSITIVE
    return NEUTRAL

def predict(text, include_neutral=True):
  """Score one piece of text with the notebook-level `model` and `tokenizer`.

  Args:
    text: raw text; it is passed through clean() before tokenizing.
    include_neutral: forwarded to decode_sentiment.

  Returns:
    dict with "label" (sentiment name), "score" (Python float) and
    "elapsed_time" (seconds spent in this call).
  """
  start_at = time.time()
  cleaned = clean(text)
  padded = pad_sequences(tokenizer.texts_to_sequences([cleaned]),
                         maxlen=SEQUENCE_LENGTH)
  # model.predict returns an array; pull out the single value as a real scalar
  # so the threshold comparisons and float() never operate on an ndarray.
  raw_output = model.predict([padded])
  score = float(np.asarray(raw_output).ravel()[0])
  label = decode_sentiment(score, include_neutral=include_neutral)
  return {"label":label, "score": score,
          "elapsed_time": time.time()-start_at}


Load the training data and preprocess it into a form that can be fed to our model

In [None]:
# Unpack the training archive; presumably it holds the CSV that preprocess_data() reads — TODO confirm.
!unzip training_data.zip

In [None]:
# Read, label-map and clean the CSV, then split it into train and test frames.
df_train, df_test = preprocess_data()

In [None]:
# Fit the tokenizer on the training text only; vocab_size is len(word_index) + 1.
tokenizer,vocab_size = tokenize(df_train)

In [None]:
# Padded integer sequences (x_*) and encoded labels of shape (-1, 1) (y_*).
x_train,x_test,y_train,y_test = data2array(tokenizer)

Load our pretrained Model

In [None]:
# Load the model stored in 'model_weights.h5' and compile it for binary
# classification with the Adam optimizer, tracking accuracy.
model = keras.models.load_model('model_weights.h5')
model.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])

Train a new Model and provide path where you want model weights to be saved

In [None]:
# File that the ModelCheckpoint callback writes to whenever val_loss improves.
save_weights = 'model_weights_2.h5'
callbacks = get_callbacks(save_weights)

In [None]:
# Train the model; validation_split=0.1 sets aside a tenth of the training
# arrays for the val_* metrics that the callbacks monitor.
history = model.fit(x_train,y_train,
                    batch_size=BATCH_SIZE,
                    epochs=EPOCHS,
                    validation_split= 0.1,
                    verbose=1,
                    callbacks=callbacks
                    )

In [None]:
# NOTE(review): the model was already compiled with these same settings before
# fit(); this second compile looks redundant — confirm it is still needed.
model.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])

In [None]:
# Print accuracy and loss on the held-out test arrays.
score(model,x_test,y_test)

In [None]:
# Plot the accuracy and loss curves recorded by model.fit.
plot_history(history)

In [None]:
# Smoke test: the cell output is the dict with label, score and elapsed_time.
predict("i can't enjoy this")

## Cross-Platform Analysis

Authenticate with the Reddit API and provide the database URI

In [None]:
# Credentials come from environment variables so they never have to be typed
# into (and saved with) the notebook. Each falls back to "" when unset, which
# is the placeholder value this cell originally contained.
reddit = praw.Reddit(client_id=os.environ.get("REDDIT_CLIENT_ID", ""),
                     client_secret=os.environ.get("REDDIT_CLIENT_SECRET", ""),
                     password=os.environ.get("REDDIT_PASSWORD", ""),
                     user_agent=os.environ.get("REDDIT_USER_AGENT", ""),
                     username=os.environ.get("REDDIT_USERNAME", ""))

In [None]:
# The connection string contains the database user and password, so it is read
# from the environment instead of being stored in the notebook.
DATABASE_URI = os.environ.get("DATABASE_URL", "")
if not DATABASE_URI:
  raise RuntimeError("Set the DATABASE_URL environment variable to the PostgreSQL connection URI")
# SQLAlchemy 1.4+ no longer accepts the legacy "postgres://" scheme; rewrite it
# to "postgresql://" so URIs in the old form keep working.
if DATABASE_URI.startswith("postgres://"):
  DATABASE_URI = "postgresql://" + DATABASE_URI[len("postgres://"):]
engine = create_engine(DATABASE_URI)
Base = declarative_base()

Define schema of our SQL table

In [None]:
class Comment(Base):
  """ORM mapping for one scored comment, stored in the 'comments' table."""
  __tablename__ = 'comments'
  id = Column(Integer, primary_key=True)
  # Comment text; required. Note the column shares its name with the class.
  Comment = Column(String, nullable=False)
  # Timestamp of the comment.
  Created_at = Column(DateTime)
  # Sentiment label (at most 10 characters); required.
  Sentiment = Column(String(10), nullable=False)
  # Numeric score the label was derived from.
  SentimentScore = Column(Float)
  # Name of the platform the comment is about (at most 20 characters); required.
  Platform = Column(String(20),nullable=False)
  def __repr__(self):
    """Short debug representation showing timestamp, label and score."""
    return f"<Comment(created_at={self.Created_at}, sentiment={self.Sentiment}, sentiment_score={self.SentimentScore})>"

In [None]:
# Session factory bound to the engine, and the single session used to insert rows.
Session = sessionmaker(bind=engine)
session = Session()

Stream the mentioned subreddit and capture any new comments posted there. These comments are then added to our online database

In [None]:
# Subreddit to stream; its lower-cased name is also stored in the Platform column.
platform = 'netflix'

In [None]:
# Stream comments from the subreddit and store each one with its predicted
# sentiment. The stream is open-ended, so this cell runs until interrupted.
comments = reddit.subreddit(platform).stream.comments(skip_existing=True)
for added, comment in enumerate(comments, start=1):
  # Named `result` rather than `score` so the score() helper is not overwritten.
  result = predict(comment.body)
  row = Comment(Comment = comment.body,
                Created_at = datetime.datetime.fromtimestamp(comment.created),
                Sentiment = result['label'],
                SentimentScore = result['score'],
                Platform = str(platform).lower()
                )
  session.add(row)
  try:
    session.commit()
  except Exception:
    # Leave the session usable for a retry instead of stuck in a failed transaction.
    session.rollback()
    raise
  print('Number of comments added: ',added)

## Genre-wise Analysis

Define Helper Functions

In [None]:
def get_tweets(search,number,place):
  """Return an iterator over up to `number` English tweets matching `search`.

  The query is `search` followed by '--place:<place>'; the search itself goes
  through the notebook-level `api` object.
  """
  # NOTE(review): the query has no space before '--place:' — confirm that this
  # is the operator form the Twitter search endpoint expects.
  query = search + '--place:%s' % place
  cursor = tweepy.Cursor(api.search, q=query, lang='en')
  return cursor.items(number)

Authenticate twitter api

In [None]:
# Twitter credentials are read from environment variables so they are never
# saved with the notebook. Each falls back to "" when unset, which is the
# placeholder value this cell originally contained.
auth = tweepy.OAuthHandler(os.environ.get("TWITTER_CONSUMER_KEY", ""),
                           os.environ.get("TWITTER_CONSUMER_SECRET", ""))
auth.set_access_token(os.environ.get("TWITTER_ACCESS_TOKEN", ""),
                      os.environ.get("TWITTER_ACCESS_TOKEN_SECRET", ""))
# wait_on_rate_limit=True: presumably makes the client wait rather than fail
# when Twitter's rate limit is reached — confirm against the tweepy docs.
api = tweepy.API(auth,wait_on_rate_limit=True)

In [None]:
# Every column is read as str; the collection loop uses the place_id and Country columns.
countries = pd.read_excel('countries_with_place_ids.xlsx',dtype='str')

Collect tweets for movies in a specific genre for all countries mentioned in our Excel file

In [None]:
# Genre label recorded with every tweet, and the per-query tweet cap.
genre = 'Sci-Fi'
number = 20

# Movie titles are taken from the first column of the spreadsheet.
sci_fi = pd.read_excel('SciFiMovieList.xlsx')
movie_list = list(sci_fi.iloc[:,0])

In [None]:
# One entry per (movie, country) pair. Despite the name, tweet_dict is a list
# of dicts with the keys 'movie', 'country' and 'tweets'.
tweet_dict = []
for movie in movie_list:
  # `i` is a module-level name and keeps its last value after this cell finishes.
  for i in range(len(countries)):
    # get_tweets returns the cursor's item iterator, not a list of tweets.
    tweets = get_tweets(movie,number,countries.place_id[i])
    tweet_dict.append({
        'movie':movie,
        'country':countries.Country[i],
        'tweets':tweets   
    }
                      )

In [None]:
# Accumulators appended to by the tweet-collection loop and read when
# genre_data is built. The names have to match the ones those cells use.
tweet_genre = []
tweet_movie = []
tweet_country = []
tweet_text = []
tweet_user_age = []

In [None]:
start = time.time()
# Start from empty accumulators so that re-running this cell never duplicates rows.
tweet_genre, tweet_movie, tweet_country, tweet_text, tweet_user_age = [], [], [], [], []
# Walk every entry from the beginning; the loop no longer depends on whatever
# value a previous cell happened to leave in `i`.
for position, entry in enumerate(tweet_dict):
  # Materialise the tweets and store the list back on the entry, so a second
  # run of this cell still sees them instead of an already-consumed iterator.
  entry['tweets'] = list(entry['tweets'])
  for tweet in entry['tweets']:
    tweet_genre.append(genre)
    tweet_movie.append(entry['movie'])
    tweet_country.append(entry['country'])
    tweet_text.append(tweet.text)
    tweet_user_age.append(tweet.user.created_at)
  print(len(tweet_dict)-position-1,'left')
print(time.time()-start)

Determine the sentiment of each collected tweet. The results are assembled into a pandas DataFrame and saved to an Excel spreadsheet

In [None]:
# Assemble the collected fields into one frame, one row per tweet.
genre_data = pd.DataFrame()
genre_data['Genre'] = pd.Series(tweet_genre)
genre_data['Movie'] = pd.Series(tweet_movie)
genre_data['Country'] = pd.Series(tweet_country)
genre_data['Tweet'] = pd.Series(tweet_text)
genre_data['TwitterAge'] = pd.Series(tweet_user_age)

# Run the model once per tweet and reuse the result for both columns, rather
# than predicting every tweet twice.
predictions = [predict(tweet) for tweet in genre_data['Tweet']]
genre_data['Sentiment'] = [p['label'] for p in predictions]
genre_data['SentimentScore'] = [p['score'] for p in predictions]

In [None]:
# Show the assembled frame as the cell output.
genre_data

In [None]:
# Save the scored tweets; index=False keeps the DataFrame index out of the sheet.
genre_data.to_excel('SciFi_data.xlsx',index=False)

## User Retention Analysis (13 Reasons Why)



Authenticate reddit api

In [None]:
# Credentials come from environment variables so they never have to be typed
# into (and saved with) the notebook. Each falls back to "" when unset, which
# is the placeholder value this cell originally contained.
reddit = praw.Reddit(client_id=os.environ.get("REDDIT_CLIENT_ID", ""),
                     client_secret=os.environ.get("REDDIT_CLIENT_SECRET", ""),
                     password=os.environ.get("REDDIT_PASSWORD", ""),
                     user_agent=os.environ.get("REDDIT_USER_AGENT", ""),
                     username=os.environ.get("REDDIT_USERNAME", ""))

In [None]:
# Parallel lists, appended to in lockstep by the collection cell: comment
# text, creation time, and a "<season>.<episode>" tag.
comment_list = []
created_at_list = []
episode = []

Collect comments from reddit submissions meant for discussing 13 Reasons why

In [None]:
submission_url = 'https://www.reddit.com/r/13ReasonsWhy/comments/cuazdb/s3ep_13_let_the_dead_bury_the_dead/?utm_source=share&utm_medium=ios_app'
season = 3
episode_no = 13
submission = reddit.submission(url=submission_url)
# The sort order has to be chosen before the comments are first accessed;
# setting it after they have been fetched has no effect on what is returned.
submission.comment_sort = "top"
# limit=0 drops the "load more comments" placeholders instead of expanding them.
submission.comments.replace_more(limit=0)
comments = submission.comments.list()
# Every comment from this thread gets the same "<season>.<episode>" tag.
episode_tag = str(season)+'.'+str(episode_no)
for comment in comments:
  comment_list.append(comment.body)
  created_at_list.append(datetime.datetime.fromtimestamp(comment.created))
  episode.append(episode_tag)

Determine Sentiment score for collected data and save this to an Excel Spreadsheet

In [None]:
# Assemble the collected fields into one frame, one row per comment.
reddit_comments = pd.DataFrame()
reddit_comments['Comment'] = pd.Series(comment_list)
reddit_comments['Created_at'] = pd.Series(created_at_list)
reddit_comments['Episode'] = pd.Series(episode)

# Run the model once per comment and reuse the result for both columns, rather
# than predicting every comment twice.
predictions = [predict(body) for body in reddit_comments['Comment']]
reddit_comments['Sentiment'] = [p['label'] for p in predictions]
reddit_comments['SentimentScore'] = [p['score'] for p in predictions]

In [None]:
# Show the assembled frame as the cell output.
reddit_comments

In [None]:
# Save the scored comments; no index=False here, so the DataFrame index is written as well.
reddit_comments.to_excel('13ReasonsWhy.xlsx')

## Analysis using Word Frequency

In [None]:
def get_tuple(start,end,n):
  """Return consecutive length-`n` windows over the integers start..end.

  Window k holds elements k .. k+n-1 of the list [start, ..., end], and there
  are end-n+1 windows. For start=0 this yields every run of n consecutive
  indices within 0..end-1, e.g. get_tuple(0, 5, 2) ->
  [[0, 1], [1, 2], [2, 3], [3, 4]].
  """
  values = list(range(start, end + 1))
  return [[values[pos] for pos in range(first, first + n)]
          for first in range(end - n + 1)]

def get_word_freq(ngram,df_new):
  """Plot the 25 most frequent `ngram`-word phrases in df_new's Comment column.

  Rows are read by label 0..len(df_new)-1, so `df_new` needs a fresh integer
  index. A window of `ngram` consecutive tokens is counted only when none of
  its tokens is in `stop_words`; each counted phrase is stored with a trailing
  space.
  """
  phrases = []
  for row in range(len(df_new)):
    tokens = clean(df_new.loc[row].Comment).split(' ')
    for window in get_tuple(0, len(tokens), ngram):
      # Drop stop words; a window that loses a token no longer has `ngram`
      # words and is rejected by the length check below.
      kept = [tokens[pos] for pos in window if tokens[pos] not in stop_words]
      phrase = ''.join(token + ' ' for token in kept)
      if len(phrase.split()) == ngram:
        phrases.append(phrase)
  counts = nltk.FreqDist(phrases)
  # Rebuild the distribution from a plain dict copy of its counts before plotting.
  counts = nltk.FreqDist(dict(counts.items()))
  counts.plot(25, cumulative=False)

In [None]:
# Comments with their sentiment labels, as exported earlier.
df = pd.read_excel('/content/sentiment_allplatforms_reddit.xlsx')

# Domain-specific words to ignore in addition to the NLTK English stop words.
filter_words = ['hulu','netflix', 'season', 'black','reality', 'fun', 'prime', 'movie', 'show', 'get', 'would', 'one', 'streaming', 'also']
stop_words.update(filter_words)

In [None]:
# For each sentiment label, plot the most frequent 1-, 2- and 3-grams.
sentiments = ['POSITIVE','NEGATIVE','NEUTRAL']
for sentiment in sentiments:
  # The filtered frame does not depend on n_gram, so build it once per label.
  # reset_index returns a new frame with labels 0..n-1, which get_word_freq
  # relies on for its .loc[i] lookups.
  df_new = df.query(f'Sentiment=="{sentiment}"').reset_index(drop=True)
  for n_gram in [1,2,3]:
    print(f'{sentiment}  {n_gram}-Grams')
    get_word_freq(n_gram,df_new)