# Read Necessary Files & Feature Engineering

In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
import pickle
import ast
import bitermplus as btm
from sklearn.feature_extraction import text

In [4]:
# Load the two India datasets used throughout this notebook:
#   whole_india_df    - its `translated` column fits the objective/subjective vectorizer.
#   filtered_india_df - its noun and text columns feed topic modelling and sentiment.
# Both paths are relative to the notebook's working directory.
whole_india_df = pd.read_csv("demo files/india-v1-preprocessed-overall.csv")
filtered_india_df = pd.read_csv('demo files/sentiment_labelled_india_NB.csv')

In [5]:
# For objective & subjective classification.
# Fit a default bag-of-words CountVectorizer on the `translated` column of the
# full dataset. `v` is reused later to vectorize the demo tweet before it is
# handed to the objective/subjective classifier.
# NOTE(review): presumably this has to reproduce the vocabulary the pickled
# classifier was trained with - confirm against the training notebook.
v = CountVectorizer()
X_whole_data = v.fit_transform(whole_india_df.translated)

In [6]:
#For topic modelling - BITERM
# Every `words_processed_noun` cell holds a Python literal (a sequence of
# strings) stored as text: parse it and flatten it into one space-separated
# document per tweet.
new_text_noun = [
    " ".join(ast.literal_eval(noun_repr))
    for noun_repr in filtered_india_df['words_processed_noun']
]

# Keep the flattened documents on the frame and expose them as `texts`.
filtered_india_df['new_text_noun'] = new_text_noun
texts = filtered_india_df['new_text_noun']

# Words passed as stop words when the biterm vocabulary is built
# (entries are kept exactly as they were, including repeats).
stop_words = [
    'vaccine', 'people', 'couid', 'lot', 'thing', 'amp', 'day', 'week',
    'time', 'year', 'vaccination', 'month', 'number', 'part', 'hour',
    'shit', 'person', 'go', 'pfizer', 'room', 'man', 'word', 'other',
    'point', 'today', 'way', 'yesterday', 'lot', 'one', 'need', 'love',
    'covidvaccine', 'use', 'bit', 'idiot', 'thank', 'shot', 'tomorrow',
    'dose', 'mask', 'life',
]

def get_dominant_df(p_zd, threshold=0.3):
    """Build a one-row topic-score table for a tweet and label its dominant topic.

    Parameters
    ----------
    p_zd : sequence of sequences of float
        Topic scores. Only the first row, ``p_zd[0]``, is used, and it must
        contain one score per topic name listed below (nine in total).
    threshold : float, default 0.3
        The highest rounded score must be strictly greater than this value
        for the tweet to be assigned a topic.

    Returns
    -------
    pandas.DataFrame
        A single row indexed ``"Tweet"`` with one column per topic (scores
        rounded to five decimals) plus a ``dominant_topic`` column holding
        either the best-scoring topic name or ``"Unclassified"``.

    Raises
    ------
    ValueError
        Raised by pandas when the number of scores in ``p_zd[0]`` differs
        from the number of topic names.
    """
    topicnames = ["Education", "Healthcare Sector", "Covid Cases Updates", "How to stay safe", "Travelling", "Economic/Political Impact",
                  "Vaccination Appointment", "Side Effects/Symptoms", "Undetermined"]

    # One row of scores, rounded before the threshold comparison so the
    # decision is made on the same values that are displayed.
    scores = np.round([list(p_zd[0])], 5)
    df = pd.DataFrame(scores, columns=topicnames, index=["Tweet"])

    if scores.max() > threshold:
        # There is exactly one row, so the flat argmax is the column position.
        df['dominant_topic'] = topicnames[int(scores.argmax())]
    else:
        df['dominant_topic'] = "Unclassified"

    return df

In [7]:
#For topic modelling - GSDMM
def get_dominant_df_gsdmm(a_list):
    """Turn one tweet's GSDMM scores into a labelled one-row table.

    ``a_list`` must be an iterable with one score per topic name below (six).
    The returned DataFrame has a single row indexed ``"Tweet"``, one column
    per topic holding the score rounded to five decimals, and a
    ``dominant_topic`` column naming the highest-scoring topic.
    """
    labels = ["Covid Cases Updates", "Economic/Political Impact", "Vaccination Appointment", "Travelling", "Side Effects/Symptoms", "Education"]

    rounded_row = np.round([list(a_list)], 5)
    table = pd.DataFrame(rounded_row, index=["Tweet"], columns=labels)

    # Column position of the best score in the only row.
    best_position = np.argmax(table.values, axis=1)[0]
    table['dominant_topic'] = labels[best_position]

    return table

In [8]:
# For sentiment classification.
# Tokens are runs of ASCII letters/digits; the vectorizer drops English stop
# words and counts unigrams only. `cv` is reused later to vectorize the demo
# tweet for the sentiment model.
# `.values.astype('U')` casts every cell of `new_text` to a unicode string
# before fitting.
# NOTE(review): presumably this has to mirror the vectorizer used when the
# pickled sentiment model was trained - confirm.
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(stop_words='english',ngram_range = (1,1),tokenizer = token.tokenize)
whole_data = cv.fit_transform(filtered_india_df['new_text'].values.astype('U'))

def get_sentiment(value):
    """Map a sentiment class label to its display name.

    ``0`` maps to ``"NEGATIVE"``, ``1`` maps to ``"NEUTRAL"`` and every other
    value maps to ``"POSITIVE"``.

    The label is compared numerically, so ints, floats, NumPy scalars and
    numeric strings are treated alike. The neutral branch used to compare
    against the string ``"1"`` only, which meant a numeric prediction of 1
    was reported as "POSITIVE".

    Parameters
    ----------
    value : int, float, str or NumPy scalar
        The predicted class label.

    Returns
    -------
    str
        One of ``"NEGATIVE"``, ``"NEUTRAL"`` or ``"POSITIVE"``.
    """
    try:
        label = float(value)
    except (TypeError, ValueError):
        # Non-numeric labels keep the historical fall-through result.
        return "POSITIVE"

    if label == 0:
        return "NEGATIVE"
    if label == 1:
        return "NEUTRAL"
    return "POSITIVE"

In [9]:
# Single demo tweet, wrapped in a list because the vectorizers' `transform`
# calls below are given it as a collection of documents. The topic-modelling
# section looks this exact text up in filtered_india_df["translated"], so it
# has to match a row there character for character.
tweet = ["so if you all keep asking us do rtpcr everytime, even after being fully vaccinated- what has point yes, you can get covid even after vaccination but if u have do test everytime u travel even within state, it is all never ending, money eating business loop."]

# Objective & Subjective Classification

In [10]:
# Load the pickled objective/subjective classifier into `model`.
# The context manager closes the file even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code - only load trusted files.
with open("pickles/india/india_subj.pickle", "rb") as model_saved:  #binary read
    model = pickle.load(model_saved)

In [11]:
# Vectorize the demo tweet with `v`, the vectorizer fitted on
# whole_india_df.translated, so it can be fed to the loaded classifier.
tweet_transformed = v.transform(tweet)

In [21]:
# Classify the vectorized tweet. `model` is rebound by later sections of this
# notebook, so this cell must run while it still refers to the
# objective/subjective classifier.
predicted_class = model.predict(tweet_transformed)
# Show the first (only) prediction, upper-cased for display.
print("The predicted class is: " + str(predicted_class[0]).upper())

# Topic Modelling

## Biterm

In [13]:
# Load the pickled biterm topic model into `model` (replacing whatever the
# name referred to before).
# The context manager closes the file even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code - only load trusted files.
with open("pickles/india/india_biterm.model", "rb") as model_saved:  #binary read
    model = pickle.load(model_saved)

In [23]:
# Pick the dataset row whose `translated` text equals the demo tweet, parse
# its stored noun list, and join the nouns into a single space-separated
# document (kept in a one-element list).
tweet_noun_strlist = [
    " ".join(
        ast.literal_eval(
            filtered_india_df.loc[
                filtered_india_df["translated"] == tweet[0],
                "words_processed_noun",
            ].iloc[0]
        )
    )
]
tweet_noun_strlist

In [22]:
# Build the biterm vocabulary from the noun documents (minus the custom stop
# words), then vectorize the demo tweet against that vocabulary.
# NOTE(review): presumably the vocabulary has to match the one the pickled
# model was fitted with - confirm.
X, vocabulary, vocab_dict = btm.get_words_freqs(texts, stop_words=stop_words)
new_docs_vec = btm.get_vectorized_docs(tweet_noun_strlist, vocabulary)
# Score the tweet with the loaded model; get_dominant_df reads the first row
# of the result as the per-topic scores.
p_zd = model.transform(new_docs_vec)
dominant_df = get_dominant_df(p_zd)
dominant_df

In [24]:
# `dominant_df` is indexed by the label "Tweet", not by integers, so select
# the single row by position with .iloc instead of relying on `series[0]`,
# whose integer-as-position fallback is deprecated in pandas.
print("The dominant topic is: " + str(dominant_df["dominant_topic"].iloc[0]))

## GSDMM

In [1]:
# model_saved = open("pickles/india_gsdmm.pickle", "rb") #binary read
# model = pickle.load(model_saved)
# model_saved.close()

In [2]:
# tweet_nouns_list = ast.literal_eval(filtered_india_df[filtered_india_df["translated"] == tweet[0]]["words_processed_noun"].iloc[0])
# tweet_nouns_list

In [3]:
# dominant_df = get_dominant_df_gsdmm(model.score(tweet_nouns_list))
# dominant_df

In [4]:
# print("The dominant topic is: " + str(dominant_df["dominant_topic"][0]))

# Sentiment Analysis

In [18]:
# Load the pickled sentiment classifier into `model` (replacing whatever the
# name referred to before).
# The context manager closes the file even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code - only load trusted files.
with open("pickles/india/india_sentiment.model", "rb") as model_saved:  #binary read
    model = pickle.load(model_saved)

In [19]:
# Vectorize the demo tweet with `cv`, the sentiment vectorizer fitted on
# filtered_india_df['new_text']; this overwrites the earlier `tweet_transformed`.
tweet_transformed = cv.transform(tweet)

In [25]:
# Predict the sentiment class of the vectorized tweet with the model loaded
# in this section.
predicted_class = model.predict(tweet_transformed)
# Translate the first (only) predicted label into its display name.
sentiment = get_sentiment(predicted_class[0])
print("The predicted class is: " + sentiment)