In [None]:
from flask import Flask, render_template, request, url_for
import sys, ast, os, json
import pandas as pd
import matplotlib.pyplot as plt
from textblob import TextBlob
import seaborn as sns
import numpy as np
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import itertools
import collections
from nltk import bigrams
import networkx as nx
import re
import nltk
import plotly
import plotly.express as px
import plotly.graph_objects as go

In [None]:
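# One-time setup: uncomment and run these once on a fresh environment.
# clean_tweet() below relies on this NLTK data and fails with a LookupError without it.
# nltk.download('punkt')      # tokenizer data for nltk.word_tokenize
# nltk.download('stopwords')  # English stop-word list
# nltk.download('wordnet')    # lexicon used by WordNetLemmatizer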

In [None]:
# Flask application object; the route defined further down is registered on it.
app = Flask(__name__)
# Relative folder for static images. Within this notebook it is only referenced
# by the commented-out PNG filenames in home_page().
app.config.update(IMAGES_PATH=os.path.join('static', 'images'))

In [None]:
def clean_tweet(tweet):
    """Normalise a raw tweet into a space-separated string of lemmatized tokens.

    Pipeline: strip t.co short links, replace runs of non-ASCII characters with
    a space, lowercase, drop apostrophes (and possessive "'s"), turn every
    remaining non-alphanumeric character into a space, tokenize, remove English
    stop words, lemmatize.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        Cleaned tokens joined by single spaces ("" when nothing survives).

    Notes
    -----
    Requires the NLTK 'punkt', 'stopwords' and 'wordnet' data to be installed.
    """
    # Step 1 - drop t.co short links. The dot is escaped so that only the real
    # "t.co" host matches (an unescaped '.' would match any character).
    without_links = re.sub(r'https?://t\.co/[a-zA-Z0-9]{10}', ' ', tweet)

    # Step 2 - replace runs of non-ASCII characters with a space and lowercase.
    ascii_lower = re.sub(r'[^\x00-\x7F]+', ' ', without_links).lower()

    # Step 3 - drop possessives first, then any remaining apostrophes.
    no_apostrophes = ascii_lower.replace("'s", "").replace("'", "")

    # Steps 4/5 - keep letters and digits only. Dashes are covered here too,
    # since every non-alphanumeric character becomes a space.
    alnum_only = re.sub(r'[^a-zA-Z0-9]', ' ', no_apostrophes)

    # Step 6 - tokenize.
    tokens = nltk.word_tokenize(alnum_only)

    # Steps 7/8 - remove stop words, then lemmatize.
    # Stop words are filtered BEFORE lemmatizing: the noun-mode lemmatizer
    # mangles some of them (e.g. "was" -> "wa"), after which they would no
    # longer match the stop list. A second check after lemmatizing keeps the
    # original behaviour of dropping lemmas that are themselves stop words.
    lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
    stop_words = set(nltk.corpus.stopwords.words('english'))

    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        lemma = lemmatizer.lemmatize(token)
        # Step 9 - skip empty strings and stop-word lemmas.
        if lemma and lemma not in stop_words:
            kept.append(lemma)

    return " ".join(kept)

def sent_to_list(sentence):
    """Split a cleaned sentence into its list of words.

    Splits on whitespace, so an empty (or all-blank) sentence yields ``[]``
    rather than ``['']``. This keeps empty-string "words" out of the token
    lists built from the result.

    Parameters
    ----------
    sentence : str
        Space-separated tokens, e.g. the output of ``clean_tweet``.

    Returns
    -------
    list[str]
    """
    return sentence.split()

def sentiment_analyze(polarity):
    """Map a polarity score to a sentiment label.

    Scores strictly between -0.1 and 0.1 are 'neutral', scores of 0.1 or more
    are 'positive', and everything else is 'negative'.
    """
    if -0.1 < polarity < 0.1:
        return 'neutral'
    return 'positive' if polarity >= 0.1 else 'negative'

In [None]:
# --- Load tweets and keep only 2021 ------------------------------------------
# 'Date' is split on '/': the third field is used as the year and the first as
# the month (presumably M/D/YYYY - confirm against final_tweets.csv).
tweets_raw = pd.read_csv('final_tweets.csv')
date_fields = tweets_raw['Date'].str.split('/')

# reset_index gives df a fresh 0..n-1 index, so frames built from plain lists
# further down line up with it row for row. It also returns a new object, so
# the column assignments below do not write into a slice of tweets_raw.
df = tweets_raw[date_fields.str[2] == '2021'].reset_index(drop=True)

# Store the month as an integer so grouping and plotting order is 1, 2, ..., 12
# rather than the string order '1', '10', '11', '12', '2', ...
df['Date'] = df['Date'].str.split('/').str[0].astype(int)

# --- Clean text ----------------------------------------------------------------
df['Tweet'] = df['Tweet'].apply(clean_tweet)
df['Words_list'] = df['Tweet'].apply(sent_to_list)

# --- Sentiment -----------------------------------------------------------------
sentiment_objs = [TextBlob(tweet) for tweet in df['Tweet']]
all_senti = [blob.sentiment.polarity for blob in sentiment_objs]
sentiment_vals = [[score, str(blob)] for score, blob in zip(all_senti, sentiment_objs)]
sentiment_df = pd.DataFrame(sentiment_vals, columns=["polarity", "tweet"])

# Polarity buckets; the thresholds match sentiment_analyze().
polarity = sentiment_df['polarity'].astype('float')
neg = polarity[polarity <= -0.1]
pos = polarity[polarity >= 0.1]
neut = polarity[(polarity > -0.1) & (polarity < 0.1)]

# Assign by position (.to_numpy()), not by index label, so every tweet gets
# the label computed from its own text.
df['Sentiment'] = sentiment_df['polarity'].apply(sentiment_analyze).to_numpy()

# --- Tweets per month and sentiment --------------------------------------------
df1 = df
polarity_by_month = (
    df1.groupby(['Date', 'Sentiment'])['Tweet']
    .count()
    .reset_index()
    .rename(columns={'Tweet': 'Tweet_Count'})
)

# --- Tweets per month and keyword ----------------------------------------------
keywords_by_date = df.groupby(['Date', 'Keyword'])['Keyword'].count()

# Month and Keyword come straight from the groupby index instead of being
# reconstructed from row parity. Labels therefore stay correct whatever the
# sort order of the keywords, and also when a month lacks one of them.
df2 = (
    keywords_by_date
    .rename('Keyword_Count')
    .rename_axis(['Month', 'Keyword'])
    .reset_index()
)

# --- Word embeddings and bigrams -----------------------------------------------
sentences = df['Words_list'].tolist()
w2v_model = Word2Vec()
w2v_model.build_vocab(sentences)
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

bigram_terms = [list(bigrams(words)) for words in sentences]

In [None]:
# import plotly.figure_factory as ff

# sentiment_objs = [TextBlob(tweet) for tweet in df['Tweet']]
# all_senti =[tweet.sentiment.polarity for tweet in sentiment_objs]
# pos_count = 0
# neg_count = 0
# neu_count = 0

# sentiment_vals = [[tweet.sentiment.polarity, str(tweet)] for tweet in sentiment_objs] 

# sentiment_df = pd.DataFrame(sentiment_vals, columns=["polarity", "tweet"])
# x = sentiment_df['polarity'].values
# fig = sns.displot(x, color='red', kde=True)
# mean = sentiment_df['polarity'].mean()
# plt.axvline(mean, 0, 1, color = 'blue')

# plt.ylabel('Frequency')
# plt.xlabel('Polarity')

# plt.axis([-1, 1, 0, 400])
# plt.savefig('Frequency-Polarity.png')

In [None]:
# def tsne_scatter(model, word, list_names):
#     arrays = np.empty((0,100), dtype='f')
#     word_labels = [word]
#     color_ls = ['brown']
    
    
#     arrays = np.append(arrays, model.wv.__getitem__([word]), axis=0)
#     close_words = model.wv.most_similar([word])
    
#     for word_score in close_words:
#         word_vec = model.wv.__getitem__([word_score[0]])
#         word_labels.append(word_score[0])
#         color_ls.append('blue')
#         arrays = np.append(arrays, word_vec, axis=0)
        
#     for word in list_names:
#         word_vec = model.wv.__getitem__([word])
#         word_labels.append(word)
#         color_ls.append('magenta')
#         arrays = np.append(arrays, word_vec, axis=0)
    
#     reduc = PCA().fit_transform(arrays)
    
#     np.set_printoptions(suppress=True)
#     Y = TSNE(n_components=2, random_state=0, perplexity=10).fit_transform(reduc)
    
#     df3 = pd.DataFrame({'x':[x for x in Y[:, 0]],
#                        'y':[y for y in Y[:,1]],
#                        'Words': word_labels,
#                        'color': color_ls})
#     fig, _ = plt.subplots()
#     fig.set_size_inches(10,10)
    
#     p = sns.regplot(data=df3,
#                    x='x',
#                    y='y',
#                    fit_reg=False,
#                    marker='x',
#                    scatter_kws={'s':40,
#                                'facecolors':df3['color']}
#                    )
    
#     for line in range(0, df3.shape[0]):
#         p.text(df3['x'][line],
#               df3['y'][line],
#               ' ' + df3['Words'][line].title(),
#               horizontalalignment='left',
#               verticalalignment='bottom', size = 'medium',
#               color = df3['color'][line],
#               weight='normal',
#               ).set_size(15)
    
#     plt.xlim(Y[:, 0].min() - 50, Y[:, 0].max()+50)
#     plt.ylim(Y[:, 1].min() - 50, Y[:, 1].max()+50)
    
#     plt.title("t-SNE visulization")
#     plt.savefig('t-sne.png')

In [None]:
# def plot_bigram(bigram_terms):
#     bigrams = list(itertools.chain(*bigram_terms))
#     bigrams_count = collections.Counter(bigrams)

#     bg_df = pd.DataFrame(bigrams_count.most_common(19), columns=['Bigrams', 'Count'])
#     bg_df = bg_df.drop(labels=[1,2,3,10], axis=0)

#     d = bg_df.set_index('Bigrams').T.to_dict('dict_records')

#     G = nx.Graph()

#     for k in d:
#         G.add_edge(k[0],k[1],weight=(d[k]['Count']*5))

#     fig, ax = plt.subplots(figsize = (12,10))
#     pos = nx.spring_layout(G, k=4)

#     nx.draw_networkx(G, pos,
#                     font_size=10,
#                     width=2,
#                     edge_color='brown',
#                     node_color='#008080',
#                     with_labels=False,
#                     ax=ax)

#     for key, value in pos.items():
#         x,y = value[0]+.135, value[1]+.045
#         ax.text(x,y,
#               s=key,
#               bbox=dict(facecolor='black', alpha=0.25),
#               horizontalalignment='center', fontsize=10)

#     plt.title('Most occuring bigrams in the Tweets')
#     plt.savefig('bigrams.png')

In [None]:
# tsne_scatter(w2v_model, 'covid19', ['corona', 'infection'])

In [None]:
# plot_bigram(bigram_terms)

In [None]:
# Bar chart: number of tweets in each sentiment bucket.
# x and y are standalone 3-element lists, so no data_frame is passed. The tweet
# frame has a different length and none of its columns are used here.
tweet_polarity = px.bar(
    x=['Positive', 'Neutral', 'Negative'],
    y=[len(pos), len(neut), len(neg)],
    labels={'x': 'Distribution of Sentiments', 'y': 'Number of Tweets'},
)

# Line chart: tweet count per 'Date' value, one line per sentiment label.
tw_polarity = px.line(
    polarity_by_month,
    x='Date',
    y='Tweet_Count',
    color='Sentiment',
    color_discrete_sequence=['brown', 'orange', 'green'],
)

# Line chart: keyword count per month, one line per keyword.
# The name `f` is kept because home_page() reads it.
f = px.line(
    df2,
    x='Month',
    y='Keyword_Count',
    color='Keyword',
    color_discrete_sequence=['brown', 'magenta'],
)

In [None]:
@app.route("/", methods=['GET', 'POST'])
def home_page():
    """Render the dashboard page with the three Plotly figures as JSON strings.

    Each figure is serialized with Plotly's JSON encoder and passed to the
    'blockcontent.html' template as graph1JSON, graph2JSON and graph3JSON.
    """
    figures = {
        'graph1JSON': tweet_polarity,
        'graph2JSON': tw_polarity,
        'graph3JSON': f,
    }
    serialized = {
        name: json.dumps(figure, cls=plotly.utils.PlotlyJSONEncoder)
        for name, figure in figures.items()
    }
    return render_template('blockcontent.html', **serialized)

In [None]:
# Start Flask's built-in server only when this code runs as the main module.
# Debug mode is explicitly disabled.
if __name__ == '__main__':
    app.run(debug=False)