In [18]:
import pandas as pd
from textblob import TextBlob, Word, Blobber
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import numpy as np
import re, string, timeit
import wordninja
from sklearn.feature_extraction.text import CountVectorizer
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

In [19]:
emo_data=pd.read_csv(r"C:\Users\WELCOME\Downloads\data\text_emo_data.csv",index_col=False)

In [20]:
def get_top_n_words(corpus, n=None):
    vec = CountVectorizer(stop_words='english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]

In [21]:
disgust_text=emo_data[emo_data.overall_emo=='avgLexVal_disgust'].text.str.cat(sep=' ')

In [24]:
joy_text=emo_data[emo_data.overall_emo=='avgLexVal_joy'].text.str.cat(sep=' ')

In [29]:
print("Frequent Words disgust text")
common_words = get_top_n_words(emo_data[emo_data.overall_emo=='avgLexVal_disgust'].text, 20)
total=0
for word, freq in common_words:

    print(word, freq)
    
    total=total+freq
for word, freq in common_words:
    perc=(freq/len(disgust_text))*100
    print(word,str(perc)+"%")
df1 = pd.DataFrame(common_words, columns = ['ReviewText' , 'count'])
df1.groupby('ReviewText').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black', title='Top 20 words  from disgust corpus')

Frequent Words disgust text
covid 109
cough 43
rash 41
air 32
symptom 31
pollution 31
study 21
people 18
infection 17
disease 17
virus 16
viral 12
like 12
link 12
vaccine 12
cause 12
report 12
lung 12
patient 11
london 11
covid 0.6252150969370196%
cough 0.24664448778249398%
rash 0.23517265114144772%
air 0.1835493862567397%
symptom 0.17781346793621658%
pollution 0.17781346793621658%
study 0.12045428473098543%
people 0.10324652976941608%
infection 0.09751061144889298%
disease 0.09751061144889298%
virus 0.09177469312836985%
viral 0.06883101984627739%
like 0.06883101984627739%
link 0.06883101984627739%
vaccine 0.06883101984627739%
cause 0.06883101984627739%
report 0.06883101984627739%
lung 0.06883101984627739%
patient 0.06309510152575427%
london 0.06309510152575427%


In [30]:
print("Frequent Words joy text")
common_words = get_top_n_words(emo_data[emo_data.overall_emo=='avgLexVal_joy'].text, 20)
total=0
for word, freq in common_words:

    print(word, freq)
    
    total=total+freq
for word, freq in common_words:
    perc=(freq/len(joy_text))*100
    print(word,str(perc)+"%")
df1 = pd.DataFrame(common_words, columns = ['ReviewText' , 'count'])
df1.groupby('ReviewText').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black', title='Top 20 words  from joy corpus')

Frequent Words joy text
say 34
covid 28
test 25
people 24
big 13
travel 13
event 13
like 12
year 12
ask 12
child 12
free 12
make 11
bbc 11
pride 11
social 10
holiday 10
england 9
want 9
know 8
say 0.22957461174881838%
covid 0.18906144496961513%
test 0.1688048615800135%
people 0.16205266711681296%
big 0.08777852802160703%
travel 0.08777852802160703%
event 0.08777852802160703%
like 0.08102633355840648%
year 0.08102633355840648%
ask 0.08102633355840648%
child 0.08102633355840648%
free 0.08102633355840648%
make 0.07427413909520594%
bbc 0.07427413909520594%
pride 0.07427413909520594%
social 0.0675219446320054%
holiday 0.0675219446320054%
england 0.06076975016880487%
want 0.06076975016880487%
know 0.05401755570560432%


## Bigrams

In [31]:
def get_top_n_bigram(corpus, n=None):
    vec = CountVectorizer(ngram_range=(2, 2),stop_words='english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]

In [32]:
print("Frequent Bigrams for disgust corpus")
common_words = get_top_n_bigram(emo_data[emo_data.overall_emo=='avgLexVal_disgust'].text, 20)
total=0
for word, freq in common_words:

    print(word, freq)
    
    total=total+freq
for word, freq in common_words:
    perc=(freq/len(disgust_text))*100
    print(word,str(perc)+"%")
df3 = pd.DataFrame(common_words, columns = ['ReviewText' , 'count'])
df3.groupby('ReviewText').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black', title='Top 20 bigrams for disgust corpus')

Frequent Bigrams for disgust corpus
air pollution 31
covid rash 12
long covid 10
viral load 8
covid vaccine 7
pollution covid 7
covid infection 6
chronic cough 6
symptom covid 6
exposure air 6
college london 5
common symptom 5
persistent cough 5
lung disease 5
follow covid 5
rash covid 5
allergic reaction 5
study air 5
ongoing cough 4
covid symptom 4
air pollution 0.17781346793621658%
covid rash 0.06883101984627739%
long covid 0.05735918320523115%
viral load 0.045887346564184923%
covid vaccine 0.04015142824366181%
pollution covid 0.04015142824366181%
covid infection 0.034415509923138694%
chronic cough 0.034415509923138694%
symptom covid 0.034415509923138694%
exposure air 0.034415509923138694%
college london 0.028679591602615576%
common symptom 0.028679591602615576%
persistent cough 0.028679591602615576%
lung disease 0.028679591602615576%
follow covid 0.028679591602615576%
rash covid 0.028679591602615576%
allergic reaction 0.028679591602615576%
study air 0.028679591602615576%
ongoing co

In [33]:
print("Frequent Bigrams for joy corpus")
common_words = get_top_n_bigram(emo_data[emo_data.overall_emo=='avgLexVal_joy'].text, 20)
total=0
for word, freq in common_words:

    print(word, freq)
    
    total=total+freq
for word, freq in common_words:
    perc=(freq/len(joy_text))*100
    print(word,str(perc)+"%")
df3 = pd.DataFrame(common_words, columns = ['ReviewText' , 'count'])
df3.groupby('ReviewText').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black', title='Top 20 bigrams for joy corpus')

Frequent Bigrams for joy corpus
covid test 8
free covid 6
dame rachel 4
pcr test 4
channel islands 4
islands pride 4
covid case 4
tour operator 4
offer free 4
social prescribe 3
social prescribing 3
learn disability 3
big ask 3
year say 3
film tonight 3
social prescription 2
health wellbeing 2
play key 2
key role 2
nhs hop 2
covid test 0.05401755570560432%
free covid 0.04051316677920324%
dame rachel 0.02700877785280216%
pcr test 0.02700877785280216%
channel islands 0.02700877785280216%
islands pride 0.02700877785280216%
covid case 0.02700877785280216%
tour operator 0.02700877785280216%
offer free 0.02700877785280216%
social prescribe 0.02025658338960162%
social prescribing 0.02025658338960162%
learn disability 0.02025658338960162%
big ask 0.02025658338960162%
year say 0.02025658338960162%
film tonight 0.02025658338960162%
social prescription 0.01350438892640108%
health wellbeing 0.01350438892640108%
play key 0.01350438892640108%
key role 0.01350438892640108%
nhs hop 0.01350438892640108