In [None]:
import pandas as pd
from textblob import TextBlob

In [None]:
# Load only the four columns used by this notebook; rows the CSV parser
# cannot handle are skipped instead of aborting the read.
df = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/M2_MLSD/PPD/cleaned_resultats.csv',
    usecols=['lang', 'created_at', 'state', 'cleaned_texts'],
    on_bad_lines='skip',
)
df.shape

(509372, 4)

In [None]:
# Discard every row that has a missing value in any of the loaded columns.
df = df.dropna()
df.shape

(487655, 4)

In [None]:
# Keep English-language rows only; once filtered, the language column
# carries no information and is removed.
is_english = df['lang'] == 'en'
df = df.loc[is_english].drop(columns='lang')
df.shape

(449372, 3)

In [None]:
# Drop rows whose `state` value is the literal 'USA'.
# NOTE(review): presumably these rows have no state-level location — confirm.
df = df.loc[df['state'].ne('USA')]
df.shape

(380829, 3)

In [None]:
# Add a constant country column plus the first two '-'-separated fields of
# `created_at` as `year` and `month` (both kept as strings).
# `assign` returns a new frame, so nothing is written into the filtered slice
# produced by the previous cell (this is what raised SettingWithCopyWarning).
# NOTE(review): assumes `created_at` starts with 'YYYY-MM-' — confirm.
created_parts = df['created_at'].str.split('-')
df = df.assign(
    country='USA',
    year=created_parts.str[0],
    month=created_parts.str[1],
)
df = df[['country', 'state', 'year', 'month', 'cleaned_texts']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
# Replace the gappy index left by the filters with a fresh 0..n-1 index.
# Later cells rely on it: `pivot` uses the index as row key, and frames built
# from vectorizer output are aligned with `df` through this index.
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,country,state,year,month,cleaned_texts
0,USA,NJ,2020,4,Don’t drink ladder
1,USA,NJ,2020,4,We expect relapse hopefully contained smaller ...
2,USA,TX,2020,4,Want help great profit local small tshirt busi...
3,USA,NY,2020,4,Me torturing wife quarantine quarantine stay...
4,USA,CA,2020,3,Angela Merkel declares coronavirus biggest cha...


# Sentiment analysis

In [None]:
def getPolarity(text):
  """Return the TextBlob sentiment polarity score of `text`."""
  blob = TextBlob(text)
  return blob.sentiment.polarity

def getAnalysis(score):
  """Map a polarity score to a label.

  Returns 'Negative' for scores below zero, 'Neutral' for exactly zero and
  'Positive' for everything else.
  """
  if score < 0:
    return 'Negative'
  return 'Neutral' if score == 0 else 'Positive'

In [None]:
# Score every tweet once, then bucket each score into a sentiment label.
polarity = df['cleaned_texts'].apply(getPolarity)
df['sentiment'] = polarity
df['sentiment_analysis'] = polarity.apply(getAnalysis)
df.head()

Unnamed: 0,country,state,year,month,cleaned_texts,sentiment,sentiment_analysis
0,USA,NJ,2020,4,Don’t drink ladder,0.0,Neutral
1,USA,NJ,2020,4,We expect relapse hopefully contained smaller ...,0.0,Neutral
2,USA,TX,2020,4,Want help great profit local small tshirt busi...,0.3375,Positive
3,USA,NY,2020,4,Me torturing wife quarantine quarantine stay...,-0.181818,Negative
4,USA,CA,2020,3,Angela Merkel declares coronavirus biggest cha...,-0.2,Negative


In [None]:
# `Total` is a per-row counter (always 1) that is summed in later group-bys.
df['Total'] = 1
# Spread the counter into one column per sentiment label: a row gets 1 under
# its own label and, after filling the gaps, 0 under the others.
df_cat = df.pivot(columns='sentiment_analysis', values='Total').fillna(0)
df = pd.concat([df, df_cat], axis=1)
df.head()

Unnamed: 0,country,state,year,month,cleaned_texts,sentiment,sentiment_analysis,Total,Negative,Neutral,Positive
0,USA,NJ,2020,4,Don’t drink ladder,0.0,Neutral,1,0.0,1.0,0.0
1,USA,NJ,2020,4,We expect relapse hopefully contained smaller ...,0.0,Neutral,1,0.0,1.0,0.0
2,USA,TX,2020,4,Want help great profit local small tshirt busi...,0.3375,Positive,1,0.0,0.0,1.0
3,USA,NY,2020,4,Me torturing wife quarantine quarantine stay...,-0.181818,Negative,1,1.0,0.0,0.0
4,USA,CA,2020,3,Angela Merkel declares coronavirus biggest cha...,-0.2,Negative,1,1.0,0.0,0.0


# LDA

In [None]:
import gensim
from gensim.utils import simple_preprocess

In [None]:
def sent_to_words(sentences):
    """Lazily tokenize each item of `sentences` with gensim's simple_preprocess.

    Each item is converted to ``str`` first. ``deacc=True`` asks gensim to
    strip accent marks from the tokens (punctuation is already discarded by
    the tokenizer itself). Yields one list of tokens per input item.
    """
    yield from (
        gensim.utils.simple_preprocess(str(sentence), deacc=True)
        for sentence in sentences
    )

In [None]:
import gensim.corpora as corpora
from pprint import pprint

# Tokenize every tweet once; the same token lists feed both the dictionary
# and the bag-of-words corpus.
data_sent = df["cleaned_texts"].values.tolist()
data_words = list(sent_to_words(data_sent))
# Create Dictionary (token -> integer id)
id2word = corpora.Dictionary(data_words)
# Create Corpus
texts = data_words
# Term Document Frequency: one bag-of-words vector per tweet
corpus = [id2word.doc2bow(text) for text in texts]
# number of topics
num_topics = 10
# Build LDA model. The seed is fixed so that topic ids (and therefore the
# LDA_* columns derived from them) do not change arbitrarily between runs.
# NOTE(review): with several worker processes some run-to-run variation may
# remain — confirm if exact reproducibility is required.
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                    id2word=id2word,
                                    num_topics=num_topics,
                                    random_state=42)

In [None]:
def getLDA(text):
  """Return the id of the most probable LDA topic for one tweet.

  The text is tokenized with the same `simple_preprocess(..., deacc=True)`
  call that built the training dictionary. A plain `text.split()` keeps case
  and punctuation, so tokens such as "Want" or "Don’t" would not match any
  dictionary entry and would be silently ignored by `doc2bow`.
  """
  tokens = gensim.utils.simple_preprocess(str(text), deacc=True)
  new_text_corpus = id2word.doc2bow(tokens)
  # Indexing the model yields (topic_id, probability) pairs for the document.
  liste = lda_model[new_text_corpus]
  return max(liste, key=lambda x: x[1])[0]

df['LDA'] = df['cleaned_texts'].apply(getLDA)
df.head()

Unnamed: 0,country,state,year,month,cleaned_texts,sentiment,sentiment_analysis,Total,Negative,Neutral,Positive,LDA
0,USA,NJ,2020,4,Don’t drink ladder,0.0,Neutral,1,0.0,1.0,0.0,8
1,USA,NJ,2020,4,We expect relapse hopefully contained smaller ...,0.0,Neutral,1,0.0,1.0,0.0,0
2,USA,TX,2020,4,Want help great profit local small tshirt busi...,0.3375,Positive,1,0.0,0.0,1.0,3
3,USA,NY,2020,4,Me torturing wife quarantine quarantine stay...,-0.181818,Negative,1,1.0,0.0,0.0,3
4,USA,CA,2020,3,Angela Merkel declares coronavirus biggest cha...,-0.2,Negative,1,1.0,0.0,0.0,0


# NMF

In [None]:
# Create Document Term Matrix 'V'

from sklearn.feature_extraction.text import TfidfVectorizer

# Unigrams only; max_df / min_df drop terms that occur in more than 80% or
# in fewer than 1% of the tweets.
tv_noun = TfidfVectorizer(ngram_range = (1,1), max_df = .8, min_df = .01)

# Fit on the cleaned tweets and transform them into a TF-IDF doc-term matrix
data_tv_noun = tv_noun.fit_transform(df.cleaned_texts)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back only on older installations.
if hasattr(tv_noun, 'get_feature_names_out'):
    feature_names = tv_noun.get_feature_names_out()
else:
    feature_names = tv_noun.get_feature_names()

# Dense data-frame view of the matrix with the vocabulary terms as columns
data_dtm_noun = pd.DataFrame(data_tv_noun.toarray(), columns=feature_names)

# Reuse df's index so each matrix row can be matched to its tweet
data_dtm_noun.index = df.index

# Visually inspect Document Term Matrix
data_dtm_noun.head()



Unnamed: 0,all,also,america,americans,amp,and,another,anyone,around,as,...,what,who,why,work,working,world,would,wuhan,year,you
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.303393,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.499982,0.0,0.0,0.0,0.0


In [None]:
# Run NMF on Document Term Matrix 'V'
from sklearn.decomposition import NMF

# 10 components, passed by keyword for readability. The seed is fixed so the
# factorisation (and the NMF topic ids derived from it) is repeatable.
nmf_model = NMF(n_components=10, random_state=42)

# Learn an NMF model for the Document Term Matrix 'V' and
# extract the document-topic matrix 'W' (one row per tweet, one column per topic)
doc_topic = nmf_model.fit_transform(data_dtm_noun)



In [None]:
import numpy as np

def myfunction(x):
  """Return the index of the first maximum of the 1-D array `x`.

  Uses `np.argmax` instead of a Python-level `max` followed by `np.where`:
  a single vectorised pass with the same result (first index on ties).
  """
  return np.argmax(x)

In [None]:
# Dominant NMF topic per tweet = column index of the largest weight in its
# row of the document-topic matrix. A single vectorised argmax replaces one
# Python function call per row.
df['NMF'] = np.argmax(doc_topic, axis=1)
df.head()

Unnamed: 0,country,state,year,month,cleaned_texts,sentiment,sentiment_analysis,Total,Negative,Neutral,Positive,LDA,NMF
0,USA,NJ,2020,4,Don’t drink ladder,0.0,Neutral,1,0.0,1.0,0.0,8,9
1,USA,NJ,2020,4,We expect relapse hopefully contained smaller ...,0.0,Neutral,1,0.0,1.0,0.0,0,4
2,USA,TX,2020,4,Want help great profit local small tshirt busi...,0.3375,Positive,1,0.0,0.0,1.0,3,2
3,USA,NY,2020,4,Me torturing wife quarantine quarantine stay...,-0.181818,Negative,1,1.0,0.0,0.0,3,2
4,USA,CA,2020,3,Angela Merkel declares coronavirus biggest cha...,-0.2,Negative,1,1.0,0.0,0.0,0,6


# Tableau

In [None]:
# One indicator column per LDA topic (LDA_<id>): 1 for the tweet's assigned
# topic, 0 elsewhere, so that group-by sums give per-topic tweet counts.
df_lda = (
    df.pivot(columns='LDA', values='Total')
      .add_prefix('LDA_')
      .fillna(0)
)
df = pd.concat([df, df_lda], axis=1)
df.head()

Unnamed: 0,country,state,year,month,cleaned_texts,sentiment,sentiment_analysis,Total,Negative,Neutral,...,LDA_0,LDA_1,LDA_2,LDA_3,LDA_4,LDA_5,LDA_6,LDA_7,LDA_8,LDA_9
0,USA,NJ,2020,4,Don’t drink ladder,0.0,Neutral,1,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,USA,NJ,2020,4,We expect relapse hopefully contained smaller ...,0.0,Neutral,1,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,USA,TX,2020,4,Want help great profit local small tshirt busi...,0.3375,Positive,1,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,USA,NY,2020,4,Me torturing wife quarantine quarantine stay...,-0.181818,Negative,1,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,USA,CA,2020,3,Angela Merkel declares coronavirus biggest cha...,-0.2,Negative,1,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# One indicator column per NMF topic (NMF_<id>): 1 for the tweet's assigned
# topic, 0 elsewhere, so that group-by sums give per-topic tweet counts.
df_nmf = (
    df.pivot(columns='NMF', values='Total')
      .add_prefix('NMF_')
      .fillna(0)
)
df = pd.concat([df, df_nmf], axis=1)
df.head()

Unnamed: 0,country,state,year,month,cleaned_texts,sentiment,sentiment_analysis,Total,Negative,Neutral,...,NMF_0,NMF_1,NMF_2,NMF_3,NMF_4,NMF_5,NMF_6,NMF_7,NMF_8,NMF_9
0,USA,NJ,2020,4,Don’t drink ladder,0.0,Neutral,1,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,USA,NJ,2020,4,We expect relapse hopefully contained smaller ...,0.0,Neutral,1,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,USA,TX,2020,4,Want help great profit local small tshirt busi...,0.3375,Positive,1,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,USA,NY,2020,4,Me torturing wife quarantine quarantine stay...,-0.181818,Negative,1,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,USA,CA,2020,3,Angela Merkel declares coronavirus biggest cha...,-0.2,Negative,1,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text 

# Built-in English stop words plus corpus-specific terms that appear in
# nearly every tweet and would otherwise dominate the top-word lists.
my_stop_words = text.ENGLISH_STOP_WORDS.union(["covid","coronavirus","corona","virus","amp"])
# scikit-learn >= 1.2 validates `stop_words` and accepts only 'english', a
# list or None, so the frozenset is converted to a (sorted) list.
vec = CountVectorizer(stop_words = sorted(my_stop_words), max_features=1000)
X = vec.fit_transform(df['cleaned_texts'].to_list())
# get_feature_names() was removed in scikit-learn 1.2; fall back to it only
# on older installations.
if hasattr(vec, 'get_feature_names_out'):
    vocabulary = vec.get_feature_names_out()
else:
    vocabulary = vec.get_feature_names()
temp = pd.DataFrame(X.toarray(), columns=vocabulary)
# Attach the grouping keys (aligned on the shared 0..n-1 index).
# NOTE(review): if the vocabulary itself contains 'country', 'state', 'year'
# or 'month', that word-count column is overwritten here — confirm whether
# those words should be counted.
temp[['country', 'state', 'year', 'month']] = df[['country', 'state', 'year', 'month']]



In [None]:
# Aggregate per (country, state, year, month).
# numeric_only=True restricts the sum to the numeric counter columns; on
# pandas >= 2.0 the default would also "sum" (concatenate) the text columns.
df_state_month = df.groupby(['country', 'state', 'year', 'month'], as_index=False).sum(numeric_only=True)
# A summed polarity is meaningless, so replace it with the group mean. Both
# group-bys sort by the same keys, hence the rows line up positionally.
df_state_month['sentiment'] = df.groupby(['country', 'state', 'year', 'month'], as_index=False).agg({'sentiment': 'mean'})['sentiment']

# Dominant LDA topic per group: count tweets per (group, topic), keep the
# topic id as the index, then idxmax returns the topic with the highest count.
lda = df.groupby(['country', 'state', 'year', 'month', 'LDA'])['Total'].size().reset_index(level=['country', 'state', 'year', 'month']).groupby(['country', 'state', 'year', 'month'])['Total'].idxmax().to_frame()
liste_lda = lda['Total'].values
df_state_month['LDA'] = liste_lda
df_state_month['LDA'] = df_state_month['LDA'].apply(str)

# Same computation for the dominant NMF topic.
nmf = df.groupby(['country', 'state', 'year', 'month', 'NMF'])['Total'].size().reset_index(level=['country', 'state', 'year', 'month']).groupby(['country', 'state', 'year', 'month'])['Total'].idxmax().to_frame()
liste_nmf = nmf['Total'].values
df_state_month['NMF'] = liste_nmf
df_state_month['NMF'] = df_state_month['NMF'].apply(str)


def _top_words(counts):
    # Ten most frequent words of one group and their counts (nlargest is
    # evaluated once instead of twice).
    top = counts.nlargest(10)
    return pd.Series([top.index.values, top.values])


df_words = pd.DataFrame() 
df_words[['words','freq']] = temp.groupby(['country', 'state', 'year', 'month']).sum().apply(_top_words, axis=1)
# Move the group keys out of the index so rows align with df_state_month by position.
df_words = df_words.reset_index(level=[0, 1, 2, 3])
df_state_month['words'] = df_words['words']
df_state_month['freq'] = df_words['freq']

df_state_month.head()

Unnamed: 0,country,state,year,month,sentiment,Total,Negative,Neutral,Positive,LDA,...,NMF_2,NMF_3,NMF_4,NMF_5,NMF_6,NMF_7,NMF_8,NMF_9,words,freq
0,USA,AK,2020,1,0.103704,9,1.0,4.0,4.0,0,...,2.0,0.0,2.0,1.0,3.0,0.0,1.0,0.0,"[people, zero, like, attention, bit, case, cas...","[4, 3, 2, 1, 1, 1, 1, 1, 1, 1]"
1,USA,AK,2020,2,-0.009299,32,10.0,13.0,9.0,8,...,5.0,0.0,2.0,3.0,2.0,1.0,0.0,3.0,"[flu, like, sick, zero, cdc, people, wuhan, da...","[7, 6, 6, 6, 4, 4, 4, 3, 3, 3]"
2,USA,AK,2020,3,0.076315,255,53.0,105.0,97.0,8,...,17.0,17.0,58.0,9.0,24.0,12.0,12.0,18.0,"[people, case, trump, day, american, new, thin...","[32, 21, 17, 16, 10, 10, 10, 9, 9, 8]"
3,USA,AK,2020,4,0.092931,306,70.0,87.0,149.0,8,...,16.0,11.0,93.0,21.0,12.0,18.0,24.0,20.0,"[know, need, like, time, day, death, people, w...","[22, 22, 20, 17, 16, 16, 15, 15, 14, 13]"
4,USA,AK,2020,5,0.072473,99,22.0,36.0,41.0,8,...,3.0,4.0,25.0,6.0,7.0,9.0,10.0,12.0,"[day, people, like, student, administration, k...","[12, 12, 10, 9, 8, 8, 8, 8, 7, 6]"


In [None]:
# Aggregate per (country, state, year).
# numeric_only=True restricts the sum to the numeric counter columns; on
# pandas >= 2.0 the default would also "sum" (concatenate) the text columns.
df_state_2020 = df.groupby(['country', 'state', 'year'], as_index=False).sum(numeric_only=True)
# Replace the summed polarity with the group mean (same key order, so rows align).
df_state_2020['sentiment'] = df.groupby(['country', 'state', 'year'], as_index=False).agg({'sentiment': 'mean'})['sentiment']

# Dominant LDA topic per group: count tweets per (group, topic), keep the
# topic id as the index, then idxmax returns the topic with the highest count.
lda = df.groupby(['country', 'state', 'year', 'LDA'])['Total'].size().reset_index(level=['country', 'state', 'year']).groupby(['country', 'state', 'year'])['Total'].idxmax().to_frame()
liste_lda = lda['Total'].values
df_state_2020['LDA'] = liste_lda
# Stringify this table's own column. Reading it from df_state_month (as the
# cell previously did) replaced the values computed above with unrelated
# per-month rows matched by row number.
df_state_2020['LDA'] = df_state_2020['LDA'].apply(str)

# Same computation for the dominant NMF topic.
nmf = df.groupby(['country', 'state', 'year', 'NMF'])['Total'].size().reset_index(level=['country', 'state', 'year']).groupby(['country', 'state', 'year'])['Total'].idxmax().to_frame()
liste_nmf = nmf['Total'].values
df_state_2020['NMF'] = liste_nmf
df_state_2020['NMF'] = df_state_2020['NMF'].apply(str)


def _top_words(counts):
    # Ten most frequent words of one group and their counts.
    top = counts.nlargest(10)
    return pd.Series([top.index.values, top.values])


df_words = pd.DataFrame() 
df_words[['words','freq']] = temp.groupby(['country', 'state', 'year']).sum().apply(_top_words, axis=1)
# Move the group keys out of the index so rows align with df_state_2020 by position.
df_words = df_words.reset_index(level=[0, 1, 2])
df_state_2020['words'] = df_words['words']
df_state_2020['freq'] = df_words['freq']

df_state_2020.head()

Unnamed: 0,country,state,year,sentiment,Total,Negative,Neutral,Positive,LDA,NMF,...,NMF_2,NMF_3,NMF_4,NMF_5,NMF_6,NMF_7,NMF_8,NMF_9,words,freq
0,USA,AK,2020,0.079469,701,156.0,245.0,300.0,0,6,...,43.0,32.0,180.0,40.0,48.0,40.0,47.0,53.0,"[people, day, like, trump, know, case, need, t...","[67, 47, 45, 38, 37, 35, 32, 32, 30, 28]"
1,USA,AL,2020,0.073029,3233,636.0,1238.0,1359.0,8,1,...,455.0,151.0,755.0,195.0,170.0,157.0,223.0,222.0,"[people, pandemic, like, thank, time, trump, f...","[237, 227, 182, 164, 162, 142, 131, 129, 123, ..."
2,USA,AR,2020,0.050253,1602,353.0,608.0,641.0,8,0,...,154.0,72.0,383.0,95.0,91.0,96.0,108.0,117.0,"[people, like, case, need, day, trump, time, h...","[125, 92, 88, 88, 84, 80, 79, 72, 67, 56]"
3,USA,AZ,2020,0.049036,8624,2063.0,3260.0,3301.0,8,4,...,820.0,361.0,2225.0,416.0,537.0,606.0,583.0,583.0,"[people, trump, like, need, time, know, death,...","[750, 623, 489, 406, 404, 340, 322, 303, 301, ..."
4,USA,CA,2020,0.049101,67099,15547.0,25725.0,25827.0,8,4,...,7427.0,2994.0,16295.0,3609.0,4067.0,3492.0,4602.0,4258.0,"[people, like, trump, time, need, day, home, k...","[5578, 3757, 3471, 3234, 2736, 2459, 2376, 232..."


In [None]:
# Aggregate per (country, year, month) across all states.
# numeric_only=True restricts the sum to the numeric counter columns; on
# pandas >= 2.0 the default would also "sum" (concatenate) the text columns.
df_usa_month = df.groupby(['country', 'year', 'month'], as_index=False).sum(numeric_only=True)
# Replace the summed polarity with the group mean (same key order, so rows align).
df_usa_month['sentiment'] = df.groupby(['country', 'year', 'month'], as_index=False).agg({'sentiment': 'mean'})['sentiment']

# Dominant LDA topic per group. The topic column must be part of the first
# group-by: it becomes the index that idxmax returns. Without it idxmax only
# yields row positions, not topic ids.
lda = df.groupby(['country', 'year', 'month', 'LDA'])['Total'].size().reset_index(level=['country', 'year', 'month']).groupby(['country', 'year', 'month'])['Total'].idxmax().to_frame()
liste_lda = lda['Total'].values
df_usa_month['LDA'] = liste_lda
# Stringify this table's own column rather than copying df_state_month's.
df_usa_month['LDA'] = df_usa_month['LDA'].apply(str)

# Same computation for the dominant NMF topic.
nmf = df.groupby(['country', 'year', 'month', 'NMF'])['Total'].size().reset_index(level=['country', 'year', 'month']).groupby(['country', 'year', 'month'])['Total'].idxmax().to_frame()
liste_nmf = nmf['Total'].values
df_usa_month['NMF'] = liste_nmf
df_usa_month['NMF'] = df_usa_month['NMF'].apply(str)


def _top_words(counts):
    # Ten most frequent words of one group and their counts.
    top = counts.nlargest(10)
    return pd.Series([top.index.values, top.values])


df_words = pd.DataFrame() 
df_words[['words','freq']] = temp.groupby(['country', 'year', 'month']).sum().apply(_top_words, axis=1)
# Move the group keys out of the index so rows align with df_usa_month by position.
df_words = df_words.reset_index(level=[0, 1, 2])
df_usa_month['words'] = df_words['words']
df_usa_month['freq'] = df_words['freq']

df_usa_month.head()

Unnamed: 0,country,year,month,sentiment,Total,Negative,Neutral,Positive,LDA,NMF,...,NMF_2,NMF_3,NMF_4,NMF_5,NMF_6,NMF_7,NMF_8,NMF_9,words,freq
0,USA,2020,1,0.021545,5857,1338.0,2616.0,1903.0,0,6,...,790.0,243.0,1222.0,354.0,404.0,129.0,432.0,364.0,"[wuhan, china, people, like, outbreak, case, k...","[963, 792, 524, 416, 283, 240, 231, 224, 217, ..."
1,USA,2020,2,0.031654,17705,4173.0,7212.0,6320.0,8,1,...,1240.0,639.0,3068.0,908.0,1252.0,1006.0,1171.0,1172.0,"[people, wuhan, china, like, trump, know, beer...","[1692, 1606, 1266, 1020, 996, 692, 691, 649, 6..."
2,USA,2020,3,0.05296,140635,31123.0,55732.0,53780.0,8,0,...,16811.0,6574.0,34733.0,7222.0,8324.0,5777.0,9207.0,9416.0,"[people, like, time, need, trump, home, day, k...","[11376, 7764, 6530, 6128, 5494, 5354, 5169, 50..."
3,USA,2020,4,0.059898,163976,36753.0,60762.0,66461.0,8,4,...,17606.0,7536.0,44047.0,9447.0,9576.0,9648.0,11058.0,9518.0,"[people, trump, like, time, need, pandemic, ne...","[12795, 9600, 8587, 8111, 7245, 6787, 6742, 62..."
4,USA,2020,5,0.062455,52656,11778.0,19170.0,21708.0,8,4,...,5117.0,2300.0,13731.0,3138.0,3140.0,2678.0,3721.0,3304.0,"[people, like, trump, day, new, death, time, n...","[4274, 2878, 2798, 2495, 2460, 2364, 2350, 214..."


In [None]:
# Aggregate per (country, year): the nation-wide yearly roll-up.
# numeric_only=True restricts the sum to the numeric counter columns; on
# pandas >= 2.0 the default would also "sum" (concatenate) the text columns.
df_usa_2020 = df.groupby(['country', 'year'], as_index=False).sum(numeric_only=True)
# Replace the summed polarity with the group mean (same key order, so rows align).
df_usa_2020['sentiment'] = df.groupby(['country', 'year'], as_index=False).agg({'sentiment': 'mean'})['sentiment']

# Dominant LDA topic per group. The topic column must be part of the first
# group-by: it becomes the index that idxmax returns. Without it idxmax only
# yields row positions, not topic ids.
lda = df.groupby(['country', 'year', 'LDA'])['Total'].size().reset_index(level=['country', 'year']).groupby(['country', 'year'])['Total'].idxmax().to_frame()
liste_lda = lda['Total'].values
df_usa_2020['LDA'] = liste_lda
# Stringify this table's own column rather than copying df_state_month's.
df_usa_2020['LDA'] = df_usa_2020['LDA'].apply(str)

# Same computation for the dominant NMF topic.
nmf = df.groupby(['country', 'year', 'NMF'])['Total'].size().reset_index(level=['country', 'year']).groupby(['country', 'year'])['Total'].idxmax().to_frame()
liste_nmf = nmf['Total'].values
df_usa_2020['NMF'] = liste_nmf
df_usa_2020['NMF'] = df_usa_2020['NMF'].apply(str)


def _top_words(counts):
    # Ten most frequent words of one group and their counts.
    top = counts.nlargest(10)
    return pd.Series([top.index.values, top.values])


df_words = pd.DataFrame() 
df_words[['words','freq']] = temp.groupby(['country', 'year']).sum().apply(_top_words, axis=1)
# Move the group keys out of the index so rows align with df_usa_2020 by position.
df_words = df_words.reset_index(level=[0, 1])
df_usa_2020['words'] = df_words['words']
df_usa_2020['freq'] = df_words['freq']

df_usa_2020.head()

Unnamed: 0,country,year,sentiment,Total,Negative,Neutral,Positive,LDA,NMF,LDA_0,...,NMF_2,NMF_3,NMF_4,NMF_5,NMF_6,NMF_7,NMF_8,NMF_9,words,freq
0,USA,2020,0.055786,380829,85165.0,145492.0,150172.0,0,6,59871.0,...,41564.0,17292.0,96801.0,21069.0,22696.0,19238.0,25589.0,23774.0,"[people, like, trump, time, need, day, new, kn...","[30661, 20665, 18965, 17735, 16163, 14567, 139..."


In [None]:
# Stack the four aggregation levels into a single table.
aggregates = [df_state_month, df_state_2020, df_usa_month, df_usa_2020]
df_final = pd.concat(aggregates)
# Roll-up tables lack a `state` and/or `month` column, which leaves NaN after
# the concat; mark those rows with the sentinels 'USA' and '00'.
df_final['state'] = df_final['state'].fillna('USA')
df_final['month'] = df_final['month'].fillna('00')
df_final

Unnamed: 0,country,state,year,month,sentiment,Total,Negative,Neutral,Positive,LDA,...,NMF_2,NMF_3,NMF_4,NMF_5,NMF_6,NMF_7,NMF_8,NMF_9,words,freq
0,USA,AK,2020,01,0.103704,9,1.0,4.0,4.0,0,...,2.0,0.0,2.0,1.0,3.0,0.0,1.0,0.0,"[people, zero, like, attention, bit, case, cas...","[4, 3, 2, 1, 1, 1, 1, 1, 1, 1]"
1,USA,AK,2020,02,-0.009299,32,10.0,13.0,9.0,8,...,5.0,0.0,2.0,3.0,2.0,1.0,0.0,3.0,"[flu, like, sick, zero, cdc, people, wuhan, da...","[7, 6, 6, 6, 4, 4, 4, 3, 3, 3]"
2,USA,AK,2020,03,0.076315,255,53.0,105.0,97.0,8,...,17.0,17.0,58.0,9.0,24.0,12.0,12.0,18.0,"[people, case, trump, day, american, new, thin...","[32, 21, 17, 16, 10, 10, 10, 9, 9, 8]"
3,USA,AK,2020,04,0.092931,306,70.0,87.0,149.0,8,...,16.0,11.0,93.0,21.0,12.0,18.0,24.0,20.0,"[know, need, like, time, day, death, people, w...","[22, 22, 20, 17, 16, 16, 15, 15, 14, 13]"
4,USA,AK,2020,05,0.072473,99,22.0,36.0,41.0,8,...,3.0,4.0,25.0,6.0,7.0,9.0,10.0,12.0,"[day, people, like, student, administration, k...","[12, 12, 10, 9, 8, 8, 8, 8, 7, 6]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,USA,USA,2020,02,0.031654,17705,4173.0,7212.0,6320.0,8,...,1240.0,639.0,3068.0,908.0,1252.0,1006.0,1171.0,1172.0,"[people, wuhan, china, like, trump, know, beer...","[1692, 1606, 1266, 1020, 996, 692, 691, 649, 6..."
2,USA,USA,2020,03,0.052960,140635,31123.0,55732.0,53780.0,8,...,16811.0,6574.0,34733.0,7222.0,8324.0,5777.0,9207.0,9416.0,"[people, like, time, need, trump, home, day, k...","[11376, 7764, 6530, 6128, 5494, 5354, 5169, 50..."
3,USA,USA,2020,04,0.059898,163976,36753.0,60762.0,66461.0,8,...,17606.0,7536.0,44047.0,9447.0,9576.0,9648.0,11058.0,9518.0,"[people, trump, like, time, need, pandemic, ne...","[12795, 9600, 8587, 8111, 7245, 6787, 6742, 62..."
4,USA,USA,2020,05,0.062455,52656,11778.0,19170.0,21708.0,8,...,5117.0,2300.0,13731.0,3138.0,3140.0,2678.0,3721.0,3304.0,"[people, like, trump, day, new, death, time, n...","[4274, 2878, 2798, 2495, 2460, 2364, 2350, 214..."


In [None]:
# Lookup table from two-letter postal code to full name
# (the 50 states plus the District of Columbia).
states = {
    'AK': 'Alaska',
    'AL': 'Alabama',
    'AR': 'Arkansas',
    'AZ': 'Arizona',
    'CA': 'California',
    'CO': 'Colorado',
    'CT': 'Connecticut',
    'DC': 'District of Columbia',
    'DE': 'Delaware',
    'FL': 'Florida',
    'GA': 'Georgia',
    'HI': 'Hawaii',
    'IA': 'Iowa',
    'ID': 'Idaho',
    'IL': 'Illinois',
    'IN': 'Indiana',
    'KS': 'Kansas',
    'KY': 'Kentucky',
    'LA': 'Louisiana',
    'MA': 'Massachusetts',
    'MD': 'Maryland',
    'ME': 'Maine',
    'MI': 'Michigan',
    'MN': 'Minnesota',
    'MO': 'Missouri',
    'MS': 'Mississippi',
    'MT': 'Montana',
    'NC': 'North Carolina',
    'ND': 'North Dakota',
    'NE': 'Nebraska',
    'NH': 'New Hampshire',
    'NJ': 'New Jersey',
    'NM': 'New Mexico',
    'NV': 'Nevada',
    'NY': 'New York',
    'OH': 'Ohio',
    'OK': 'Oklahoma',
    'OR': 'Oregon',
    'PA': 'Pennsylvania',
    'RI': 'Rhode Island',
    'SC': 'South Carolina',
    'SD': 'South Dakota',
    'TN': 'Tennessee',
    'TX': 'Texas',
    'UT': 'Utah',
    'VA': 'Virginia',
    'VT': 'Vermont',
    'WA': 'Washington',
    'WI': 'Wisconsin',
    'WV': 'West Virginia',
    'WY': 'Wyoming'
}

In [None]:
# Translate postal codes to full names; values that are not keys of `states`
# (such as the 'USA' roll-up marker) are kept unchanged.
state_names = df_final['state'].map(lambda x: states.get(x, x))
if 'state_name' in df_final.columns:
    # Cell re-run: DataFrame.insert raises on an existing column, so overwrite it.
    df_final['state_name'] = state_names
else:
    # First run: place the new column right after `state` (position 2).
    df_final.insert(2, 'state_name', state_names)
df_final.head()

Unnamed: 0,country,state,state_name,year,month,sentiment,Total,Negative,Neutral,Positive,...,NMF_2,NMF_3,NMF_4,NMF_5,NMF_6,NMF_7,NMF_8,NMF_9,words,freq
0,USA,AK,Alaska,2020,1,0.103704,9,1.0,4.0,4.0,...,2.0,0.0,2.0,1.0,3.0,0.0,1.0,0.0,"[people, zero, like, attention, bit, case, cas...","[4, 3, 2, 1, 1, 1, 1, 1, 1, 1]"
1,USA,AK,Alaska,2020,2,-0.009299,32,10.0,13.0,9.0,...,5.0,0.0,2.0,3.0,2.0,1.0,0.0,3.0,"[flu, like, sick, zero, cdc, people, wuhan, da...","[7, 6, 6, 6, 4, 4, 4, 3, 3, 3]"
2,USA,AK,Alaska,2020,3,0.076315,255,53.0,105.0,97.0,...,17.0,17.0,58.0,9.0,24.0,12.0,12.0,18.0,"[people, case, trump, day, american, new, thin...","[32, 21, 17, 16, 10, 10, 10, 9, 9, 8]"
3,USA,AK,Alaska,2020,4,0.092931,306,70.0,87.0,149.0,...,16.0,11.0,93.0,21.0,12.0,18.0,24.0,20.0,"[know, need, like, time, day, death, people, w...","[22, 22, 20, 17, 16, 16, 15, 15, 14, 13]"
4,USA,AK,Alaska,2020,5,0.072473,99,22.0,36.0,41.0,...,3.0,4.0,25.0,6.0,7.0,9.0,10.0,12.0,"[day, people, like, student, administration, k...","[12, 12, 10, 9, 8, 8, 8, 8, 7, 6]"


In [None]:
# Display the combined table (the rendered output elides the middle rows).
df_final

Unnamed: 0,country,state,state_name,year,month,sentiment,Total,Negative,Neutral,Positive,...,NMF_2,NMF_3,NMF_4,NMF_5,NMF_6,NMF_7,NMF_8,NMF_9,words,freq
0,USA,AK,Alaska,2020,01,0.103704,9,1.0,4.0,4.0,...,2.0,0.0,2.0,1.0,3.0,0.0,1.0,0.0,"[people, zero, like, attention, bit, case, cas...","[4, 3, 2, 1, 1, 1, 1, 1, 1, 1]"
1,USA,AK,Alaska,2020,02,-0.009299,32,10.0,13.0,9.0,...,5.0,0.0,2.0,3.0,2.0,1.0,0.0,3.0,"[flu, like, sick, zero, cdc, people, wuhan, da...","[7, 6, 6, 6, 4, 4, 4, 3, 3, 3]"
2,USA,AK,Alaska,2020,03,0.076315,255,53.0,105.0,97.0,...,17.0,17.0,58.0,9.0,24.0,12.0,12.0,18.0,"[people, case, trump, day, american, new, thin...","[32, 21, 17, 16, 10, 10, 10, 9, 9, 8]"
3,USA,AK,Alaska,2020,04,0.092931,306,70.0,87.0,149.0,...,16.0,11.0,93.0,21.0,12.0,18.0,24.0,20.0,"[know, need, like, time, day, death, people, w...","[22, 22, 20, 17, 16, 16, 15, 15, 14, 13]"
4,USA,AK,Alaska,2020,05,0.072473,99,22.0,36.0,41.0,...,3.0,4.0,25.0,6.0,7.0,9.0,10.0,12.0,"[day, people, like, student, administration, k...","[12, 12, 10, 9, 8, 8, 8, 8, 7, 6]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,USA,USA,USA,2020,02,0.031654,17705,4173.0,7212.0,6320.0,...,1240.0,639.0,3068.0,908.0,1252.0,1006.0,1171.0,1172.0,"[people, wuhan, china, like, trump, know, beer...","[1692, 1606, 1266, 1020, 996, 692, 691, 649, 6..."
2,USA,USA,USA,2020,03,0.052960,140635,31123.0,55732.0,53780.0,...,16811.0,6574.0,34733.0,7222.0,8324.0,5777.0,9207.0,9416.0,"[people, like, time, need, trump, home, day, k...","[11376, 7764, 6530, 6128, 5494, 5354, 5169, 50..."
3,USA,USA,USA,2020,04,0.059898,163976,36753.0,60762.0,66461.0,...,17606.0,7536.0,44047.0,9447.0,9576.0,9648.0,11058.0,9518.0,"[people, trump, like, time, need, pandemic, ne...","[12795, 9600, 8587, 8111, 7245, 6787, 6742, 62..."
4,USA,USA,USA,2020,05,0.062455,52656,11778.0,19170.0,21708.0,...,5117.0,2300.0,13731.0,3138.0,3140.0,2678.0,3721.0,3304.0,"[people, like, trump, day, new, death, time, n...","[4274, 2878, 2798, 2495, 2460, 2364, 2350, 214..."


In [None]:
# Write the combined table to Drive without the row index.
# NOTE(review): `words` and `freq` hold NumPy arrays, so they are written as
# their string representation — confirm the consumer can parse that.
df_final.to_csv('/content/drive/MyDrive/Colab Notebooks/M2_MLSD/PPD/data_map.csv',index=False)