# Sentiment and Topic Analysis

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
os.chdir('/content/drive/My Drive/BSE/TextMining')

## Preprocessing

In [None]:
!pip install emoji

In [None]:
import numpy as np
import pandas as pd
import emoji
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer   
from nltk.stem.wordnet import WordNetLemmatizer
import re

import math
import warnings
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
stopwords_eng =stopwords.words("english")
stopwords_es =stopwords.words("spanish")
stops = stopwords_eng + stopwords_es

def cleanTweets(s):
    """Normalise a raw tweet into lower-case plain text.

    Steps, in order: convert emojis to ``:name:`` codes, unescape HTML
    entities and line-break markers, strip URLs, some punctuation, mentions,
    emoji codes, hashtags and digits, lower-case the text, and drop the
    English/Spanish stopwords held in the notebook-level ``stops``.

    Parameters
    ----------
    s : str or float
        Raw tweet text. Non-string values (e.g. the float NaN pandas uses for
        missing text) yield an empty string.

    Returns
    -------
    str
        The cleaned tweet, tokens separated by single spaces.
    """
    # Missing tweets arrive as float NaN. The former `type(s)==np.float` check
    # breaks on NumPy >= 1.24, where the np.float alias no longer exists.
    if not isinstance(s, str):
        return ""

    # Demojize text: every emoji becomes a ":name:" code
    s = emoji.demojize(s)

    # Normalise line-break/tab markers and HTML entities
    s = s.replace('<lb>', "\n")
    s = s.replace('<tab>', "\t")  # was "\i", an invalid escape sequence
    s = re.sub(r'<br */*>', "\n", s)
    s = s.replace("&lt;", "<").replace("&gt;", ">").replace("&amp;", "&")
    s = s.replace("\n", " ")

    # markdown urls
    s = re.sub(r'\(https*://[^\)]*\)', "", s)
    # normal urls
    s = re.sub(r'https*://[^\s]*', "", s)
    s = re.sub(r'"+', '"', s)

    # Remove punctuation and bracketed spans
    s = re.sub(r'[()!?]', ' ', s)
    s = re.sub(r'\[.*?\]', ' ', s)

    # custom removals
    s = re.sub(r'@[A-Za-z0-9_]+', "", s)  # remove mentions
    # Remove only complete ":name:" emoji codes. The previous pattern
    # (':[^:]+') swallowed everything after the first colon, so any tweet
    # containing ":" or an emoji lost the rest of its text.
    s = re.sub(r':[^:\s]+:', ' ', s)
    # \w also covers accented letters, so "#educación" is removed as a whole
    s = re.sub(r'#\w+', "", s)  # remove hashtags

    s = re.sub(r'[0-9]', '', s)  # remove digits

    s = s.lower()

    # Remove stopwords: test each word (the old code tested the whole token
    # list against `stops`, which never matched, so nothing was removed).
    return " ".join(w for w in s.split() if w not in stops)


stemmer_eng=SnowballStemmer("english")
stemmer_es=SnowballStemmer("spanish")
lem = WordNetLemmatizer()

def stem_lematize(s, modulation):
    """Tokenise ``s`` on non-word characters and normalise each token.

    Parameters
    ----------
    s : str
        Text to process (typically the output of ``cleanTweets``).
    modulation : int
        0 keeps tokens unchanged, 1 applies the Spanish Snowball stemmer
        (``stemmer_es``), 2 applies the WordNet lemmatizer (``lem``).

    Returns
    -------
    str
        The processed tokens joined by single spaces.

    Raises
    ------
    ValueError
        If ``modulation`` is not 0, 1 or 2.
    """
    # re.split leaves empty strings at the edges of the text; drop them so the
    # result has no leading/trailing blanks.
    tokens = [token for token in re.split(r'\W+', s) if token]

    if modulation == 0:
        # Previously the whole token list was appended per token, which made
        # the final join raise TypeError.
        processed = tokens
    elif modulation == 1:
        # Stem once, matching how the topic dictionaries are stemmed elsewhere
        # in the notebook (the old code ran the Spanish stemmer twice).
        processed = [stemmer_es.stem(token) for token in tokens]
    elif modulation == 2:
        processed = [lem.lemmatize(token) for token in tokens]
    else:
        raise ValueError("modulation must be 0 (none), 1 (stem) or 2 (lemmatize)")

    return " ".join(processed)

In [None]:
#corpus_data=pd.read_csv("Data/all_tweets_filtered.csv")

In [None]:
df=pd.read_csv("all_tweets_filtered_final.csv")

In [None]:
df['cleaned_text'] = [cleanTweets(text) for text in df['text']]

In [None]:
df['stem_text'] = [stem_lematize(text, 1) for text in df['cleaned_text']]

In [None]:
df.to_csv('all_tweets_stem.csv')

In [None]:
df['cleaned_text']

In [None]:
df['stem_text']

# Sentiment Analysis & LDA

In [None]:
#Import Data
#df=pd.read_csv("Data/all_tweets_preprocessed.csv") 

In [None]:
df.info()

## Sentiment Analysis Method 1

In [None]:
!pip install sentiment-analysis-spanish

In [None]:
df['cleaned_text'] = df['cleaned_text'].astype(str)

In [None]:
from sentiment_analysis_spanish import sentiment_analysis

def get_sentiment(data,column):
    """Score each row's ``column`` text with SentimentAnalysisSpanish.

    The scores are written to a new "sentiment" column on ``data`` itself
    (the frame is modified in place) and the same frame is returned.
    """
    analyzer = sentiment_analysis.SentimentAnalysisSpanish()
    # Output between 0 and 1, low numbers negative, high numbers positive
    scores = [analyzer.sentiment(record[column]) for _, record in data.iterrows()]
    data["sentiment"] = scores
    return data

df_1=get_sentiment(df,"cleaned_text")

In [None]:
df_1.groupby(["Country","Categorie","Time"])["sentiment"].mean()

In [None]:
df_1[df_1["sentiment"]>0.8]

In [None]:
df_1.to_csv('all_tweets_sentiment_1.csv')

In [None]:
#Plot sentiment by day to check for date impact

In [None]:
# Read the file written by the cell above, relative to the working directory,
# instead of a machine-specific absolute drive path.
df_all=pd.read_csv("all_tweets_sentiment_1.csv")

In [None]:
df_all.info()

In [None]:
#create mapping table, 1 row per author_id
map_table = df_all.drop_duplicates(subset=['author_id'])

In [None]:
df_all=df_all.rename(columns={"Days since":"days_since","Day Protest":"day_protest"})

In [None]:
sns.set_palette("icefire")
sns.set(rc={'figure.figsize':(10,7)})
sent=sns.lineplot(data=df_all,x="days_since",y="sentiment",hue="Country")

sent.set_title( "Sentiment comparison before, during and after the protest")
sent.set_xlabel( "Timeline in Days (0: Start Protest)")
sent.set_ylabel( "Sentiment Score (0: Negative, 1: Positive)")

In [None]:
sns.set(rc={'figure.figsize':(15,10)})
sns.lineplot(data=df_all,x="days_since",y="sentiment",hue="Country",style="Categorie")

In [None]:
sns.set()
sns.set(rc={'figure.figsize':(15,7)})
#define plotting region (1 row, 2 columns)
fig, axes = plt.subplots(1, 2)

#create boxplot in each subplot
pl1=sns.lineplot(data=df_all[df_all["Country"]=="Chile"],x="days_since",y="sentiment",hue="Categorie",ax=axes[0])
pl2=sns.lineplot(data=df_all[df_all["Country"]=="Colombia"],x="days_since",y="sentiment",hue="Categorie",ax=axes[1])

pl1.set_title( "Sentiment in Chile")
pl1.set_xlabel( "Timeline in Days (0: Start Protest)")
pl1.set_ylabel( "Sentiment Score (0: Negative, 1: Positive)")
pl2.set_title( "Sentiment in Colombia")
pl2.set_xlabel( "Timeline in Days (0: Start Protest)")
pl2.set_ylabel( "Sentiment Score (0: Negative, 1: Positive)")

**ANOVA**

In [None]:
df=df_all[df_all["Country"]=="Chile"]

In [None]:
from scipy import stats

# Sentiment scores grouped by protest phase (the distinct values of "Time")
keys = list(df.Time.unique())
values = [list(df.loc[df['Time'] == phase, 'sentiment']) for phase in keys]
data = dict(zip(keys, values))

# One-way ANOVA over the three phases; f_oneway returns the F statistic and p-value
fvalue, pvalue = stats.f_oneway(data['Before'], data['During'], data['After'])

print(f"Results of ANOVA test:\n The F-statistic is: {fvalue}\n The p-value is: {pvalue}")
#Highly significant

from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Tukey HSD: pairwise comparison of the phases at the 5% level
m_comp = pairwise_tukeyhsd(endog=df['sentiment'], groups=df['Time'], alpha=0.05)
print(m_comp)
#All are different

In [None]:
df=df_all[df_all["Country"]=="Colombia"]

from scipy import stats

# Sentiment scores of the Colombian tweets grouped by protest phase ("Time")
keys = list(df.Time.unique())
values = [list(df.loc[df['Time'] == phase, 'sentiment']) for phase in keys]
data = dict(zip(keys, values))

# stats.f_oneway takes the groups as input and returns the F statistic and p-value
fvalue, pvalue = stats.f_oneway(data['Before'],
                                data['During'],
                                data['After'])

# The p-value is shown with 3 significant digits: rounding it to two decimals
# printed a highly significant result as 0.0.
print(f"Results of ANOVA test:\n The F-statistic is: {round(fvalue,2)}\n The p-value is: {pvalue:.3g}")
#Highly significant

from statsmodels.stats.multicomp import pairwise_tukeyhsd

# perform multiple pairwise comparison (Tukey HSD)
m_comp = pairwise_tukeyhsd(endog=df['sentiment'], groups=df['Time'], alpha=0.05)
print(m_comp)
#All are different

**Right vs. Left**

In [None]:
politicians=df_all[df_all["Categorie"]=="Politician"]

In [None]:
sns.set()

#define plotting region (1 row, 2 columns)
fig, axes = plt.subplots(1, 2)

#create boxplot in each subplot
sns.lineplot(data=politicians[politicians["Country"]=="Colombia"],x="days_since",y="sentiment",hue="Left/Right",ax=axes[0])
sns.lineplot(data=politicians[politicians["Country"]=="Chile"],x="days_since",y="sentiment",hue="Left/Right",ax=axes[1])

In [None]:
politicians.groupby(["Country","Time","Left/Right"])["sentiment"].mean()

## Sentiment Analysis Method 2

**Pysentimiento - Getting the Data**

In [None]:
!pip install pysentimiento

In [None]:
df_1=pd.read_csv("all_tweets_sentiment_1.csv")

In [None]:
df_1['cleaned_text'] = df_1['cleaned_text'].astype(str)

In [None]:
df_1['text'] = df_1['text'].astype(str)

In [None]:
pd.set_option('display.max_rows', 100)

In [None]:
print(df_1["Days since"].value_counts()) #0,5,-5,195

In [None]:
#Filter on four days for analysis
df_test=df_1[df_1["Days since"].isin([-5,0,5,195])]

In [None]:
df_test["Days since"].value_counts()

In [None]:
from pysentimiento import create_analyzer
from pysentimiento.preprocessing import preprocess_tweet

In [None]:
import time

In [None]:
#Without preprocessing
def get_sentiment_np(data,column):
    """Add a "sentiment_neg" column holding pysentimiento's NEG probability.

    The text in ``column`` is passed to the Spanish sentiment analyzer as-is
    (no tweet preprocessing). ``data`` is modified in place and returned; the
    elapsed wall-clock time is printed.
    """
    started = time.time()
    analyzer = create_analyzer(task="sentiment", lang="es")
    neg_probabilities = [
        analyzer.predict(record[column]).probas["NEG"]
        for _, record in data.iterrows()
    ]
    data["sentiment_neg"] = neg_probabilities
    print("--- %s seconds ---" % (time.time() - started))
    return data

In [None]:
df_test=get_sentiment_np(df_test,"cleaned_text")

In [None]:
df_test.to_csv('subset_sentiment.csv')

In [None]:
df_test.groupby(["Country","Categorie","Days since"])["sentiment_neg"].mean()

In [None]:
pd.options.display.max_colwidth = 500

In [None]:
# df_2 is never defined in this notebook; df_test is the frame that carries
# the sentiment_neg column computed above.
df_test.loc[df_test["sentiment_neg"]<0.05, "text"]

In [None]:
def get_hate_speech(data,column):
    """Add a "hate_speech" column: mean of the "hateful" and "aggressive" probabilities.

    Each text in ``column`` is run through ``preprocess_tweet`` and then the
    Spanish hate-speech analyzer. ``data`` is modified in place and returned.
    """
    hate_speech_analyzer = create_analyzer(task="hate_speech", lang="es")

    def _score(raw_text):
        # average the two probabilities into a single score
        probas = hate_speech_analyzer.predict(preprocess_tweet(raw_text)).probas
        return (probas["hateful"]+probas["aggressive"])/2

    data["hate_speech"] = [_score(record[column]) for _, record in data.iterrows()]
    return data

In [None]:
df_test=get_hate_speech(df_test,"text")
df_test.to_csv('subset_sentiment.csv')

In [None]:
df_test.groupby(["Country","Categorie","Days since"])["hate_speech"].mean()

In [None]:
df_test["hate_speech"]

In [None]:
def get_emotions(data,column):
    """Add one probability column per emotion predicted by pysentimiento.

    Each text in ``column`` is run through ``preprocess_tweet`` and the
    emotion analyzer; the probabilities for fear, surprise, joy, disgust,
    sadness, anger and others are stored in columns of the same name.
    ``data`` is modified in place and returned.
    """
    emotions=["fear","surprise","joy","disgust","sadness","anger","others"]

    # Use the Spanish model, like the sentiment and hate-speech analyzers in
    # this notebook; the previous lang="en" scored Spanish tweets with the
    # English model.
    emotion_analyzer = create_analyzer(task="emotion", lang="es")

    # one list of probabilities per emotion, filled row by row
    scores = {emotion: [] for emotion in emotions}
    for index, row in data.iterrows():
        text = preprocess_tweet(row[column])
        probas = emotion_analyzer.predict(text).probas
        for emotion in emotions:
            scores[emotion].append(probas[emotion])

    for emotion in emotions:
        data[emotion] = scores[emotion]

    return data

In [None]:
df_test=get_emotions(df_test,"text")
df_test.to_csv('subset_sentiment.csv')

In [None]:
df_test.groupby(["Country","Categorie","Days since"])[["sadness","joy","fear","disgust","anger","surprise","others"]].mean()

**Analyze Data**

In [None]:
# Read the file written by the cells above, relative to the working directory,
# instead of a user-specific absolute Windows path.
sentiment=pd.read_csv("subset_sentiment.csv")

In [None]:
sentiment.info()

In [None]:
sns.color_palette("icefire", as_cmap=True)
sns.set_palette("icefire")

In [None]:
# set as Categorical and Categories as strings
sentiment.sort_values('Days since', inplace=True, ascending=True)
sentiment["Timeline"] = sentiment["Days since"].astype(str)


# plot
fig, ax = plt.subplots(figsize = (16,8))
p1 = sns.lineplot(ax=ax, x='Timeline', y='sentiment_neg', data=sentiment, hue='Country')

In [None]:
# plot
fig, ax = plt.subplots(figsize = (16,8))
p1 = sns.lineplot(ax=ax, x='Timeline', y='sentiment_neg', data=sentiment, hue='Country',style="Categorie")

In [None]:
# plot
fig, ax = plt.subplots(figsize = (16,8))
p1 = sns.lineplot(ax=ax, x='Timeline', y='sentiment_neg', data=sentiment, hue='Country',style="Left/Right")

In [None]:
# sns.factorplot was renamed to catplot in seaborn 0.9 and has since been
# removed; catplot takes the same arguments and returns a FacetGrid.
plot = sns.catplot(x='Country', y='sentiment_neg', hue='Timeline', data=sentiment, kind='bar')
plot.set(xlabel='Country', ylabel='Negativity of the Tweet')

In [None]:
g = sns.FacetGrid(sentiment, row="Country", col="Categorie", hue="Timeline", height=5,aspect=1)
g.map(sns.barplot, "Timeline", "sentiment_neg",order=["-5", "0","5","195"])
g.add_legend()

**Hate Speech**

In [None]:
# plot disgust, anger, sadness, joy
fig, ax = plt.subplots(figsize = (16,8))
p1 = sns.lineplot(ax=ax, x='Timeline', y='disgust', data=sentiment, hue='Country')

In [None]:
# plot
fig, ax = plt.subplots(figsize = (10,5))
p1 = sns.lineplot(ax=ax, x='Timeline', y='hate_speech', data=sentiment, hue='Country',style="Categorie")
p1.set_ylabel("% of Tweets that contain Hate Speech")

In [None]:
# plot
fig, ax = plt.subplots(figsize = (16,8))
p1 = sns.lineplot(ax=ax, x='Timeline', y='hate_speech', data=sentiment, hue='Country',style="Left/Right")

**Emotions**

In [None]:
chile_sent=sentiment[sentiment["Country"]=="Chile"]
col_sent=sentiment[sentiment["Country"]=="Colombia"]

In [None]:
# sns.factorplot was renamed to catplot in seaborn 0.9 and has since been
# removed; catplot takes the same arguments and returns a FacetGrid.
plot = sns.catplot(x='Country', y='sentiment_neg', hue='Timeline', data=chile_sent, kind='bar')
plot.set(xlabel='Country', ylabel='Negativity of the Tweet')

In [None]:
g = sns.FacetGrid(sentiment, row="Country", col="Categorie", hue="Timeline", height=5,aspect=1)
g.map(sns.barplot, "Timeline", "sentiment_neg",order=["-5", "0","5","195"])
g.add_legend()

In [None]:
# plot disgust, anger, sadness and joy for Chile on one shared axis
fig, ax = plt.subplots(figsize = (16,8))
p1 = sns.lineplot(ax=ax, x='Timeline', y='disgust', data=chile_sent, color='red')
p1 = sns.lineplot(ax=ax, x='Timeline', y='anger', data=chile_sent, color='blue')
# the colour literal previously mixed quote styles ('green"), a SyntaxError
p1 = sns.lineplot(ax=ax, x='Timeline', y='sadness', data=chile_sent, color='green')
p1 = sns.lineplot(ax=ax, x='Timeline', y='joy', data=chile_sent, color='black')

In [None]:
# plot
fig, ax = plt.subplots(figsize = (16,8))
p1 = sns.lineplot(ax=ax, x='Timeline', y='disgust', data=col_sent, color='red')
p1 = sns.lineplot(ax=ax, x='Timeline', y='anger', data=col_sent, color='blue')
p1 = sns.lineplot(ax=ax, x='Timeline', y='sadness', data=col_sent, color='green')
p1 = sns.lineplot(ax=ax, x='Timeline', y='joy', data=col_sent, color='black')

In [None]:
# plot
fig, ax = plt.subplots(figsize = (16,8))
p1 = sns.lineplot(ax=ax, x='Timeline', y='disgust', data=col_sent, color='red',style="Left/Right")
p1 = sns.lineplot(ax=ax, x='Timeline', y='anger', data=col_sent, color='blue',style="Left/Right")
p1 = sns.lineplot(ax=ax, x='Timeline', y='sadness', data=col_sent, color='green',style="Left/Right")
p1 = sns.lineplot(ax=ax, x='Timeline', y='joy', data=col_sent, color='black',style="Left/Right")

In [None]:
fig, ax = plt.subplots(figsize = (16,8))
p1 = sns.lineplot(ax=ax, x='Timeline', y='disgust', data=chile_sent, color='red',style="Left/Right")
p1 = sns.lineplot(ax=ax, x='Timeline', y='anger', data=chile_sent, color='blue',style="Left/Right")
p1 = sns.lineplot(ax=ax, x='Timeline', y='sadness', data=chile_sent, color='green',style="Left/Right")
p1 = sns.lineplot(ax=ax, x='Timeline', y='joy', data=chile_sent, color='black',style="Left/Right")

In [None]:
# plot
fig, ax = plt.subplots(figsize = (16,8))
p1 = sns.lineplot(ax=ax, x='Timeline', y='disgust', data=col_sent, color='red',style="Categorie")
p1 = sns.lineplot(ax=ax, x='Timeline', y='anger', data=col_sent, color='blue',style="Categorie")
p1 = sns.lineplot(ax=ax, x='Timeline', y='sadness', data=col_sent, color='green',style="Categorie")
p1 = sns.lineplot(ax=ax, x='Timeline', y='joy', data=col_sent, color='black',style="Categorie")

In [None]:
# plot
fig, ax = plt.subplots(figsize = (16,8))
p1 = sns.lineplot(ax=ax, x='Timeline', y='disgust', data=chile_sent, color='red',style="Categorie")
p1 = sns.lineplot(ax=ax, x='Timeline', y='anger', data=chile_sent, color='blue',style="Categorie")
p1 = sns.lineplot(ax=ax, x='Timeline', y='sadness', data=chile_sent, color='green',style="Categorie")
p1 = sns.lineplot(ax=ax, x='Timeline', y='joy', data=chile_sent, color='black',style="Categorie")

## LDA

### Unguided LDA

In [None]:
#Split Data
chile=df_all[df_all["Country"]=="Chile"]
colombia=df_all[df_all["Country"]=="Colombia"]

In [None]:
# reproducibility
seed = 42
# python RNG
import random
random.seed(seed)
# numpy RNG
import numpy as np
np.random.seed(seed)

In [None]:
def agg(df):
    """Concatenate each author's cleaned tweets per protest phase.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "author_id", "Time" and "cleaned_text".
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by (author_id, Time) with two columns: "joined_text" (the
        group's tweets joined by ". ") and "count_tweets" (rows in the group).
    """
    # Work on a copy so the caller's frame (often a slice of a larger frame)
    # is left untouched.
    subset = df[['author_id', 'Time', 'cleaned_text']].copy()
    # Missing texts become empty strings rather than the literal "nan"/"None",
    # which would otherwise end up as tokens in the topic model.
    subset['cleaned_text'] = subset['cleaned_text'].fillna("").astype(str)

    df_agg = subset.groupby(['author_id', 'Time'])['cleaned_text'].agg(
        joined_text=lambda tweets: '. '.join(tweets),
        count_tweets='count',
    )
    return df_agg

**Chile Unguided**

In [None]:
chile_agg=agg(chile)

In [None]:
chile_agg

In [None]:
import gensim
from gensim import models
from gensim import corpora
from gensim.utils import simple_preprocess
from collections import defaultdict

In [None]:
text=chile_agg['joined_text']

In [None]:
from termcolor import colored, cprint
from nltk.stem import SnowballStemmer   
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

porter=SnowballStemmer("spanish")
lmtzr = WordNetLemmatizer()

stop_words = set(stopwords.words('spanish'))


def abbr_or_lower(word):
    """Return ``word`` unchanged if it starts like an abbreviation, else lower-cased.

    A word counts as an abbreviation when it begins with at least two
    consecutive groups of "capital letters followed by optional lower-case
    letters" (e.g. "AFP", "McD").
    """
    starts_like_abbreviation = re.match('([A-Z]+[a-z]*){2,}', word) is not None
    return word if starts_like_abbreviation else word.lower()

def tokenize(words, modulation):
    """Split ``words`` on non-word characters and return the kept tokens as a list.

    Tokens are passed through ``abbr_or_lower``; stopwords (``stop_words``)
    and tokens without any ASCII letter are dropped. ``modulation`` selects
    the normalisation: 1 = stem with ``porter``, 2 = lemmatize with ``lmtzr``,
    0 = keep as-is. Any other value yields an empty list.
    """
    kept = []
    for raw in re.split(r'\W+', words):
        candidate = abbr_or_lower(raw)
        # skip stopwords and tokens with no letters (numbers, leftovers of punctuation)
        if candidate in stop_words or not re.search('[a-zA-Z]', candidate):
            continue
        if modulation == 1:
            kept.append(porter.stem(candidate))
        elif modulation == 2:
            kept.append(lmtzr.lemmatize(candidate))
        elif modulation == 0:
            kept.append(candidate)
    return kept

In [None]:
texts=[tokenize(document,0) for document in text]

In [None]:
#making bigrams and trigrams
# Learn bigram phrases on the tokenised documents, then trigram phrases on the
# bigram-merged documents.
bigram = gensim.models.Phrases(texts, min_count=1, threshold=1)
trigram = gensim.models.Phrases(bigram[texts], threshold=1)

bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

texts_bigrams = [bigram_mod[doc] for doc in texts]
# The trigram model was trained on bigram-merged documents, so it must also be
# applied to bigram-merged documents (it was previously applied to the raw tokens).
texts_trigrams = [trigram_mod[doc] for doc in texts_bigrams]

In [None]:
texts=texts_trigrams

# Count corpus-wide token frequencies. The loop variable is `doc` (not `text`)
# so the notebook-level `text` Series from the earlier cell is not overwritten.
frequency = defaultdict(int)
for doc in texts:
    for token in doc:
        frequency[token] += 1

# drop tokens that occur only once in the whole corpus
texts = [
    [token for token in doc if frequency[token] > 1]
    for doc in texts
]

dictionary = corpora.Dictionary(texts)
print(dictionary)

dictionary.filter_extremes(no_below = 20, no_above=0.25)
# we are getting rid of things like "said" with the no_above threshold (depending on stemming more will be thrown out)
print(dictionary)

corpus = [dictionary.doc2bow(doc) for doc in texts]

In [None]:
#estimate the lda model with 8 topics
topicnumber=8

# Tuning: alpha='symmetric', eta=None

#alpha default is ’symmetric’: Uses a fixed symmetric prior of 1.0 / num_topics. In our case that is 1/8 = 0.125.
#Change alpha if wanted
lda=models.LdaModel(corpus, id2word=dictionary, num_topics=topicnumber, random_state=42,eta=0.001)

lda.print_topics(topicnumber,20)

In [None]:
get_document_topics = [lda.get_document_topics(item, minimum_probability=0.000000001) for item in corpus]

#note you could also use this vector = [lda[item] for item in corpus] but it would take into account the prior

vector = [lda[item] for item in corpus]

#this produces a lits of tuples
print(get_document_topics[0:1])
print(vector[0:1])

**Colombia Unguided**

In [None]:
col_agg=agg(colombia)

In [None]:
text=col_agg['joined_text']

In [None]:
texts=[tokenize(document,0) for document in text]

In [None]:
#making bigrams and trigrams
# Learn bigram phrases on the tokenised documents, then trigram phrases on the
# bigram-merged documents.
bigram = gensim.models.Phrases(texts, min_count=1, threshold=1)
trigram = gensim.models.Phrases(bigram[texts], threshold=1)

bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

texts_bigrams = [bigram_mod[doc] for doc in texts]
# The trigram model was trained on bigram-merged documents, so it must also be
# applied to bigram-merged documents (it was previously applied to the raw tokens).
texts_trigrams = [trigram_mod[doc] for doc in texts_bigrams]

In [None]:
texts=texts_trigrams

# Count corpus-wide token frequencies. The loop variable is `doc` (not `text`)
# so the notebook-level `text` Series from the earlier cell is not overwritten.
frequency = defaultdict(int)
for doc in texts:
    for token in doc:
        frequency[token] += 1

# drop tokens that occur only once in the whole corpus
texts = [
    [token for token in doc if frequency[token] > 1]
    for doc in texts
]

dictionary = corpora.Dictionary(texts)
print(dictionary)

dictionary.filter_extremes(no_below = 20, no_above=0.25)
# we are getting rid of things like "said" with the no_above threshold (depending on stemming more will be thrown out)
print(dictionary)

corpus = [dictionary.doc2bow(doc) for doc in texts]

In [None]:
#estimate the lda model with 8 topics
topicnumber=8

# Tuning: alpha='symmetric', eta=None

#alpha default is ’symmetric’: Uses a fixed symmetric prior of 1.0 / num_topics. In our case that is 1/8 = 0.125.
#Change alpha if wanted
lda=models.LdaModel(corpus, id2word=dictionary, num_topics=topicnumber, random_state=42,eta=0.001)

lda.print_topics(topicnumber,20)

Unguided LDA is not working well

### Guided LDA

**Chile**

In [None]:
text=chile_agg['joined_text']

In [None]:
texts=[tokenize(document,1) for document in text]

In [None]:
#making bigrams and trigrams
# Learn bigram phrases on the tokenised documents, then trigram phrases on the
# bigram-merged documents.
bigram = gensim.models.Phrases(texts, min_count=1, threshold=1)
trigram = gensim.models.Phrases(bigram[texts], threshold=1)

bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

texts_bigrams = [bigram_mod[doc] for doc in texts]
# The trigram model was trained on bigram-merged documents, so it must also be
# applied to bigram-merged documents (it was previously applied to the raw tokens).
texts_trigrams = [trigram_mod[doc] for doc in texts_bigrams]

In [None]:
texts=texts_trigrams

# Count corpus-wide token frequencies. The loop variable is `doc` (not `text`)
# so the notebook-level `text` Series from the earlier cell is not overwritten.
frequency = defaultdict(int)
for doc in texts:
    for token in doc:
        frequency[token] += 1

# drop tokens that occur only once in the whole corpus
texts = [
    [token for token in doc if frequency[token] > 1]
    for doc in texts
]

dictionary = corpora.Dictionary(texts)
print(dictionary)

dictionary.filter_extremes(no_below = 20, no_above=0.25)
# we are getting rid of things like "said" with the no_above threshold (depending on stemming more will be thrown out)
print(dictionary)

corpus = [dictionary.doc2bow(doc) for doc in texts]

In [None]:
def create_eta(priors, etadict, ntopics, base_value=0.0001, seed_value=500):
    """Build the topic-word prior matrix (``eta``) for a guided LDA model.

    Parameters
    ----------
    priors : dict
        Maps a seed word to the index of the topic it should be pushed towards.
    etadict : mapping
        Maps term index -> term (its ``items()`` must yield such pairs).
    ntopics : int
        Number of topics, i.e. rows of the returned matrix.
    base_value : float, optional
        Prior weight for every (topic, term) pair that is not seeded.
    seed_value : float, optional
        Prior weight for a seed word in its assigned topic.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(ntopics, len(etadict))``.
    """
    # create a (ntopics, nterms) matrix and fill with a low number
    eta = np.full(shape=(ntopics, len(etadict)), fill_value=base_value, dtype=float)

    # Build the term -> index lookup once instead of scanning the whole
    # dictionary for every seed word; setdefault keeps the first index of a term.
    term_to_index = {}
    for index, term in etadict.items():
        term_to_index.setdefault(term, index)

    for word, topic in priors.items():
        index = term_to_index.get(word)
        if index is not None:  # seed words missing from the dictionary are skipped
            eta[topic, index] = seed_value  # put a large number in there
    return eta

In [None]:
#Input for ETA
apriori_original = {
    "policia":0,"carabinero":0,"violencia":0,"humanos":0,"protesta":0,"detención":0,"desaparecido":0,"paco":0,"quemar":0,"fuego":0,"barricada":0,"INDH":0, "represion":0, "lacrimogeno":0,
                                                 "perdigones":0, "ojo":0, "ciego":0, "mutilado":0,"ddhh":0,"dd hh":0, 'pensión':1,'AFP':1,'vejez':1,'salud':1,'fonasa':1,'consultorio':1,'hospital':1,'clínica':1,'isapre':1,'dignidad':1,'educación':1,'profesor':1,'liceo':1,'universidad':1,'gratuita':1,
                                      'sociales':1,'vivienda':1,'campamentos':1, 'delincuencia':1,
                     'corrupción':2,'soborno':2,'vendido':2,'nepotismo':2,'deshonesto':2,'ladrón':2,'robar':2,'mentir':2,
                     'metro':3,'tren':3,'pasaje':3,'transporte':3,'evadir':3,'evasión':3,'micro':3,'transantiago':3,'Hutt':3,'alza':3,'tarifa':3
}

In [None]:
porter=SnowballStemmer("spanish")
chile_dict_adj={}
for key, value in apriori_original.items():
    a=porter.stem(key.lower())
    chile_dict_adj[a]= value

In [None]:
eta = create_eta(chile_dict_adj, dictionary, 10)

In [None]:
topicnumber=10

#alpha default is ’symmetric’: Uses a fixed symmetric prior of 1.0 / num_topics. In our case that is 0.1.

lda=models.LdaModel(corpus, id2word=dictionary, num_topics=topicnumber, 
                    random_state=42, eta=eta, alpha='symmetric')

lda.print_topics(topicnumber,20)

In [None]:
topics_chile = [lda.get_document_topics(item, minimum_probability=0.00000001) for item in corpus]

In [None]:
#note you could also use this vector = [lda[item] for item in corpus] but it would take into account the prior
vector = [lda[item] for item in corpus]

In [None]:
#this produces a list of tuples
# Print the guided model's document topics (topics_chile); the previously
# printed get_document_topics is left over from the unguided Chile model.
print(topics_chile[0:1])
print(vector[0:1])

In [None]:
# get_document_topics() can omit topics whose probability falls below
# minimum_probability, so look each topic up by id rather than by position.
topic_maps = [dict(doc_topics) for doc_topics in topics_chile]
for topic_id in range(0,10):
    chile_agg["topic_"+str(topic_id)]=[topic_map.get(topic_id, 0.0) for topic_map in topic_maps]

In [None]:
chile_agg=chile_agg.reset_index(level=['author_id', 'Time'])

In [None]:
chile_agg=chile_agg.merge(map_table[["author_id","Categorie","Left/Right","author.username"]],how="left",left_on="author_id",right_on="author_id")

**Human rights are a major topic, more so for the left than for the right; social issues feature heavily among politicians, with the right picking up.**

In [None]:
sns.set()

#define plotting region (1 row, 2 columns)
fig, axes = plt.subplots(1, 4)

#create boxplot in each subplot
p1=sns.barplot(data=chile_agg,x="Time",y="topic_0",order=["Before","During","After"],hue="Categorie",ax=axes[0])
p2=sns.barplot(data=chile_agg,x="Time",y="topic_1",order=["Before","During","After"],hue="Categorie",ax=axes[1])
p3=sns.barplot(data=chile_agg,x="Time",y="topic_2",order=["Before","During","After"],hue="Categorie",ax=axes[2])
p4=sns.barplot(data=chile_agg,x="Time",y="topic_3",order=["Before","During","After"],hue="Categorie",ax=axes[3])
p1.set_ylabel("Share Human Rights Violation")
p2.set_ylabel("Share Social Issues")
p3.set_ylabel("Share Corruption")
p4.set_ylabel("Share Protest Starter")
p1.set_xlabel("Human Rights Violation")
p2.set_xlabel("Social Issues")
p3.set_xlabel("Corruption")
p4.set_xlabel("Protest Starter")

In [None]:
chile_agg.groupby(["Categorie","Time"])["topic_0"].mean()

In [None]:
chile_agg.groupby(["Categorie","Time"])["topic_1"].mean()

In [None]:
chile_agg.groupby(["Categorie","Time"])["topic_2"].mean()

In [None]:
chile_agg.groupby(["Categorie","Time"])["topic_3"].mean()

In [None]:
chile_agg.groupby(["Left/Right","Time"])["topic_0"].mean()

In [None]:
chile_agg.groupby(["Left/Right","Time"])["topic_1"].mean()

In [None]:
chile_agg.groupby(["Left/Right","Time"])["topic_2"].mean()

In [None]:
chile_agg.groupby(["Left/Right","Time"])["topic_3"].mean()

**Colombia**

In [None]:
text=col_agg['joined_text']

In [None]:
texts=[tokenize(document,1) for document in text]

In [None]:
#making bigrams and trigrams
# Learn bigram phrases on the tokenised documents, then trigram phrases on the
# bigram-merged documents.
bigram = gensim.models.Phrases(texts, min_count=1, threshold=1)
trigram = gensim.models.Phrases(bigram[texts], threshold=1)

bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

texts_bigrams = [bigram_mod[doc] for doc in texts]
# The trigram model was trained on bigram-merged documents, so it must also be
# applied to bigram-merged documents (it was previously applied to the raw tokens).
texts_trigrams = [trigram_mod[doc] for doc in texts_bigrams]

In [None]:
texts=texts_trigrams

# Count corpus-wide token frequencies. The loop variable is `doc` (not `text`)
# so the notebook-level `text` Series from the earlier cell is not overwritten.
frequency = defaultdict(int)
for doc in texts:
    for token in doc:
        frequency[token] += 1

# drop tokens that occur only once in the whole corpus
texts = [
    [token for token in doc if frequency[token] > 1]
    for doc in texts
]

dictionary = corpora.Dictionary(texts)
print(dictionary)

dictionary.filter_extremes(no_below = 20, no_above=0.25)
# we are getting rid of things like "said" with the no_above threshold (depending on stemming more will be thrown out)
print(dictionary)

corpus = [dictionary.doc2bow(doc) for doc in texts]

In [None]:
def create_eta(priors, etadict, ntopics, base_value=0.0001, seed_value=500):
    """Build the topic-word prior matrix (``eta``) for a guided LDA model.

    Parameters
    ----------
    priors : dict
        Maps a seed word to the index of the topic it should be pushed towards.
    etadict : mapping
        Maps term index -> term (its ``items()`` must yield such pairs).
    ntopics : int
        Number of topics, i.e. rows of the returned matrix.
    base_value : float, optional
        Prior weight for every (topic, term) pair that is not seeded.
    seed_value : float, optional
        Prior weight for a seed word in its assigned topic.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(ntopics, len(etadict))``.
    """
    # create a (ntopics, nterms) matrix and fill with a low number
    eta = np.full(shape=(ntopics, len(etadict)), fill_value=base_value, dtype=float)

    # Build the term -> index lookup once instead of scanning the whole
    # dictionary for every seed word; setdefault keeps the first index of a term.
    term_to_index = {}
    for index, term in etadict.items():
        term_to_index.setdefault(term, index)

    for word, topic in priors.items():
        index = term_to_index.get(word)
        if index is not None:  # seed words missing from the dictionary are skipped
            eta[topic, index] = seed_value  # put a large number in there
    return eta

In [None]:
apriori_col={
    "policia":0,"esmad":0,"violencia":0,"humanos":0,"protesta":0,"detención":0,"desaparecido":0,"victima":0,"pistola":0,"bomba":0,"gases":0,"lacrimogeno":0, "represion":0, "manifestacion":0,  "violacion":0, "lesion":0, "sangre":0, "motin":0, "proyectiles":0, "tanqueta":0, "orden":0, "perdigones":0, "ojo":0, "ciego":0, "piedra":0, "antidisturbio":0, "disturbio":0,"arma":0, "trauma":0, "fuerza":0,'pension':1,'narcotrafico':1,'vejez':1,'salud':1,'drogas':1,'paramilitares':1,'paraco':1,'guerrilla':1,'farc':1,'educacion':1,'pobreza':1,'educacion':1, 'publica':1, 'gratuita':1,'sociales':1,'vivienda':1,'desigualdad':1, 'venezolanos':1, 'venezuela':1, 'conflicto':1, 'reclutamiento':1, 'desigualdad':1, 'alimentos':1, 'desempleo':1, 
                                         'delincuencia':1, 'crimen':1, 'bandas':1, 'terrorismo':1, 'robos':1, 'atracos':1, 'discriminacion':1, 'ilegal':1, 'populismo':1, 'transporte':1, 'trancon':1, 'movilidad':1, 'corrupcion':2, 'parapolitica':2,'vendido':2,'deshonesto':2,'ladrón':2,'robar':2,'mentir':2, 'recursos':2, 'contraloria':2, 'fiscalia':2, 'presupuesto':2, 'politicos':2, 'ratas':2, 
                                      'elecciones':2, 'escandalo':2, 'odebrecht':2, 'contratacion':2, 'mermelada':2, 'votos':2, 'sic':2, 'procuraduria':2, 'ñeñepolitica':2, 'carrusel':2, 'dinero':2,"reforma":3, "impuestos":3, "pobreza":3, "canasta":3, "precio":3, "carrasquilla":3, "ipc":3, "estrato":3,  "tributaria":3, "solaridad":3, "sostenible":3, "ley":3, 
                                           "desempleo":3, "clase":3, "recaudo":3, "deuda":3, "renta":3, "iva":3, "servicios":3, "patrimonio":3, "finanzas":3, "inversion":3, "exenciones":3, "economia":3, "ahorro":3, "recursos":3
}

In [None]:
porter=SnowballStemmer("spanish")
col_dict_adj={}
for key, value in apriori_col.items():
    a=porter.stem(key.lower())
    col_dict_adj[a]= value

In [None]:
eta = create_eta(col_dict_adj, dictionary, 10)

In [None]:
topicnumber=10

#alpha default is ’symmetric’: Uses a fixed symmetric prior of 1.0 / num_topics. In our case that is 0.1.

lda=models.LdaModel(corpus, id2word=dictionary, num_topics=topicnumber, 
                    random_state=42, eta=eta, alpha='symmetric')

lda.print_topics(topicnumber,20)

In [None]:
topics_col = [lda.get_document_topics(item, minimum_probability=0.00000001) for item in corpus]

In [None]:
# get_document_topics() can omit topics whose probability falls below
# minimum_probability, so look each topic up by id rather than by position.
topic_maps = [dict(doc_topics) for doc_topics in topics_col]
for topic_id in range(0,10):
    col_agg["topic_"+str(topic_id)]=[topic_map.get(topic_id, 0.0) for topic_map in topic_maps]

In [None]:
col_agg=col_agg.reset_index(level=['author_id', 'Time'])

In [None]:
col_agg=col_agg.merge(map_table[["author_id","Categorie","Left/Right","author.username"]],how="left",left_on="author_id",right_on="author_id")

In [None]:
sns.set()

#define plotting region (1 row, 2 columns)
fig, axes = plt.subplots(1, 4)

#create boxplot in each subplot
p1=sns.barplot(data=col_agg,x="Time",y="topic_0",order=["Before","During","After"],hue="Categorie",ax=axes[0])
p2=sns.barplot(data=col_agg,x="Time",y="topic_1",order=["Before","During","After"],hue="Categorie",ax=axes[1])
p3=sns.barplot(data=col_agg,x="Time",y="topic_2",order=["Before","During","After"],hue="Categorie",ax=axes[2])
p4=sns.barplot(data=col_agg,x="Time",y="topic_3",order=["Before","During","After"],hue="Categorie",ax=axes[3])
p1.set_ylabel("Share Human Rights Violation")
p2.set_ylabel("Share Social Issues")
p3.set_ylabel("Share Corruption")
p4.set_ylabel("Share Protest Starter")
p1.set_xlabel("Human Rights Violation")
p2.set_xlabel("Social Issues")
p3.set_xlabel("Corruption")
p4.set_xlabel("Protest Starter")

In [None]:
col_agg.groupby(["Categorie","Time"])["topic_0"].mean()

In [None]:
col_agg.groupby(["Categorie","Time"])["topic_1"].mean()

In [None]:
col_agg.groupby(["Categorie","Time"])["topic_2"].mean()

In [None]:
col_agg.groupby(["Categorie","Time"])["topic_3"].mean()

In [None]:
col_agg.groupby(["Left/Right","Time"])["topic_0"].mean()

In [None]:
col_agg.groupby(["Left/Right","Time"])["topic_1"].mean()

In [None]:
col_agg.groupby(["Left/Right","Time"])["topic_2"].mean()

In [None]:
col_agg.groupby(["Left/Right","Time"])["topic_3"].mean()

**Protest starters are a big topic; corruption is less important during the protests, as are social issues; human rights became important.**

## Dictionary

In [None]:
from collections import Counter
import re

In [None]:
#Chile
# Keyword dictionaries for the four topics. They are stemmed (and lower-cased) by stem_list
# further down before being matched against the tweets.
Human_rights = [
    "policia", "carabinero", "violencia", "humanos", "protesta", "detención", "desaparecido",
    "paco", "quemar", "fuego", "barricada", "INDH", "represion", "lacrimogeno",
    "perdigones", "ojo", "ciego", "mutilado", "ddhh", "dd hh",
]
Social_issues = [
    'pensión', 'AFP', 'vejez', 'salud', 'fonasa', 'consultorio', 'hospital', 'clínica',
    'isapre', 'dignidad', 'educación', 'profesor', 'liceo', 'universidad', 'gratuita',
    'sociales', 'vivienda', 'campamentos', 'delincuencia',
]
Corruption = [
    'corrupción', 'soborno', 'vendido', 'nepotismo', 'deshonesto', 'ladrón', 'robar', 'mentir',
]
Protest_starter = [
    'metro', 'tren', 'pasaje', 'transporte', 'evadir', 'evasión', 'micro', 'transantiago',
    'Hutt', 'alza', 'tarifa',
]

In [None]:
def stem_list(input_list):
    """Return the Spanish Snowball stem of every entry of input_list.

    Each entry is lower-cased before stemming. The result is a new list in the
    same order as the input; input_list itself is not modified.
    """
    stemmer = SnowballStemmer("spanish")
    return [stemmer.stem(word.lower()) for word in input_list]

In [None]:
# Replace every keyword list by its stemmed version; dict_marker matches them against the
# stem_text column.
# NOTE: re-running this cell stems the already stemmed lists a second time.
Human_rights, Social_issues, Corruption, Protest_starter = (
    stem_list(words)
    for words in (Human_rights, Social_issues, Corruption, Protest_starter)
)

In [None]:
# Output column names and the keyword lists they belong to, kept in the same order so that
# names[k] is the column for topic_list[k].
# topic_list stores the list objects bound to these names right now; rebinding the names later
# does not change it.
names = ["Human_rights", "Social_issues", "Corruption", "Protest_starter"]
topic_list = [Human_rights, Social_issues, Corruption, Protest_starter]

In [None]:
def dict_marker(df,dict_list,name_str):
    """Flag the rows of df whose stemmed text contains at least one dictionary word.

    Sets (or overwrites) the column ``name_str`` on ``df`` in place: 1 if any entry
    of ``dict_list`` occurs as a whole word in the lower-cased ``stem_text`` of the
    row, 0 otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``stem_text`` column; values are converted with ``str()``.
    dict_list : iterable of str
        Keywords, matched literally (regex metacharacters are escaped) and only
        at word boundaries. The text is lower-cased, the keywords are not.
    name_str : str
        Name of the 0/1 indicator column to write.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    words = list(dict_list)
    if not words:
        # Joining zero words gives the empty pattern, which matches every text and
        # would flag every row; an empty dictionary must flag none.
        df[name_str] = np.zeros(len(df), dtype=int)
        return df

    # Compile once: the pattern does not depend on the row.
    pattern = re.compile("|".join(r"\b%s\b" % re.escape(w) for w in words))
    # Only presence matters for the 0/1 flag, so a single search per row is enough.
    hits = [pattern.search(str(s).lower()) is not None for s in df["stem_text"]]
    df[name_str] = np.where(np.array(hits, dtype=bool), 1, 0)
    return df

In [None]:
#Chile
# Add one 0/1 indicator column per topic dictionary to `chile`.
for topic_words, column_name in zip(topic_list, names):
    chile = dict_marker(chile, topic_words, column_name)

In [None]:
# Tag `chile` with each keyword list, writing the 0/1 flags into the column of the same name.
for column_name, topic_words in (
    ("Human_rights", Human_rights),
    ("Social_issues", Social_issues),
    ("Corruption", Corruption),
    ("Protest_starter", Protest_starter),
):
    chile = dict_marker(chile, topic_words, column_name)

In [None]:
#Colombia
# Keyword dictionaries for the four topics. These rebind the names used for Chile above.
# Fixed: "solaridad" -> "solidaridad", and the repeated 'educacion' / 'desigualdad' entries
# in Social_issues are listed once.
Human_rights=["policia","esmad","violencia","humanos","protesta","detención","desaparecido","victima","pistola","bomba","gases","lacrimogeno", "represion", "manifestacion",
              "violacion", "lesion", "sangre", "motin", "proyectiles", "tanqueta", "orden", "perdigones", "ojo", "ciego", "piedra", "antidisturbio", "disturbio",
              "arma", "trauma", "fuerza"]
Social_issues=['pension','narcotrafico','vejez','salud','drogas','paramilitares','paraco','guerrilla','farc','educacion','pobreza', 'publica',
               'gratuita','sociales','vivienda','desigualdad', 'venezolanos', 'venezuela', 'conflicto', 'reclutamiento', 'alimentos', 'desempleo',
               'delincuencia', 'crimen', 'bandas', 'terrorismo', 'robos', 'atracos', 'discriminacion', 'ilegal', 'populismo', 'transporte', 'trancon', 'movilidad']

Corruption=['corrupcion', 'parapolitica','vendido','deshonesto','ladrón','robar','mentir', 'recursos', 'contraloria', 'fiscalia', 'presupuesto', 'politicos', 'ratas',
            'elecciones', 'escandalo', 'odebrecht', 'contratacion', 'mermelada', 'votos', 'sic', 'procuraduria', 'ñeñepolitica', 'carrusel', 'dinero']

Protest_starter=["reforma", "impuestos", "pobreza", "canasta", "precio", "carrasquilla", "ipc", "estrato",  "tributaria", "solidaridad", "sostenible", "ley",
                 "desempleo", "clase", "recaudo", "deuda", "renta", "iva", "servicios", "patrimonio", "finanzas", "inversion", "exenciones", "economia", "ahorro",
                 "recursos"]

In [None]:
#Colombia
# topic_list still holds the stemmed CHILEAN keyword lists: it stores the list objects that
# existed when it was built, so rebinding Human_rights etc. to the Colombian keywords did not
# change it. Build a separate list from the Colombian keywords - stemmed, as was done for
# Chile - so that `colombia` is tagged with its own dictionaries.
topic_list_colombia=[stem_list(words) for words in (Human_rights,Social_issues,Corruption,Protest_starter)]
for i in range(0,len(topic_list_colombia)):
    colombia=dict_marker(colombia,topic_list_colombia[i],names[i])

In [None]:
# Share of rows flagged Human_rights per "Left/Right" and "Time" group (mean of the 0/1 flag).
colombia.groupby(by=["Left/Right", "Time"])["Human_rights"].mean()

**Chile**

In [None]:
# Daily share of rows in `chile` flagged by the Human_rights dictionary.
# The flag is 0/1, so its daily mean is a fraction; it is scaled by 100 so the values agree
# with the "(in %)" unit of the y-axis label.
fig, axs = plt.subplots(figsize=(20, 4))

chile.groupby("days_since")["Human_rights"].mean().mul(100).plot(kind='line', rot=0, ax=axs)

axs.set_xlabel("Time in Days (0=Protest Start)")
axs.set_ylabel("Share Human Rights Tweets (in %)")

In [None]:
# Daily share of rows in `chile` flagged by the Social_issues dictionary.
# The flag is 0/1, so its daily mean is a fraction; it is scaled by 100 so the values agree
# with the "(in %)" unit of the y-axis label.
fig, axs = plt.subplots(figsize=(20, 4))

chile.groupby("days_since")["Social_issues"].mean().mul(100).plot(kind='line', rot=0, ax=axs)

axs.set_xlabel("Time in Days (0=Protest Start)")
axs.set_ylabel("Share Social Issues Tweets (in %)")

In [None]:
# Daily share of rows in `chile` flagged by the Corruption dictionary.
# The flag is 0/1, so its daily mean is a fraction; it is scaled by 100 so the values agree
# with the "(in %)" unit of the y-axis label.
fig, axs = plt.subplots(figsize=(20, 4))

chile.groupby("days_since")["Corruption"].mean().mul(100).plot(kind='line', rot=0, ax=axs)

axs.set_xlabel("Time in Days (0=Protest Start)")
axs.set_ylabel("Share Corruption Tweets (in %)")

In [None]:
# Daily share of rows in `chile` flagged by the Protest_starter dictionary.
# The flag is 0/1, so its daily mean is a fraction; it is scaled by 100 so the values agree
# with the "(in %)" unit of the y-axis label.
fig, axs = plt.subplots(figsize=(20, 4))

chile.groupby("days_since")["Protest_starter"].mean().mul(100).plot(kind='line', rot=0, ax=axs)

axs.set_xlabel("Time in Days (0=Protest Start)")
axs.set_ylabel("Share Protest Starter Tweets (in %)")

**Colombia**

In [None]:
# Daily share of rows in `colombia` flagged by the Human_rights dictionary.
# The flag is 0/1, so its daily mean is a fraction; it is scaled by 100 so the values agree
# with the "(in %)" unit of the y-axis label.
fig, axs = plt.subplots(figsize=(20, 4))

colombia.groupby("days_since")["Human_rights"].mean().mul(100).plot(kind='line', rot=0, ax=axs)

axs.set_xlabel("Time in Days (0=Protest Start)")
axs.set_ylabel("Share Human Rights Tweets (in %)")

In [None]:
# Daily share of rows in `colombia` flagged by the Social_issues dictionary.
# The flag is 0/1, so its daily mean is a fraction; it is scaled by 100 so the values agree
# with the "(in %)" unit of the y-axis label.
fig, axs = plt.subplots(figsize=(20, 4))

colombia.groupby("days_since")["Social_issues"].mean().mul(100).plot(kind='line', rot=0, ax=axs)

axs.set_xlabel("Time in Days (0=Protest Start)")
axs.set_ylabel("Share Social Issues Tweets (in %)")

In [None]:
# Daily share of rows in `colombia` flagged by the Corruption dictionary.
# The flag is 0/1, so its daily mean is a fraction; it is scaled by 100 so the values agree
# with the "(in %)" unit of the y-axis label.
fig, axs = plt.subplots(figsize=(20, 4))

colombia.groupby("days_since")["Corruption"].mean().mul(100).plot(kind='line', rot=0, ax=axs)

axs.set_xlabel("Time in Days (0=Protest Start)")
axs.set_ylabel("Share Corruption Tweets (in %)")

In [None]:
# Daily share of rows in `colombia` flagged by the Protest_starter dictionary.
# The flag is 0/1, so its daily mean is a fraction; it is scaled by 100 so the values agree
# with the "(in %)" unit of the y-axis label.
fig, axs = plt.subplots(figsize=(20, 4))

colombia.groupby("days_since")["Protest_starter"].mean().mul(100).plot(kind='line', rot=0, ax=axs)

axs.set_xlabel("Time in Days (0=Protest Start)")
axs.set_ylabel("Share Protest Starter Tweets (in %)")

**Seaborn**

In [None]:
# Human_rights flag in `chile` over days_since, one line per Categorie.
fig, ax = plt.subplots(figsize=(10, 6))
p1 = sns.lineplot(data=chile, x="days_since", y="Human_rights", hue="Categorie", ax=ax)
p1.set_xlabel("Timeline in Days (0: Start Protest)")
p1.set_title("Share of Human Rights Tweets")
p1.set_ylabel("Share of Human Rights Tweets")

In [None]:
# Social_issues flag in `chile` over days_since, one line per Categorie.
fig, ax = plt.subplots(figsize=(10, 6))
p1 = sns.lineplot(data=chile, x="days_since", y="Social_issues", hue="Categorie", ax=ax)
p1.set_xlabel("Timeline in Days (0: Start Protest)")
p1.set_title("Share of Social Issues Tweets")
p1.set_ylabel("Share of Social Issues Tweets")

In [None]:
# Corruption flag in `chile` over days_since, one line per Categorie.
fig, ax = plt.subplots(figsize=(10, 6))
p1 = sns.lineplot(data=chile, x="days_since", y="Corruption", hue="Categorie", ax=ax)
p1.set_xlabel("Timeline in Days (0: Start Protest)")
p1.set_title("Share of Corruption Tweets")
p1.set_ylabel("Share of Corruption Tweets")

In [None]:
# Protest_starter flag in `chile` over days_since, one line per Categorie.
fig, ax = plt.subplots(figsize=(10, 6))
p1 = sns.lineplot(data=chile, x="days_since", y="Protest_starter", hue="Categorie", ax=ax)
p1.set_xlabel("Timeline in Days (0: Start Protest)")
p1.set_title("Share of Protest Starter Tweets")
p1.set_ylabel("Share of Protest Starter Tweets")

In [None]:
# Human_rights flag in `colombia` over days_since, one line per Categorie.
fig, ax = plt.subplots(figsize=(10, 6))
p1 = sns.lineplot(data=colombia, x="days_since", y="Human_rights", hue="Categorie", ax=ax)
p1.set_xlabel("Timeline in Days (0: Start Protest)")
p1.set_title("Share of Human Rights Tweets")
p1.set_ylabel("Share of Human Rights Tweets")

In [None]:
# Social_issues flag in `colombia` over days_since, one line per Categorie.
fig, ax = plt.subplots(figsize=(10, 6))
p1 = sns.lineplot(data=colombia, x="days_since", y="Social_issues", hue="Categorie", ax=ax)
p1.set_xlabel("Timeline in Days (0: Start Protest)")
p1.set_title("Share of Social Issues Tweets")
p1.set_ylabel("Share of Social Issues Tweets")

In [None]:
# Social_issues flag in `colombia` over days_since, one line per Left/Right value.
fig, ax = plt.subplots(figsize=(10, 6))
p1 = sns.lineplot(data=colombia, x="days_since", y="Social_issues", hue="Left/Right", ax=ax)
p1.set_xlabel("Timeline in Days (0: Start Protest)")
p1.set_title("Share of Social Issues Tweets")
p1.set_ylabel("Share of Social Issues Tweets")

In [None]:
# Mean sentiment of the rows flagged Social_issues, per "Left/Right" and "Time" group.
social_rows = colombia["Social_issues"] == 1
colombia[social_rows].groupby(by=["Left/Right", "Time"])["sentiment"].mean()

In [None]:
# Corruption flag in `colombia` over days_since, one line per Categorie.
fig, ax = plt.subplots(figsize=(10, 6))
p1 = sns.lineplot(data=colombia, x="days_since", y="Corruption", hue="Categorie", ax=ax)
p1.set_xlabel("Timeline in Days (0: Start Protest)")
p1.set_title("Share of Corruption Tweets")
p1.set_ylabel("Share of Corruption Tweets")

In [None]:
# Protest_starter flag in `colombia` over days_since, one line per Categorie.
fig, ax = plt.subplots(figsize=(10, 6))
p1 = sns.lineplot(data=colombia, x="days_since", y="Protest_starter", hue="Categorie", ax=ax)
p1.set_xlabel("Timeline in Days (0: Start Protest)")
p1.set_title("Share of Protest Starter Tweets")
p1.set_ylabel("Share of Protest Starter Tweets")

## Unused

In [None]:
# Colombian keyword dictionaries, keyed by topic (same keys as chilean_dictionaries).
# Fixed: the key 'Protest _Starter' contained a stray space, "solaridad" was a typo for
# "solidaridad", and 'educacion' / 'desigualdad' were listed twice under 'Social_Issues'.
colombian_dictionaries={'Violence_HumanRights':["policia","esmad","violencia","humanos","protesta","detención","desaparecido","victima","pistola","bomba","gases","lacrimogeno", "represion", "manifestacion",
                                                   "violacion", "lesion", "sangre", "motin", "proyectiles", "tanqueta", "orden", "perdigones", "ojo", "ciego", "piedra", "antidisturbio", "disturbio",
                                                   "arma", "trauma", "fuerza"],
                        'Social_Issues':['pension','narcotrafico','vejez','salud','drogas','paramilitares','paraco','guerrilla','farc','educacion','pobreza', 'publica',
                                         'gratuita','sociales','vivienda','desigualdad', 'venezolanos', 'venezuela', 'conflicto', 'reclutamiento', 'alimentos', 'desempleo',
                                         'delincuencia', 'crimen', 'bandas', 'terrorismo', 'robos', 'atracos', 'discriminacion', 'ilegal', 'populismo', 'transporte', 'trancon', 'movilidad'],

                        'Corruption':['corrupcion', 'parapolitica','vendido','deshonesto','ladrón','robar','mentir', 'recursos', 'contraloria', 'fiscalia', 'presupuesto', 'politicos', 'ratas',
                                      'elecciones', 'escandalo', 'odebrecht', 'contratacion', 'mermelada', 'votos', 'sic', 'procuraduria', 'ñeñepolitica', 'carrusel', 'dinero'],

                        'Protest_Starter':["reforma", "impuestos", "pobreza", "canasta", "precio", "carrasquilla", "ipc", "estrato",  "tributaria", "solidaridad", "sostenible", "ley",
                                           "desempleo", "clase", "recaudo", "deuda", "renta", "iva", "servicios", "patrimonio", "finanzas", "inversion", "exenciones", "economia", "ahorro",
                                            "recursos"] }

In [None]:
# Chilean keyword dictionaries, keyed by topic.
chilean_dictionaries = {
    'Violence_HumanRights': [
        "policia", "carabinero", "violencia", "humanos", "protesta", "detención", "desaparecido",
        "paco", "quemar", "fuego", "barricada", "INDH", "represion", "lacrimogeno",
        "perdigones", "ojo", "ciego", "mutilado", "ddhh", "dd hh",
    ],
    'Social_Issues': [
        'pensión', 'AFP', 'vejez', 'salud', 'fonasa', 'consultorio', 'hospital', 'clínica',
        'isapre', 'dignidad', 'educación', 'profesor', 'liceo', 'universidad', 'gratuita',
        'sociales', 'vivienda', 'campamentos', 'delincuencia',
    ],
    'Corruption': [
        'corrupción', 'soborno', 'vendido', 'nepotismo', 'deshonesto', 'ladrón', 'robar', 'mentir',
    ],
    'Protest_Starter': [
        'metro', 'tren', 'pasaje', 'transporte', 'evadir', 'evasión', 'micro', 'transantiago',
        'Hutt', 'alza', 'tarifa',
    ],
}

In [None]:
# Number of rows of `df` per Categorie value.
df["Categorie"].value_counts()

In [None]:
#getting emotion score for each term
import text2emotion as te
import nltk
nltk.download('omw-1.4')

# One te.get_emotion result per row of ch_pol_agg (computed on its joined_text), collected in
# a list and turned into a DataFrame once. The previous row-by-row DataFrame.append copied the
# whole frame on every iteration and no longer exists in pandas 2.0.
emotions = [te.get_emotion(joined) for joined in ch_pol_agg["joined_text"]]
output = pd.DataFrame(emotions)

In [None]:
# Define matrix,currently not needed!
from sklearn.feature_extraction.text import CountVectorizer
# Term-count matrix of `text`, built with a vocabulary limited by max_df=0.6, min_df=20 and
# max_features=10000.
tf_vect = CountVectorizer(max_df=0.6, min_df=20, max_features=10000)
tf = tf_vect.fit_transform(text)