In [581]:
import os
import re
import pandas as pd
from deep_translator import (GoogleTranslator,
                             PonsTranslator,
                             LingueeTranslator,
                             MyMemoryTranslator)
import string 
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from textblob import TextBlob
import plotly.express as px

## Importing Data

In [422]:
os.getcwd()

'/home/reichardtma/code/reichardtma/prestweets/notebooks'

In [423]:
csv_path = os.path.join("..","raw_data")
csv_path

'../raw_data'

## Processing Data

In [424]:
def remove_unnnamed(data):
    data = data.drop(columns=['Unnamed: 0'])
    return data

In [425]:
def remove_emojis(df):
    #df = df.astype(str).apply(lambda x: x.str.encode('ascii', 'ignore').str.decode('ascii'))
    if "Translation" in df:
        res = df[df['Translation'].apply(lambda x: not isinstance(x, float))]
    else:
        res = df[df['content'].apply(lambda x: not isinstance(x, float))]
    return res

In [458]:
def remove_emtpy_translation(df):
    if "Translation" in df:
        df = df.dropna(subset=['Translation'])
    else:
        df = df.dropna(subset=['content'])
    return df

In [474]:
pres = {}

In [630]:
johnson = pd.read_csv(os.path.join(csv_path, 'borisjohnson_tweets.csv'))
pres['johnson'] = johnson

In [631]:
cyril = pd.read_csv(os.path.join(csv_path, 'CyrilRamaphosa_tweets.csv'))
pres['cyril'] = cyril

In [632]:
macron = pd.read_csv(os.path.join(csv_path, 'EmmanuelMacron_tweets.csv'))
pres['macron'] = macron

In [633]:
jair = pd.read_csv(os.path.join(csv_path, 'jairbolsonaro_tweets.csv'))
pres['jair'] = jair

In [634]:
joko = pd.read_csv(os.path.join(csv_path, 'jokowi_tweets.csv'))
pres['joko'] = joko

In [None]:
#Maybe concetanating the individual dataframes together?

#names= ['johnson', 'cyril', 'macron', 'jair', 'joko']
#datas = [johnson, cyril, macron, jair, joko]
#for name in names:
#    for data in datas:
#        data.insert(0, 'name', name).drop(columns=['Unnamed: 0'])

In [480]:
tweets = {}
for name, data in pres.items():
    data = remove_unnnamed(data)
    data = remove_emojis(data)
    data = remove_emtpy_translation(data)
    tweets[name] = data

In [482]:
def remove_url(s):
    try:
        s = re.sub(r'http\S+', '', s)
        return s
    except:
        return s

In [483]:
def remove_punctuation(s):
    s = ''.join([i for i in s if i not in string.punctuation])
    return s

In [484]:
def lower_case(s):
    s = s.lower()
    return s

In [485]:
def remove_numbers(s):
    s = ''.join(word for word in s if not word.isdigit())
    return s

In [486]:
def lemmatize(s):
    lemmatizer = WordNetLemmatizer()
    s = ''.join(lemmatizer.lemmatize(word) for word in s)
    return s

In [487]:
def stop_words(s):
    stop_words = set(stopwords.words('english')) 
    word_tokens = word_tokenize(s) 
    s = ' '.join(w for w in word_tokens if not w in stop_words)
    return s

In [489]:
#Cleaning the strings
for name, data in tweets.items():
    if "Translation" in data:
        data['clean_content'] = data['Translation'].apply(remove_url).apply(remove_punctuation).\
        apply(lower_case).apply(remove_numbers).apply(stop_words).apply(lemmatize)
    else:
        data['clean_content'] = data['content'].apply(remove_url).apply(remove_punctuation).\
        apply(lower_case).apply(remove_numbers).apply(stop_words).apply(lemmatize)
    tweets[name] = data

## Textblob Sentiment Analysis

In [500]:
def polarity(s):
    pol = TextBlob(s)
    pol = pol.sentiment[0]
    return pol

def objectivity(s):
    obj = TextBlob(s)
    obj = obj.sentiment[1]
    return obj

In [501]:
sentiment = {}
for person, data in tweets.items():
    data['polarity'] = data['clean_content'].apply(polarity)
    data['objectivity'] = data['clean_content'].apply(objectivity)
    sentiment[person] = data

In [578]:
sentiment_df = {'polarity': [list(sentiment.values())[i]['polarity'].mean() for i in range(0,len(sentiment))],\
                'objectivity': [list(sentiment.values())[i]['objectivity'].mean() for i in range(0,len(sentiment))]}

sentiment_df = pd.DataFrame(data=sentiment_df, index=sentiment.keys())
sentiment_df

Unnamed: 0,polarity,objectivity
johnson,0.175356,0.432986
cyril,0.104043,0.344474
macron,0.084495,0.266946
jair,0.071982,0.25059
joko,0.138427,0.381524


In [584]:
fig = px.scatter(sentiment_df, x="polarity", y="objectivity", color=sentiment_df.index)
fig.show()

## LDA

In [669]:
def lda_model(tweet_dict):
    for i in range(0, len(tweet_dict)):
        for presname, dataframe in list(tweet_dict.items())[i]:
            vectorizer = TfidfVectorizer().fit(dataframe['clean_content'])
            data_vectorized = vectorizer.transform(dataframe['clean_content'])
            model = LatentDirichletAllocation(n_components=3).fit(data_vectorized)
    return model

In [670]:
def print_topics(model, vectorizer):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-10 - 1:-1]])

In [671]:
datasets = tweets.items()
for dataset in datasets:
    lda_model = lda_model(dataset)
    print_topics(lda_model, vectorizer)

AttributeError: 'tuple' object has no attribute 'items'

In [668]:
list(tweets.items())[0]

('johnson',
                        id                                                url  \
 0     1506711585468911621  https://twitter.com/BorisJohnson/status/150671...   
 1     1506684769559621633  https://twitter.com/BorisJohnson/status/150668...   
 2     1506675316273692678  https://twitter.com/BorisJohnson/status/150667...   
 3     1506647014209200130  https://twitter.com/BorisJohnson/status/150664...   
 4     1506588800868392962  https://twitter.com/BorisJohnson/status/150658...   
 ...                   ...                                                ...   
 4327   583712249975439360  https://twitter.com/BorisJohnson/status/583712...   
 4328   583710080522002432  https://twitter.com/BorisJohnson/status/583710...   
 4329   583565249111920640  https://twitter.com/BorisJohnson/status/583565...   
 4330   583376627289075712  https://twitter.com/BorisJohnson/status/583376...   
 4331   583374543902941184  https://twitter.com/BorisJohnson/status/583374...   
 
              