In [92]:
import os
import pandas as pd
from deep_translator import (GoogleTranslator,
                             PonsTranslator,
                             LingueeTranslator,
                             MyMemoryTranslator)
import string 
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from textblob import TextBlob

## Downloading Data

In [117]:
usernames = ["jokowi"]

In [118]:
def save_tweets(username):
    data = None

    os.system(
            f'snscrape --jsonl --progress --max-results 500 twitter-user {username} > twitter-@{username}-small.json')

    tweets_df = pd.read_json(f'twitter-@{username}-small.json', lines=True)
    df = tweets_df[["id", "url", "date", "content",
                        "hashtags", "cashtags", "media", "lang"]]
    if data is None:
        data = df
    else:
        data = data.append(df)
        
    #Translate the tweets
    if df.lang[0] != "en" and df.lang[17] != "en":
        df['translation'] = df.apply(lambda row: GoogleTranslator(source='auto', target='english').translate(text=f'{row.content}'), axis=1)
    
    #Save as csv file
    df.to_csv(f"{username}_tweets-small.csv")

In [119]:
%%time
for username in usernames:
    save_tweets(username)

Scraping, 100 results so far
Scraping, 200 results so far
Scraping, 300 results so far
Scraping, 400 results so far
Scraping, 500 results so far
Stopped scraping after 500 results due to --max-results


CPU times: user 11.1 s, sys: 489 ms, total: 11.6 s
Wall time: 1min 46s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['translation'] = df.apply(lambda row: GoogleTranslator(source='auto', target='english').translate(text=f'{row.content}'), axis=1)


## Processing Data

In [144]:
data = pd.read_csv("EmmanuelMacron_tweets.csv")
data = data.drop(columns=['Unnamed: 0'])
data.head(2)

Unnamed: 0,id,url,date,content,hashtags,cashtags,media,lang,Translation
0,1505647680197996546,https://twitter.com/EmmanuelMacron/status/1505...,2022-03-20 20:49:15+00:00,"Face au terrorisme, nous avons résisté. Et nou...",,,"[{'_type': 'snscrape.modules.twitter.Video', '...",fr,"Faced with terrorism, we resisted. And we will..."
1,1505625501141975043,https://twitter.com/EmmanuelMacron/status/1505...,2022-03-20 19:21:07+00:00,"Imad Ibn Ziaten, 30 ans.\nAbel Chennouf, 25 an...",,,"[{'_type': 'snscrape.modules.twitter.Photo', '...",fr,"Imad Ibn Ziaten, 30 years old.\nAbel Chennouf,..."


In [145]:
def remove_punctuation(s):
    s = ''.join([i for i in s if i not in string.punctuation])
    return s

In [146]:
def lower_case(s):
    s = s.lower()
    return s

In [147]:
def remove_numbers(s):
    s = ''.join(word for word in s if not word.isdigit())
    return s

In [148]:
def lemmatize(s):
    lemmatizer = WordNetLemmatizer()
    s = ''.join(lemmatizer.lemmatize(word) for word in s)
    return s

In [149]:
def stop_words(s):
    stop_words = set(stopwords.words('english')) 
    word_tokens = word_tokenize(s) 
    s = ' '.join(w for w in word_tokens if not w in stop_words)
    return s

In [151]:
#Cleaning the strings
data['clean_translation'] = data['Translation'].apply(lemmatize).\
apply(lower_case).apply(remove_numbers).apply(remove_punctuation).apply(stop_words)
data.head(3)

Unnamed: 0,id,url,date,content,hashtags,cashtags,media,lang,Translation,clean_translation
0,1505647680197996546,https://twitter.com/EmmanuelMacron/status/1505...,2022-03-20 20:49:15+00:00,"Face au terrorisme, nous avons résisté. Et nou...",,,"[{'_type': 'snscrape.modules.twitter.Video', '...",fr,"Faced with terrorism, we resisted. And we will...",faced terrorism resisted continue httpstcoxnef...
1,1505625501141975043,https://twitter.com/EmmanuelMacron/status/1505...,2022-03-20 19:21:07+00:00,"Imad Ibn Ziaten, 30 ans.\nAbel Chennouf, 25 an...",,,"[{'_type': 'snscrape.modules.twitter.Photo', '...",fr,"Imad Ibn Ziaten, 30 years old.\nAbel Chennouf,...",imad ibn ziaten years old abel chennouf years ...
2,1505302971248267267,https://twitter.com/EmmanuelMacron/status/1505...,2022-03-19 21:59:30+00:00,"Après 12 ans d’attente, nos Bleus réalisent l’...",,,,fr,"After 12 years of waiting, our Blues achieve t...",years waiting blues achieve grand slam feat en...


## Textblob Sentiment Analysis

In [152]:
def polarity(s):
    pol = TextBlob(s)
    pol = pol.sentiment[0]
    return pol

def objectivity(s):
    obj = TextBlob(s)
    obj = obj.sentiment[1]
    return obj

In [153]:
data['polarity'] = data['clean_translation'].apply(polarity)
data['objectivity'] = data['clean_translation'].apply(objectivity)
data.head(1)

Unnamed: 0,id,url,date,content,hashtags,cashtags,media,lang,Translation,clean_translation,polarity,objectivity
0,1505647680197996546,https://twitter.com/EmmanuelMacron/status/1505...,2022-03-20 20:49:15+00:00,"Face au terrorisme, nous avons résisté. Et nou...",,,"[{'_type': 'snscrape.modules.twitter.Video', '...",fr,"Faced with terrorism, we resisted. And we will...",faced terrorism resisted continue httpstcoxnef...,0.0,0.0


In [154]:
data['polarity'].mean()

0.08449501490081364

In [155]:
data['objectivity'].mean()

0.2669460345232418

## LDA

In [156]:
vectorizer = TfidfVectorizer().fit(data['clean_translation'])

data_vectorized = vectorizer.transform(data['clean_translation'])

lda_model = LatentDirichletAllocation(n_components=4).fit(data_vectorized)

In [157]:
def print_topics(model, vectorizer):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-10 - 1:-1]])
        

print_topics(lda_model, vectorizer)

Topic 0:
[('france', 61.31325575453051), ('want', 45.83554721322677), ('french', 44.87398001218307), ('people', 44.64390815982471), ('country', 30.349194475167817), ('support', 29.23265658338248), ('one', 28.647177890793284), ('us', 27.36923026685369), ('new', 26.875999587910158), ('victims', 24.943364957538634)]
Topic 1:




[('must', 45.582160094684724), ('live', 39.73838772432917), ('france', 38.99298513669721), ('european', 38.08377463282113), ('people', 33.07472456868828), ('europe', 31.458727386373436), ('fight', 27.020493431794996), ('french', 22.553059906154175), ('continue', 22.187410508712727), ('terrorism', 22.129159144261585)]
Topic 2:
[('france', 28.308540570670377), ('live', 22.84085083958959), ('project', 19.68820339848196), ('french', 19.469753436686542), ('europe', 17.04621068913334), ('want', 16.059883833147296), ('must', 14.164687540586979), ('time', 12.708293905963291), ('today', 12.333275657038367), ('year', 12.226481500034208)]
Topic 3:
[('europe', 79.16311328123206), ('france', 62.92259962289624), ('us', 52.92080338243677), ('together', 51.66617192337578), ('must', 48.58572407944742), ('european', 48.25548054449619), ('want', 38.226274028669), ('lets', 37.37109791983955), ('french', 34.17537905583736), ('new', 32.420372446209775)]
