In [3]:
import os
import pandas as pd
from deep_translator import (GoogleTranslator,
                             PonsTranslator,
                             LingueeTranslator,
                             MyMemoryTranslator)
import string 
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from textblob import TextBlob

## Downloading Data

In [4]:
# Account names to scrape; each entry is passed to save_tweets() by the loop below.
usernames = ["jokowi"]

In [5]:
def save_tweets(username):
    """Scrape a user's recent tweets, add an English translation and save as CSV.

    Runs the ``snscrape`` command-line tool (at most 500 results) and stores
    its JSON-lines output in ``twitter-@{username}-small.json``. A fixed subset
    of columns is kept, a ``translation`` column is added, and the result is
    written to ``{username}_tweets-small.csv``.

    Parameters
    ----------
    username : str
        Account name handed to ``snscrape twitter-user``.

    Raises
    ------
    subprocess.CalledProcessError
        If ``snscrape`` exits with a non-zero status.
    """
    # Local import keeps this cell self-contained (the import cell does not
    # bring in subprocess).
    import subprocess

    json_path = f'twitter-@{username}-small.json'

    # Arguments are passed as a list (no shell), so an unusual username cannot
    # be interpreted as shell syntax.
    with open(json_path, 'wb') as json_file:
        subprocess.run(
            ['snscrape', '--jsonl', '--progress', '--max-results', '500',
             'twitter-user', username],
            stdout=json_file,
            check=True,
        )

    tweets_df = pd.read_json(json_path, lines=True)
    # .copy() gives an independent frame, so adding a column below does not
    # write into a slice of tweets_df.
    df = tweets_df[["id", "url", "date", "content",
                    "hashtags", "cashtags", "media", "lang"]].copy()

    # Translate the tweets.
    # Rows already tagged "en" keep their text; every other row is translated.
    # The column is always created, regardless of how many tweets were found.
    df['translation'] = df['content']
    needs_translation = df['lang'] != "en"
    if needs_translation.any():
        # One translator instance is reused for all rows.
        translator = GoogleTranslator(source='auto', target='english')
        df.loc[needs_translation, 'translation'] = (
            df.loc[needs_translation, 'content']
            .map(lambda text: translator.translate(text=f'{text}'))
        )

    # Save as csv file (index is written too; it shows up as 'Unnamed: 0'
    # when the file is read back with pd.read_csv).
    df.to_csv(f"{username}_tweets-small.csv")

In [6]:
%%time
# Scrape, translate and save the tweets of every account in `usernames`;
# the cell magic above reports how long the whole loop takes.
for username in usernames:
    save_tweets(username)

Scraping, 100 results so far
Scraping, 200 results so far
Scraping, 300 results so far
Scraping, 400 results so far
Scraping, 500 results so far
Stopped scraping after 500 results due to --max-results


KeyboardInterrupt: 

## Processing Data

In [7]:
# Load the saved tweets and discard the index column that to_csv wrote out.
# NOTE(review): save_tweets() writes "<username>_tweets-small.csv", which does
# not match this file name — confirm where this CSV is supposed to come from.
data = pd.read_csv("EmmanuelMacron_tweets.csv").drop(columns=['Unnamed: 0'])
data.head(2)

FileNotFoundError: [Errno 2] No such file or directory: 'EmmanuelMacron_tweets.csv'

In [None]:
def remove_punctuation(s):
    """Return ``s`` with every character found in ``string.punctuation`` removed."""
    punctuation_chars = frozenset(string.punctuation)
    kept = []
    for char in s:
        if char in punctuation_chars:
            continue
        kept.append(char)
    return ''.join(kept)

In [None]:
def lower_case(s):
    """Return a lower-cased copy of the string ``s``."""
    return s.lower()

In [None]:
def remove_numbers(s):
    """Return ``s`` with every digit character (per ``str.isdigit``) removed."""
    non_digits = []
    for char in s:
        if char.isdigit():
            continue
        non_digits.append(char)
    return ''.join(non_digits)

In [None]:
def lemmatize(s):
    """Lemmatize every whitespace-separated word of ``s`` with WordNet.

    The text is split on whitespace, each word is passed through
    ``WordNetLemmatizer.lemmatize`` and the results are joined with single
    spaces (so runs of whitespace are collapsed). An empty string yields an
    empty string.

    Previously the function iterated over the *characters* of ``s``, so the
    lemmatizer only ever saw single letters and the text came back unchanged.

    NOTE(review): words are looked up as given, so this presumably works best
    on lower-cased text without attached punctuation — confirm against the
    order in which the cleaning steps are applied.
    """
    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(word) for word in s.split())

In [None]:
def stop_words(s):
    """Tokenize ``s`` and return the tokens that are not English stop words.

    The surviving tokens are joined with single spaces.
    """
    english_stops = set(stopwords.words('english'))
    kept_tokens = []
    for token in word_tokenize(s):
        if token in english_stops:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [None]:
# Cleaning the strings.
# Order matters: the text is lower-cased and stripped of digits, punctuation
# and stop words first, and lemmatisation runs last so that it is applied to
# normalised words rather than to raw, mixed-case, punctuated text.
# NOTE(review): save_tweets() writes the column as 'translation' (lower-case);
# confirm that the loaded CSV really has a 'Translation' column.
data['clean_translation'] = (
    data['Translation']
    .apply(lower_case)
    .apply(remove_numbers)
    .apply(remove_punctuation)
    .apply(stop_words)
    .apply(lemmatize)
)
data.head(3)

## TextBlob Sentiment Analysis

In [None]:
def polarity(s):
    """Return the first element (polarity) of TextBlob's sentiment for ``s``."""
    sentiment = TextBlob(s).sentiment
    return sentiment[0]

def objectivity(s):
    """Return an objectivity score for ``s`` derived from TextBlob.

    The second element of TextBlob's ``sentiment`` is *subjectivity*
    (per the TextBlob documentation: 0.0 is very objective, 1.0 is very
    subjective). The previous implementation returned that value unchanged,
    i.e. the opposite of what the function name promises. Objectivity is
    reported here as its complement, ``1 - subjectivity``.
    """
    subjectivity = TextBlob(s).sentiment[1]
    return 1.0 - subjectivity

In [None]:
# Score every cleaned tweet and store each score in its own column.
for column_name, scorer in (('polarity', polarity), ('objectivity', objectivity)):
    data[column_name] = data['clean_translation'].apply(scorer)
data.head(1)

In [None]:
# Average of the 'polarity' column over all rows.
data['polarity'].mean()

In [None]:
# Average of the 'objectivity' column over all rows.
data['objectivity'].mean()

## LDA

In [None]:
# Build TF-IDF features from the cleaned text and fit a 4-topic LDA model.
vectorizer = TfidfVectorizer()

# fit_transform learns the vocabulary and vectorises the text in one pass.
data_vectorized = vectorizer.fit_transform(data['clean_translation'])

# LDA is randomly initialised; a fixed random_state makes the topics
# reproducible across "Restart & Run All".
lda_model = LatentDirichletAllocation(n_components=4, random_state=42).fit(data_vectorized)

In [None]:
def print_topics(model, vectorizer, n_top_words=10):
    """Print the highest-weighted words of every topic in a fitted model.

    Parameters
    ----------
    model
        Fitted topic model exposing ``components_`` (one weight row per topic).
    vectorizer
        Fitted vectorizer whose feature names label the columns of
        ``components_``.
    n_top_words : int, default 10
        Number of words shown per topic, highest weight first.

    For each topic a ``Topic <idx>:`` line is printed, followed by a list of
    ``(word, weight)`` tuples.
    """
    # get_feature_names() was replaced by get_feature_names_out() in newer
    # scikit-learn releases; fall back to the old name on older versions.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    # Fetch the vocabulary once instead of once per printed word.
    feature_names = get_names()

    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort is ascending; walk it backwards to get the largest weights.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        print([(feature_names[i], topic[i]) for i in top_indices])
        

# Show the top words of each topic found by the fitted LDA model.
print_topics(lda_model, vectorizer)