<div style="display: flex; height:400px; text-align:center">
     <img src="https://imgur.com/WmCHcjw.jpg" style="display:block; margin:auto" width=300>
</div>

# 1. Imports

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import re
import unicodedata
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from wordcloud import WordCloud

# 2. Loading data

In [None]:
# Read the two CSV exports of the tweet archive into df_1 and df_2.
df_1, df_2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/trump-tweets/trumptweets.csv",
        "/kaggle/input/trump-tweets/realdonaldtrump.csv",
    )
)

In [None]:
# Print the (rows, columns) shape of each raw frame, one per line.
for raw_frame in (df_1, df_2):
    print(raw_frame.shape)

In [None]:
# Preview the first three rows of df_1.
df_1.head(3)

In [None]:
# Preview the first three rows of df_2.
df_2.head(3)

# 3. Merge dataframes

In [None]:
# Restrict both frames to the same columns, then stack them row-wise.
shared_columns = ["content", "date", "retweets", "favorites", "mentions", "hashtags"]
df_1, df_2 = df_1[shared_columns], df_2[shared_columns]
df_dt = pd.concat([df_1, df_2], axis=0)
df_dt.shape

# 4. Duplicates

In [None]:
# Keep one row per distinct tweet text and report how many rows were dropped.
size_before = len(df_dt)
df_dt = df_dt.drop_duplicates(subset=["content"])
size_after = len(df_dt)
print(f"{size_before - size_after} duplicates were removed.")

I hadn't realized that the two dataframes were almost identical.

# 5. Overview

## 5.1. Tweets length

In [None]:
# Character length of every tweet, and the largest of them (0 if there are no tweets).
tweet_length = [len(tweet_text) for tweet_text in df_dt["content"]]
max_tweet_length = max(tweet_length, default=0)
print("Longest tweet: " + str(max_tweet_length) + " characters")

## 5.2. Missing values

In [None]:
# Percentage of missing values per column, largest first.
100 * df_dt.isnull().sum().sort_values(ascending=False)/len(df_dt)

About 86% of the values are missing in the `hashtags` column and 46% in the `mentions` column, so we can drop both columns.

In [None]:
# Drop the two sparse columns. Reassigning instead of using inplace=True, together with
# errors="ignore", keeps this cell idempotent: running it a second time no longer
# raises KeyError because the columns are already gone.
df_dt = df_dt.drop(columns=["hashtags", "mentions"], errors="ignore")

In [None]:
# Same percentage-of-missing-values summary, computed on the current df_dt.
100 * df_dt.isnull().sum().sort_values(ascending=False)/len(df_dt)

In [None]:
parameters = {'axes.labelsize': 20,
              'axes.titlesize': 30}
# Applied once, before the figure is created (the second identical update was redundant).
plt.rcParams.update(parameters)
fig, (ax1, ax2) = plt.subplots(1, 2)
fig.set_size_inches(18.5, 6)
# One entry per panel: (axes, column to plot, x-axis label, upper x-limit).
panels = (
    (ax1, "retweets", 'Retweets count', 50000),
    (ax2, "favorites", 'Favorites count', 200000),
)
for panel_ax, column, x_label, x_upper in panels:
    # No `palette` here: seaborn only uses it together with `hue`, so it had no effect.
    sns.histplot(df_dt[column], stat='count', bins=50, ax=panel_ax)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel("")
    panel_ax.tick_params(axis='x', labelsize=16)
    panel_ax.tick_params(axis='y', labelsize=16)
    panel_ax.set_xlim(-10, x_upper)
fig.tight_layout(pad=2.0)

# 6. Cleaning

In [None]:
# Work on a copy of df_dt; `label` names the column that the cleaning cells rewrite,
# initialised with the raw tweet text.
label = 'cleanTweet'
df = df_dt.copy()
df[label] = df["content"]

In [None]:
# Pattern used to blank out e-mail addresses.
EMAIL_PATTERN = r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"""

# Patterns replaced by a single space, in this order, once e-mails are gone.
TOKEN_PATTERNS = (
    r"@([a-zA-Z0-9_.-]{1,100})",  # mentions
    r"http\S+",                   # hyperlinks
    r"#\w+",                      # hashtags
    r"<.*?>",                     # html tags
    r"\d+",                       # numbers
)


def _normalize_tweet(text):
    """ Apply every cleaning step, in order, to one already-lowercased tweet """
    # line breaks and tabs become spaces
    for control_char in ("\n", "\r", "\t"):
        text = text.replace(control_char, " ")

    # e-mails go first, then "@ name" is glued back into "@name" so the
    # mention pattern can remove it
    text = re.sub(EMAIL_PATTERN, " ", text)
    text = text.replace("@ ", "@")
    for pattern in TOKEN_PATTERNS:
        text = re.sub(pattern, " ", text)

    # decompose accented characters and discard everything that is not ASCII
    text = unicodedata.normalize("NFD", text).encode('ascii', 'ignore').decode("utf-8")

    # whatever is neither a word character nor whitespace becomes a space
    return re.sub(r'[^\w\s]', ' ', text)


df[label] = df[label].str.lower().apply(_normalize_tweet)

In [None]:
def remove_stop_words(text, stopwords=set(stopwords.words('english'))):
    """ Drop stop words from a space-separated text
        inputs:
         - text: string, split on single spaces
         - stopwords: collection of tokens to discard (defaults to the NLTK English
           list, built once when the function is defined)
        returns the remaining tokens joined by single spaces """
    kept_tokens = [token for token in text.split(" ") if token not in stopwords]
    return " ".join(kept_tokens)

def clean_stopwords(df, label):
    """ Strip stop words from every text in column `label` of `df`;
        the column is overwritten and the same dataframe is returned """
    cleaned_column = df[label].apply(remove_stop_words)
    df[label] = cleaned_column
    return df
#
# Run the stop-word removal on the working column.
df = df.pipe(clean_stopwords, label)

In [None]:
# 1. blank out remaining one- and two-letter words
# 2. collapse runs of spaces/tabs into a single space
# 3. strip leading/trailing whitespace, so that stray edge spaces do not survive
#    (they produced empty tokens when the text is later split on " ")
df[label] = (
    df[label]
    .str.replace(r'\b\w{1,2}\b', " ", regex=True)
    .str.replace(r"[ \t]{2,}", " ", regex=True)
    .str.strip()
)

# drop rows whose cleaned text is empty; after stripping, every whitespace-only
# text is "" (the previous len(x) == 1 test only caught one-character strings)
df[label] = df[label].replace("", np.nan)
df = df.dropna(subset=[label], axis=0).reset_index(drop=True).copy()

In [None]:
def lemmatize_one_text(text, lemmatizer=None):
    """ Lemmatize every space-separated word of a text

        Each word is tried with the tags 'a', 'r', 'n', 'v' in that order; the lemma
        given by the last tag that changes the word is used, and a word that no tag
        changes is kept as is.

        inputs:
         - text: string, split on single spaces
         - lemmatizer: optional object exposing lemmatize(word, tag); a new
           WordNetLemmatizer is created when omitted
        returns the lemmatized words joined by single spaces """
    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()

    # tags, tried in this order for every word
    lem_tags = ('a', 'r', 'n', 'v')

    text_new = []
    for word in text.split(" "):
        lemma = word
        for tag in lem_tags:
            # keep the candidate itself so the winning tag is not lemmatized twice
            candidate = lemmatizer.lemmatize(word, tag)
            if candidate != word:
                lemma = candidate
        text_new.append(lemma)

    return " ".join(text_new)

def lemmatize(df, label):
    """ Lemmatize every text in column `label` of `df`;
        the column is overwritten and the same dataframe is returned """
    lemmatized_column = df[label].apply(lemmatize_one_text)
    df[label] = lemmatized_column
    return df
#
# Run the lemmatization on the working column.
df = df.pipe(lemmatize, label)

In [None]:
# Preview the first five rows, including the cleaned text column.
df.head(5)

In [None]:
# Separate copy for the length statistics, so the columns added to dtf do not end up in df.
dtf = df.copy()

In [None]:
# split() with no argument ignores leading, trailing and repeated whitespace, so empty
# tokens are no longer counted as words (split(" ") returned them and inflated word_count).
dtf['word_count'] = dtf["cleanTweet"].apply(lambda x: len(str(x).split()))
# characters are counted over the same tokens, i.e. whitespace excluded
dtf['char_count'] = dtf["cleanTweet"].apply(lambda x: sum(len(word) for word in str(x).split()))
dtf.head()

In [None]:
parameters = {'axes.labelsize': 20,
              'axes.titlesize': 30}
# Applied once, before the figure is created (the second identical update was redundant).
plt.rcParams.update(parameters)
fig, (ax1, ax2) = plt.subplots(1, 2)
fig.set_size_inches(18.5, 6)
# One entry per panel: (axes, column to plot, x-axis label).
panels = (
    (ax1, "word_count", 'Word count'),
    (ax2, "char_count", 'Character count'),
)
for panel_ax, column, x_label in panels:
    # No `palette` here: seaborn only uses it together with `hue`, so it had no effect.
    sns.histplot(dtf[column], stat='count', bins=30, ax=panel_ax)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel("")
    panel_ax.tick_params(axis='x', labelsize=16)
    panel_ax.tick_params(axis='y', labelsize=16)
fig.tight_layout(pad=2.0)

# 7. VADER sentiment analysis

In [None]:
def compute_vader_scores(df, label, analyzer=None):
    """ Add polarity scores for the texts in column `label`

        Each text is scored once, and the "neg", "neu", "pos" and "compound" entries
        of the result are written to the columns vader_neg, vader_neu, vader_pos and
        vader_comp of `df` (the dataframe is modified in place).

        inputs:
         - df: dataframe holding the texts
         - label: name of the text column
         - analyzer: optional object exposing polarity_scores(text); a new
           SentimentIntensityAnalyzer is created when omitted
        returns the same dataframe """
    sid = SentimentIntensityAnalyzer() if analyzer is None else analyzer

    # one analyzer call per text instead of one per text and per output column
    scores = [sid.polarity_scores(text) for text in df[label]]

    score_columns = (
        ("vader_neg", "neg"),
        ("vader_neu", "neu"),
        ("vader_pos", "pos"),
        ("vader_comp", "compound"),
    )
    for column, key in score_columns:
        df[column] = [score[key] for score in scores]
    return df

In [None]:
%%time
# Add the four vader_* score columns to df, computed from the cleaned tweets.
df = compute_vader_scores(df, "cleanTweet")

In [None]:
# Binary label from the compound score: 'pos' when it is >= 0, otherwise 'neg'.
df['comp_score'] = (df['vader_comp'] >= 0).map({True: 'pos', False: 'neg'})

In [None]:
# Share of tweets labelled 'pos', in percent; the negative share is the complement.
n_tweets = df['comp_score'].shape[0]
n_positive = df[df['comp_score']=="pos"].shape[0]
percent_pos = round(100*n_positive/n_tweets, 2)
percent_neg = round(100 - percent_pos, 2)
print(f"{percent_pos}% of Donald Trump tweets have positive sentiment according to VADER")
print(f"{percent_neg}% of Donald Trump tweets have negative sentiment according to VADER")

In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(15, 6)
fig.suptitle("Count", fontsize=12)
# Number of tweets per sentiment label, in the order the bars are drawn.
sentiment_counts = df.groupby("comp_score").count()[["content"]].rename(columns={"content": "count"})
sentiment_counts.plot(kind="barh", legend=False, ax=ax).grid(axis='x')
# Annotate each bar through its own label. The previous value_counts()[0] / [1]
# lookups were positional on a string-labelled Series and assumed that "pos" is
# always the most frequent class.
# label -> (percentage to print, fraction of the bar length to step back from its end)
annotations = {"pos": (percent_pos, 0.1), "neg": (percent_neg, 0.3)}
for bar_position, (sentiment, count) in enumerate(sentiment_counts["count"].items()):
    percent, inset = annotations[sentiment]
    ax.text(count*(1-inset), bar_position - 0.04, str(percent)+"%", fontdict={"fontsize": 20, "color": "white"})
plt.show()

# 8. WordCloud

In [None]:
def _tweets_as_corpus(sentiment):
    """ Join the cleaned tweets carrying the given comp_score label into one string """
    return " ".join(df.loc[df["comp_score"] == sentiment, "cleanTweet"])


def _render_cloud(corpus, colormap):
    """ Build an 800x600 word cloud on a white background from `corpus` """
    cloud = WordCloud(width=800, height=600, max_font_size=120, background_color="white", colormap=colormap)
    return cloud.generate(corpus)


# negative tweets first, then positive ones
cloud_negative_tweets = _tweets_as_corpus("neg")
wordcloud_negative = _render_cloud(cloud_negative_tweets, "Reds")
cloud_positive_tweets = _tweets_as_corpus("pos")
wordcloud_positive = _render_cloud(cloud_positive_tweets, "Greens")

In [None]:
parameters = {'axes.labelsize': 12,
              'axes.titlesize': 10}
fig, (ax1, ax2) = plt.subplots(1, 2)
fig.set_size_inches(18.5, 7)
# One entry per panel: (axes, word cloud image, panel title).
cloud_panels = (
    (ax1, wordcloud_positive, "WordCloud of positive tweets"),
    (ax2, wordcloud_negative, "WordCloud of negative tweets"),
)
for panel_ax, cloud_image, panel_title in cloud_panels:
    panel_ax.imshow(cloud_image, interpolation='bilinear')
    panel_ax.axis("off")
    panel_ax.set_title(panel_title, fontsize=12)
# The rc update happens after the axes are built; the titles above set their font size explicitly.
plt.rcParams.update(parameters)
plt.show()