In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = [os.path.join(root, name) for name in names]
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install vaderSentiment

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import nltk
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 

# Make sure the stopword corpus is available so this cell also runs on a fresh
# kernel; the download is skipped when the corpus is already installed.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords', quiet=True)

# English stopwords plus a few extra tokens to drop.
# NOTE(review): 'co', 'wa' and 'ha' are presumably residue of t.co links and of
# lemmatized "was"/"has" -- confirm against the cleaned token counts.
stop_words = nltk.corpus.stopwords.words('english')
stop_words.extend(['co', 'wa', 'ha'])

In [None]:
# Load the insult-tweet dataset from the Kaggle input directory.
data = pd.read_csv('../input/all-trumps-twitter-insults-20152021/trump_insult_tweets_2014_to_2021.csv')

# errors='coerce' turns unparseable dates into NaT so the column is always
# datetime64. errors='ignore' (deprecated in pandas 2.2) would silently leave
# the column as strings on any parse failure and break the later `.dt` calls.
data['date'] = pd.to_datetime(data['date'], errors='coerce')
data.head()

In [None]:
# Print a concise summary of the loaded DataFrame.
data.info()

In [None]:
# Descriptive statistics of the loaded DataFrame.
data.describe()

In [None]:
# Column dtypes -- useful to check that `date` was parsed as a datetime.
data.dtypes

# Pre-processing tweets

NLP is the art of extracting information from text. To do so, it is very important to clean the data first: lowercase ALL the text (although commonly overlooked, this is one of the simplest and most effective forms of text preprocessing), lemmatize the tweets, and remove stopwords and useless punctuation. A bit of cleaning gives us more understandable data.

In [None]:
# Shared NLTK helpers: a whitespace tokenizer and a WordNet lemmatizer.
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()


def lemmatize_text(text):
    """Lowercase ``text``, split it on whitespace and return the lemmatized tokens as a list."""
    lemmas = []
    for token in w_tokenizer.tokenize(text.lower()):
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

def cleantext(txt):
    """Normalize a raw tweet (or insult) and return its meaningful tokens.

    The text is lowercased, stripped of URLs, Twitter handles, punctuation,
    digits and single letters, then lemmatized and filtered against
    ``stop_words``.

    URLs and handles are removed *before* punctuation: once ``\\W`` has been
    replaced by spaces the ``@``, ``:``, ``/`` and ``.`` characters those
    patterns rely on are gone, so handles would never match and URL fragments
    would survive as tokens.

    Parameters
    ----------
    txt : str
        Raw text. Any non-string value (e.g. NaN for a missing CSV field)
        yields an empty list instead of raising.

    Returns
    -------
    list of str
        Lemmatized tokens with stopwords removed.
    """
    if not isinstance(txt, str):
        return []

    txt = txt.lower()
    # Remove URLs while their punctuation is still intact
    txt = re.sub(r'"?http\S+', ' ', txt)
    txt = re.sub(r'"?www\S+', ' ', txt)
    # Remove Twitter handles while the '@' is still present
    txt = re.sub(r'@\S+', ' ', txt)
    # Replace remaining special characters with spaces
    txt = re.sub(r'\W', ' ', txt)
    # Remove digits
    txt = re.sub(r'\d+', ' ', txt)
    # Remove every single-letter word, including adjacent ones and those at
    # the start or end of the text
    txt = re.sub(r'\b[a-zA-Z]\b', ' ', txt)
    # Collapse runs of whitespace into a single space
    txt = re.sub(r'\s+', ' ', txt)

    # Lemmatize, then drop stopwords
    return [w for w in lemmatize_text(txt) if w not in stop_words]

# General EDA

To better understand our tweet data we must visualize what is happening. First, I am going to check the length of the tweets written by Trump. Then I am going to perform a deeper analysis by checking his 10 favourite (most used) words, his 10 most used insults and his main targets. This will give us a better idea of our data.

In [None]:
# Character count of every tweet, drawn as a 25-bin histogram.
seq_length = list(map(len, data['tweet']))

tweet_lengths = pd.Series(seq_length)
tweet_lengths.hist(bins=25, color='red')

In [None]:
# Clean every tweet; each row of `cleaned_tweets` holds a list of tokens.
data['cleaned_tweets'] = data['tweet'].apply(cleantext)

# Explode the token lists into one row per token so words can be counted
# across all tweets.
dtump_tweets = data['cleaned_tweets'].explode().to_frame().reset_index(drop=True)

# Plot the 10 most frequent tokens.
top_words = dtump_tweets['cleaned_tweets'].value_counts().iloc[:10].index
sns.countplot(x='cleaned_tweets', data=dtump_tweets, order=top_words)
plt.xlabel('10 Most common used words in Trump\'s tweets: ')
# countplot draws raw occurrence counts, not percentages.
plt.ylabel('Count')
plt.xticks(rotation=70);

It is interesting to see that 3 (bad, never, fake) out of his 10 most used words are considered "negative" and 2 (great and like) have a positive polarity. Let's keep digging into the insult part of the data.

In [None]:
# Clean every insult; each row of `cleaned_insults` holds a list of tokens.
data['cleaned_insults'] = data['insult'].apply(cleantext)

# Explode the token lists into one row per token so words can be counted
# across all insults.
dtump_insults = data['cleaned_insults'].explode().to_frame().reset_index(drop=True)

# Plot the 10 most frequent insult tokens.
top_insults = dtump_insults['cleaned_insults'].value_counts().iloc[:10].index
sns.countplot(x='cleaned_insults', data=dtump_insults, order=top_insults)
plt.xlabel('10 Most common used insults in Trump\'s tweets: ')
# countplot draws raw occurrence counts, not percentages.
plt.ylabel('Count')
plt.xticks(rotation=70);

In [None]:
# Plot the 10 most frequently targeted entities.
top_targets = data['target'].value_counts().iloc[:10].index
sns.countplot(x='target', data=data, order=top_targets)
plt.xlabel('Trump\'s targets: ')
# countplot draws raw occurrence counts, not percentages.
plt.ylabel('Count')
plt.xticks(rotation=70);

As we can see, Trump has mostly targeted his tweets at the media.

# Wordcloud for Trump tweets

WordClouds are a popular way of displaying how important words are in a collection of texts. Basically, the more frequent a word is, the more space it occupies in the image. One of the uses of WordClouds is to help us get an intuition of what the collection of texts is about.

In [None]:
# Lowercase every tweet and glue them into one comma-separated string.
tweet_text = ",".join([tweet.lower() for tweet in data.tweet])

# Build the word cloud from the combined text.
cloud_options = dict(
    max_font_size=50,
    max_words=100,
    stopwords=stop_words,
    scale=5,
    background_color="white",
)
wordcloud = WordCloud(**cloud_options).generate(tweet_text)

# Render the cloud without axes.
plt.figure(figsize=(12,8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.title('Most repeated words in tweets',fontsize=15)
plt.axis("off")
plt.show()

In [None]:
# Lowercase every insult and join them into one comma-separated string.
insult_text = ",".join(tw.lower() for tw in data.insult)

# Create and generate a word cloud image:
wordcloud = WordCloud(max_font_size=50, 
                      max_words=100, 
                      stopwords=stop_words,
                      scale=5,
                      background_color="white").generate(insult_text)

plt.figure(figsize=(12,8))
# This cloud is built from the insults, so the title must say so
# (it previously repeated the tweet cloud's title).
plt.title('Most repeated words in insults',fontsize=15)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

# Sentiment analysis

[VADER](https://pypi.org/project/vaderSentiment/) (Valence Aware Dictionary and Sentiment Reasoner) is a lexicon and rule-based sentiment analysis tool that is specifically attuned to sentiments expressed in social media. 

It is used for sentiment analysis of text which has both the polarities i.e. positive/negative. VADER is used to quantify how much of positive or negative emotion the text has and also the intensity of emotion.

## Advantages

Here are the advantages of using VADER which makes a lot of things easier:

<ul>
    <li>It does not require any training data. </li>
    <li>It handles text containing emoticons, slang, conjunctions, capitalized words, punctuation and much more very well. </li>
    <li>It works very well on social media text.</li>
    <li> VADER can work with multiple domains. </li>
</ul>

In [None]:
# Single VADER analyzer instance, reused for every tweet below.
analyzer = SentimentIntensityAnalyzer() 

In [None]:
# Score each tweet exactly once: polarity_scores returns a dict holding all
# four values, so calling it separately per column would quadruple the work.
score_names = ['neg', 'neu', 'pos', 'compound']
vader_scores = pd.DataFrame(
    data['tweet'].apply(analyzer.polarity_scores).tolist(),
    index=data.index,
    columns=score_names,
)
for score_name in score_names:
    data[score_name] = vader_scores[score_name]

# Preview only the first rows instead of dumping the whole frame.
data.head()

So what do those results mean?
<ul>
    <li><b>pos</b>: The proportion of the text that falls in the positive category. </li> 
    <li><b>neu</b>: The proportion of the text that falls in the neutral category. </li>
    <li><b>neg</b>: The proportion of the text that falls in the negative category. </li>
    <li><b>compound</b>: The compound score is computed by summing the valence scores of each word in the lexicon, adjusted according to the rules, and then normalized to be between -1 (most extreme negative) and +1 (most extreme positive). </li>
</ul>


Notice that the pos, neu and neg scores are proportions, so they add up to 1. Also, the compound score is a very useful metric when we want a single measure of sentiment. Typical threshold values are the following:
<ul>
    <li><b>positive</b>: compound score>=0.05</li>
    <li><b>neutral</b>: compound score between -0.05 and 0.05</li>
    <li><b>negative</b>: compound score<=-0.05 </li>
</ul>

Now, with these results, we'll look at Trump's evolution by plotting the compound sentiment score by year.

In [None]:
# Group the tweets by the calendar year of their timestamp.
yearperiod = data.date.dt.to_period("Y")
ygroup = data.groupby(yearperiod)

# numeric_only=True restricts the average to numeric columns; the frame also
# holds text/list columns, for which pandas >= 2.0 raises a TypeError.
ygroup.mean(numeric_only=True)

In [None]:
# Mean compound score per year. Averaging only the needed column avoids
# aggregating the non-numeric columns, and taking the x values from the
# result's own index keeps years and scores aligned.
yearly_compound = ygroup['compound'].mean()
plt.plot(yearly_compound.index.astype(str), yearly_compound.values)
plt.title('Vader\'s compound score per year',fontsize=15)
# Years are on the x axis and scores on the y axis.
plt.xlabel('Year')
plt.ylabel('Vader\'s compound score')
plt.xticks(rotation='vertical');

In [None]:
# Mean positive score per year. Averaging only the needed column avoids
# aggregating the non-numeric columns, and taking the x values from the
# result's own index keeps years and scores aligned.
yearly_pos = ygroup['pos'].mean()
plt.plot(yearly_pos.index.astype(str), yearly_pos.values)
plt.title('Vader\'s positive score per year',fontsize=15)
# Years are on the x axis and scores on the y axis.
plt.xlabel('Year')
plt.ylabel('Vader\'s positive score')
plt.xticks(rotation='vertical');

In [None]:
# Mean negative score per year. Averaging only the needed column avoids
# aggregating the non-numeric columns, and taking the x values from the
# result's own index keeps years and scores aligned.
yearly_neg = ygroup['neg'].mean()
plt.plot(yearly_neg.index.astype(str), yearly_neg.values)
plt.title('Vader\'s negative score per year',fontsize=15)
# Years are on the x axis and scores on the y axis.
plt.xlabel('Year')
plt.ylabel('Vader\'s negative score')
plt.xticks(rotation='vertical');

As we can see, more positive tweets have been posted since 2017. There is certainly a peak of negativity from 2014 to 2015, which was maintained until 2017, but then the negativity decreases sharply and abruptly. But as we can see in the next cell, the number of tweets per year has been increasing since 2014. Of course, there are not many 2021 tweets, as the year has just started, but in any case there is a tendency towards lower negativity.

In [None]:
# Number of tweets per calendar year (value_counts orders by count, not by year).
data.date.dt.year.value_counts()

Citation for VADER:

Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.