In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
from wordcloud import WordCloud, STOPWORDS
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the insult-tweet dataset and preview its first rows.
csv_path = "../input/all-trumps-twitter-insults-20152021/trump_insult_tweets_2014_to_2021.csv"
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Preview the last rows of the frame.
df.tail()

In [None]:
# Drop the "Unnamed: 0" column.
# errors="ignore" keeps this cell re-runnable: once the column is gone a second
# execution is a no-op instead of raising KeyError.
df.drop(columns="Unnamed: 0", errors="ignore", inplace=True)

In [None]:
# Print the frame summary (columns, dtypes, non-null counts).
df.info()

In [None]:
# Report the dimensions of the dataframe
n_rows, n_cols = df.shape
print(f"There are {n_rows} rows and {n_cols} columns")

In [None]:
# Count the missing values in each column
df.isna().sum()

In [None]:
# The null counts above show a couple of missing values (in the target column);
# remove every row that contains any missing value.
df.dropna(axis=0, how="any", inplace=True)

In [None]:
# Check the data type of every column
df.dtypes

In [None]:
# New column: number of '#' characters appearing in each tweet
df['hashtag_counts'] = df['tweet'].str.count('#')

In [None]:
# Collect every whitespace-separated token that starts with '#', without the '#'.
hashtags_list = [
    token[1:]
    for tweet_text in df["tweet"]
    for token in tweet_text.split()
    if token.startswith('#')
]

In [None]:
# Tally how many times each hashtag occurs.
hashtag_dict = {}
for tag in hashtags_list:
    if tag in hashtag_dict:
        hashtag_dict[tag] += 1
    else:
        hashtag_dict[tag] = 1

In [None]:
# One row per hashtag (as the index) with its occurrence count in a 'count' column.
df_hashtag = pd.DataFrame(
    list(hashtag_dict.values()),
    index=list(hashtag_dict.keys()),
    columns=['count'],
)

In [None]:
# Order the hashtags from most to least frequent.
df_hashtag = df_hashtag.sort_values(by='count', ascending=False)

In [None]:
# The ten hashtags Trump used most often
df_hashtag.head(10)

In [None]:
# Parse the date column into datetime values (year-first format)
df['date'] = pd.to_datetime(df['date'], yearfirst=True)

In [None]:
# Create separate columns for year, month and dayofyear
for date_part in ('year', 'month', 'dayofyear'):
    df[date_part] = getattr(df['date'].dt, date_part)

In [None]:
# Preview the frame with the newly added date-part columns.
df.head()

In [None]:
# New column: number of characters in each tweet
df['tweet_length'] = df['tweet'].str.len()

In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
# Drop all web links from the tweet column.
# Only the URL token itself (scheme up to the next whitespace) is removed; the
# previous pattern ended in `.*` and therefore also deleted any text that
# followed a link on the same line.
df['tweet'] = df['tweet'].str.replace(r'https?://\S+', '', regex=True).str.strip()

In [None]:
# Lower-case the tweets and remove dashes ('-' and therefore also '--').
# Series.replace only matches whole cell values, so it never touched dashes
# inside a tweet; the .str accessor performs the intended substring removal.
df['tweet'] = df['tweet'].str.lower().str.replace('-', '', regex=False)

In [None]:
# Display the frame after the tweet text has been cleaned.
df

### Data Visualization

#### Wordcloud

In [None]:
# Word cloud of the targets Trump goes after most often
target_words = " ".join(df["target"])

wc = WordCloud(height=600, width=900)
wc.generate(target_words)

plt.figure(figsize=(18, 12))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()

We see that Trump targets the media and Hillary Clinton the most, and he also likes to brag about himself.

In [None]:
# Word cloud of the insults Trump uses most often
insult_words = " ".join(df["insult"])

wc = WordCloud(height=600, width=900, colormap="inferno")
wc.generate(insult_words)

plt.figure(figsize=(18, 12))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()

Top 10 targets

In [None]:
# Ten most frequently insulted targets together with their tweet counts.
top10_target = df["target"].value_counts().head(n=10)
top10_target

In [None]:
# Figure set-up for the top-10 pie chart.
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(15, 9), subplot_kw={"aspect": "equal"})
# Radial offset applied to each of the ten wedges.
explode = (0.05, 0.04, 0.03, 0.01, 0.02, 0.01, 0.02, 0.01, 0.02, 0.01)

def func(pct, allvals):
    """Build the label shown on a pie wedge.

    Parameters
    ----------
    pct : float
        Share of the wedge in percent (0-100).
    allvals : array-like of numbers
        All values plotted in the pie; their sum is the 100% reference.

    Returns
    -------
    str
        Two lines: the percentage with one decimal, then the absolute count
        in the form "(N tweets)".
    """
    # Round before converting to int: plain truncation turns values such as
    # 28.999999999999996 (floating-point error) into 28 instead of 29.
    absolute = int(round(pct / 100. * np.sum(allvals)))
    return "{:.1f}%\n({:d} tweets)".format(pct, absolute)

# Draw the pie; every wedge is annotated through func() with its percentage
# and tweet count.
wedges, texts, autotexts = ax.pie(
    top10_target,
    explode=explode,
    autopct=lambda pct: func(pct, top10_target),
    textprops={"color": "black"},
)

# Legend mapping wedges to target names, anchored outside the axes.
ax.legend(
    wedges,
    top10_target.index,
    title="Target Name",
    loc="center left",
    prop={'size': 14},
    bbox_to_anchor=(1, 0, 0.5, 1),
)

plt.setp(autotexts, size=12, weight="bold")
ax.set_title("Top 10 Targets")
plt.show()

In [None]:
# Bar chart of the ten most insulted targets.
plt.figure(figsize=(18,6))
# x/y are passed by keyword: positional data arguments were deprecated in
# seaborn 0.11 and are no longer accepted from 0.12 on.
sns.barplot(x=top10_target.index, y=top10_target.values, alpha=0.8, palette="magma")
plt.title('Trump Targets',fontsize=20)
plt.ylabel('Insults', fontsize=16)
plt.xlabel('Target Name', fontsize=16)
plt.xticks(fontsize=12, rotation=45)
plt.show()

Let's analyze the top 5 targets separately.

#### The-media

In [None]:
# Rows whose target is 'the-media'.
is_media = df['target'].eq('the-media')
df_media = df[is_media]

In [None]:
# value_counts() sorts by frequency, so the first index entry is the most used insult.
top_media_insult = df_media['insult'].value_counts().index[0]
print(f"Most target word said about the-media is '{top_media_insult}'")

In [None]:
# All insults aimed at the media joined into one space-separated text.
media_insults = " ".join(df_media['insult'])

In [None]:
# Word cloud of the insults aimed at the media.
wc = WordCloud(height=500, width=800, colormap="afmhot")
wc.generate(media_insults)

plt.figure(figsize=(13, 8))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()

#### Democrats

In [None]:
# Rows whose target is 'democrats'.
is_democrats = df['target'].eq('democrats')
df_democrats = df[is_democrats]

In [None]:
# value_counts() sorts by frequency, so the first index entry is the most used insult.
top_democrats_insult = df_democrats['insult'].value_counts().index[0]
print(f"Most target word said about Democrats is '{top_democrats_insult}'")

In [None]:
# All insults aimed at Democrats joined into one space-separated text.
democrats_insults = " ".join(df_democrats['insult'])

In [None]:
# Word cloud of the insults aimed at Democrats.
wc = WordCloud(height=500, width=800, colormap="gist_yarg")
wc.generate(democrats_insults)

plt.figure(figsize=(13, 8))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()

#### Hillary Clinton

In [None]:
# Rows whose target is 'hillary-clinton'.
is_hillary = df['target'].eq('hillary-clinton')
df_hillary = df[is_hillary]

In [None]:
# value_counts() sorts by frequency, so the first index entry is the most used insult.
top_hillary_insult = df_hillary['insult'].value_counts().index[0]
print(f"Most insulted word said for Hillary-Clinton is '{top_hillary_insult}'")

In [None]:
# All insults aimed at Hillary Clinton joined into one space-separated text.
hc_insult = ' '.join(df_hillary['insult'])

In [None]:
# Word cloud of the insults aimed at Hillary Clinton.
wc = WordCloud(height=500, width=800, colormap="inferno")
wc.generate(hc_insult)

plt.figure(figsize=(13, 8))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()

#### Trump-Russia

In [None]:
# Rows whose target is 'trump-russia'.
is_trump_russia = df['target'].eq('trump-russia')
df_tr = df[is_trump_russia]

In [None]:
# value_counts() sorts by frequency, so the first index entry is the most used insult.
top_tr_insult = df_tr['insult'].value_counts().index[0]
print(f"Most target word used for Trump-Russia is '{top_tr_insult}'")

In [None]:
# All insults about the Trump-Russia topic joined into one space-separated text.
tr_insults = ' '.join(df_tr['insult'])

In [None]:
# Word cloud of the insults about the Trump-Russia topic.
wc = WordCloud(height=500, width=800, colormap="afmhot")
wc.generate(tr_insults)

plt.figure(figsize=(13, 8))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()

#### Joe Biden

In [None]:
# Rows whose target is 'joe-biden'.
is_joe = df['target'].eq('joe-biden')
df_joe = df[is_joe]

In [None]:
# value_counts() sorts by frequency, so the first index entry is the most used insult.
top_joe_insult = df_joe['insult'].value_counts().index[0]
print(f"Most insulted word said for Joe Biden is '{top_joe_insult}'")

In [None]:
# All insults aimed at Joe Biden joined into one space-separated text.
joe_insults = ' '.join(df_joe['insult'])

In [None]:
# Word cloud of the insults aimed at Joe Biden.
wc = WordCloud(height=500, width=800, colormap="twilight")
wc.generate(joe_insults)

plt.figure(figsize=(13, 8))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Word cloud of the words Trump uses most often across all tweets
tweet_words = " ".join(df["tweet"])

wc = WordCloud(height=600, width=900, colormap="inferno")
wc.generate(tweet_words)

plt.figure(figsize=(20, 15))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Load the mask image as a NumPy array. The context manager closes the
# underlying file handle as soon as the pixel data has been copied.
with Image.open('trump.png') as mask_image:
    trump_mask = np.array(mask_image)

In [None]:
# Extra words to treat as stop words in the masked word cloud below.
# Duplicates are harmless: the list is later folded into a set.
badwords = [
    "say", "said", "way", "want", "make", "yet", "give", "don't", "actually", "big", "called", "used",
    "good", "look", "will", "new", "many", "show", "hard", "U", "one", "see", "thing", "problem", "case",
    "take", "doesn't", "u s", "paid", "total", "now", "far", "true", "will", "one", "New", "want", "now", "know",
    "good", "hillary clinton", "thank", "much", "run", "word", "nothing", "think", "even", "make",
    "read", "anything", "always", "good", "thing", "really", "job", "lost", "show", "group",
    "nothing", "story", "television", "political", "time", "cruz", "talk", "zero", "organization",
    "guy", "even", "deal", "false", "history", "looking",
    "reporting", "look", "country", "poll", "say", "ratings", "vote", "money", "former", "president",
    "press", "republican", "reporter", "politician", "magazine",
    "much", "debate", "debates", "times", "campaign", "presidential", "fox", "clinton", "hillary", "bush",
    "credibility", "candidate", "know", "columnist", "immigration",
    "another", "ad", "lied", "chief", "ted", "record", "newspaper", "another", "paid", "journal",
    "way", "trump", "got", "life",
]

In [None]:
# Combine wordcloud's default stop words with the custom list.
# union() builds a new set, so the library's module-level STOPWORDS is left
# untouched (word clouds that rely on the defaults keep the same result on a
# re-run) and `stopwords` is defined even when `badwords` is empty.
stopwords = STOPWORDS.union(badwords)

In [None]:
# Word cloud of all tweets drawn inside the mask image, using the extended stop-word set.
wc = WordCloud(
    max_words=1500,
    mask=trump_mask,
    stopwords=stopwords,
    mode='RGB',
    background_color='white',
    colormap="inferno",
).generate(tweet_words)

plt.figure(figsize=(20, 15))
plt.imshow(wc, interpolation='bilinear')
plt.tight_layout(pad=0)
plt.axis("off")
plt.title('Most tweets word by Trump', fontsize=20, color='b')
plt.show()

In [None]:
# Preview the frame before plotting the tweet_length column.
df.head()

In [None]:
# Box plot of the tweet lengths
sns.boxplot(data=df, x='tweet_length', palette='Set2')

In [None]:
# Kernel density estimate of the tweet lengths.
plt.figure(figsize=(12, 6))
sns.kdeplot(data=df, x='tweet_length')

We see that all of the tweets are between 150 and 280 characters long.

In [None]:
import warnings
# Suppress every warning from here on.
warnings.simplefilter('ignore')

In [None]:
# Grid of violin plots of tweet length: one row per month, one column per year.
g = sns.FacetGrid(data=df, col="year", row='month')
g.map(sns.violinplot, "tweet_length")

#### Insults by year

In [None]:
# Number of insults recorded per year.
year_insult = df.groupby('year')[['insult']].count()

In [None]:
# Bar chart: insult count per year.
sns.set_style('whitegrid')
plt.figure(figsize=(14, 6))
sns.barplot(data=year_insult, x=year_insult.index, y='insult', palette='coolwarm')
plt.title('Insults vs year', fontsize=16)
plt.show()

In [None]:
# Line chart: insult count per year.
sns.set_style('whitegrid')
plt.figure(figsize=(14, 6))
sns.lineplot(data=year_insult, x=year_insult.index, y='insult')
plt.title('Insults by year', fontsize=16)
plt.show()

The largest number of insult tweets was posted in 2020.

In [None]:
# Keep only the rows from the year 2020.
in_2020 = df['year'].eq(2020)
df_2020 = df[in_2020]

In [None]:
# Bar chart of the number of tweets per month in 2020.
df_2020.groupby('month')[['tweet']].count().plot.bar(figsize=(12, 7))

It looks like September and October had the highest number of tweets in 2020.

#### Insults by months

In [None]:
# Number of insults recorded per calendar month (all years combined).
month_insults = df.groupby('month')[['insult']].count()

In [None]:
# Bar chart: insult count per month.
sns.set_style('whitegrid')
plt.figure(figsize=(14, 6))
sns.barplot(data=month_insults, x=month_insults.index, y='insult', palette='coolwarm')
plt.title('Insults vs Month', fontsize=16)
plt.show()

In [None]:
# Line chart: insult count per month.
sns.set_style('whitegrid')
plt.figure(figsize=(14, 6))
sns.lineplot(data=month_insults, x=month_insults.index, y='insult')
plt.title('Insults by Month', fontsize=16)
plt.show()

In [None]:
# Preview the frame before grouping by day of year.
df.head()

In [None]:
# Number of insults recorded per day of the year (all years combined).
insultbyday = df.groupby('dayofyear')[['insult']].count()

In [None]:
# Line chart: insult count per day of the year.
sns.set_style('whitegrid')
plt.figure(figsize=(20, 6))
sns.lineplot(data=insultbyday, x=insultbyday.index, y='insult')
plt.title('Insult by Day', fontsize=16)
plt.show()

In [None]:
# Show the first two rows of the final frame before exporting it.
df.head(2)

In [None]:
# Save the cleaned data. index=False keeps the row index out of the file, so
# reloading it does not produce another "Unnamed: 0" column like the one this
# notebook had to drop from the raw data.
df.to_csv('twitter_clean_data.csv', index=False)