# Trump's Twitter insults
Hello everybody! Today we will be visualising Trump's various insults over the years.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.express as px
import matplotlib.pyplot as plt
from collections import Counter
from wordcloud import WordCloud
from sklearn.svm import LinearSVC
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(root, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the insult-tweet CSV and discard the 'Unnamed: 0' column that came with it.
trump = pd.read_csv(
    '../input/all-trumps-twitter-insults-20152021/trump_insult_tweets_2014_to_2021.csv'
).drop(columns='Unnamed: 0')

### Description of features
* **date** - day that the tweet was released
* **target** - subject of ridicule in tweet
* **insult** - offensive statement
* **tweet** - corpus of text released by Trump

In [None]:
# Preview the first rows of the loaded frame (rendered as the cell output).
trump.head()

# Target

In [None]:
# Tally how many rows name each target; the tally's index becomes the frame's index.
target_tally = pd.Series(Counter(trump['target']))
df1 = pd.DataFrame({
    'Target': target_tally.keys(),
    'Number of insults': target_tally,
})
df1 = df1.sort_values(by='Number of insults', ascending=False)

# Keep only targets with more than 50 rows so the bar chart stays readable.
df1 = df1.loc[df1['Number of insults'] > 50]

fig = px.bar(
    df1,
    x='Target',
    y='Number of insults',
    color='Number of insults',
    title='Target of insult',
)
fig.show()

# Insult

In [None]:
# Tally how many rows carry each insult phrase; the tally's index becomes the frame's index.
insult_tally = pd.Series(Counter(trump['insult']))
df2 = pd.DataFrame({
    'Insult': insult_tally.keys(),
    'Number of insults': insult_tally,
})
df2 = df2.sort_values(by='Number of insults', ascending=False)

# Keep only insults that occur more than 18 times.
df2 = df2.loc[df2['Number of insults'] > 18]

fig = px.bar(
    df2,
    x='Insult',
    y='Number of insults',
    color='Number of insults',
    title='Type of insult',
    height=700,
)
fig.show()

# Insult per target

In [None]:
# One bar chart per target, for the first five rows of df1 (sorted by insult count).
for target in df1['Target'].head(5):
    insults_for_target = trump.loc[trump['target'] == target, 'insult']
    tally = pd.Series(Counter(insults_for_target))

    d = pd.DataFrame({'Insult': tally.keys(), 'Number of insults': tally})
    d = d.sort_values(by='Number of insults', ascending=False)
    # Drop insults used three times or fewer against this target.
    d = d.loc[d['Number of insults'] > 3]

    fig = px.bar(d, x='Insult', y='Number of insults', color='Number of insults', title=target)
    fig.show()

# Tweets per month

In [None]:
# Bucket every row by the first seven characters of its date string and count the buckets.
monthly_tally = Counter(stamp[:7] for stamp in trump['date'])
df = pd.DataFrame({
    'Date of tweet': list(monthly_tally.keys()),
    'Number of tweets': list(monthly_tally.values()),
})

fig = px.line(df, x='Date of tweet', y='Number of tweets', title='Tweets per month')
fig.show()

# Average number of words per tweet

In [None]:
# Average number of words per tweet for every calendar month ('YYYY-MM').
#
# Outputs consumed by the plotting cell that follows:
#   months     - list of 'YYYY-MM' labels, in ascending order
#   month_list - mean word count per tweet for the matching label, rounded to 2 dp
#   datas      - the tweet Series belonging to each label (kept for inspection)

# The year column is still written to `trump` because later cells group on it.
trump['year'] = trump['date'].str[:4]

# Group on a derived key instead of writing a 'month' column onto groupby
# sub-frames, which mutates a slice and triggers SettingWithCopyWarning.
month_key = trump['date'].str[:7]

# Split on any whitespace so repeated spaces are not counted as empty "words"
# and newline-separated words are not merged into one.
words_per_tweet = trump['tweet'].str.split().str.len()
monthly_mean = words_per_tweet.groupby(month_key).mean().round(2)

months = monthly_mean.index.tolist()
month_list = monthly_mean.tolist()
datas = [tweets for _, tweets in trump['tweet'].groupby(month_key)]

In [None]:
# Line chart of the per-month averages held in `months` / `month_list`.
words_column = 'Average number of words per tweet'
df = pd.DataFrame({'Date of tweet': months, words_column: month_list})

fig = px.line(
    df,
    x='Date of tweet',
    y=words_column,
    title='Number of words in tweets per month',
)
fig.show()

# Number of tweets per target

In [None]:
# Monthly tweet counts for each of the first five targets in df1.
#
# The target name is carried through the loop directly. The earlier version
# recovered it with `years.index(...)`, which compares lists by value and so
# returns the wrong name whenever two targets share identical month lists.
for name in df1['Target'].iloc[:5]:
    target_dates = trump.loc[trump['target'] == name, 'date']
    count = Counter(stamp[:7] for stamp in target_dates)

    label = 'Number of tweets against ' + name
    # 'YYYY-MM' strings sort chronologically, so the line is drawn left to right
    # regardless of the row order in the source data.
    df = pd.DataFrame(sorted(count.items()), columns=['Date of tweet', label])

    fig = px.line(df, x='Date of tweet', y=label, title=name)
    fig.show()

# Number of tweets per insult

In [None]:
# Monthly tweet counts for each of the first five insult phrases in df2.
#
# The insult is carried through the loop directly. The earlier version
# recovered it with `years.index(...)`, which compares lists by value and so
# returns the wrong phrase whenever two insults share identical month lists.
for name in df2['Insult'].iloc[:5]:
    insult_dates = trump.loc[trump['insult'] == name, 'date']
    count = Counter(stamp[:7] for stamp in insult_dates)

    # `name` is an insult phrase here, not a person, so the axis label says
    # "using" rather than the copy-pasted "against".
    label = 'Number of tweets using "' + name + '"'
    # 'YYYY-MM' strings sort chronologically, so the line is drawn left to right
    # regardless of the row order in the source data.
    df = pd.DataFrame(sorted(count.items()), columns=['Date of tweet', label])

    fig = px.line(df, x='Date of tweet', y=label, title=name)
    fig.show()

# Percentage of insults over the years

In [None]:
# Share of rows per year, taking the year from the first four characters of the date.
yearly_tally = Counter(int(stamp[:4]) for stamp in trump['date'])
df = pd.DataFrame({
    'Year': list(yearly_tally.keys()),
    'Number of tweets': list(yearly_tally.values()),
})

fig = px.pie(df, names='Year', values='Number of tweets')
legend_title = dict(text='Years', font=dict(size=18))
fig.update_layout(legend_title=legend_title)
fig.show()

# Percentage of insults per month over the years

In [None]:
# Month names indexed so that position 0 is January.
months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
# (Re)write the year column as integers parsed from the date string.
trump['year'] = [int(stamp[:4]) for stamp in trump['date']]

# One pie chart per year showing how that year's rows are spread across months.
for year_value, year_frame in trump.groupby('year'):
    per_month = Counter(int(stamp[5:7]) for stamp in year_frame['date'])

    df = pd.DataFrame({
        # Month numbers are 1-based, the `months` list is 0-based.
        'Month': [months[number - 1] for number in per_month.keys()],
        'Number of tweets': list(per_month.values()),
    })

    fig = px.pie(df, names='Month', values='Number of tweets')
    legend_title = dict(text='Months in ' + str(year_value), font=dict(size=18))
    fig.update_layout(legend_title=legend_title)
    fig.show()

# Wordclouds for most common words

In [None]:
# One word cloud per year, built from every tweet of that year.
# Relies on the 'year' column that earlier cells add to `trump`.
for year_value, year_frame in trump.groupby('year'):
    # Pass the full text to WordCloud. The earlier version de-duplicated the
    # tokens first, which discarded the frequencies that decide word size, so
    # the cloud could not show the most common words.
    words = ' '.join(year_frame['tweet'])
    wordcloud = WordCloud(background_color='white').generate(words)

    fig, ax = plt.subplots(1, 1, figsize=(10, 10))
    ax.imshow(wordcloud)
    ax.set_title(year_value, size=30)
    ax.axis('off')
    plt.show()
    # Release the figure explicitly; this loop creates one per year.
    plt.close(fig)

# Predicting target based on tweet

In [None]:
# One-hot encode the insult phrases and attach them to the frame by index,
# then remove the original text column and fill any missing values with 'null'.
# NOTE: this rebinds `trump`, so the cell cannot be re-run without reloading the data
# (the 'insult' column no longer exists afterwards).
insult_dummies = pd.get_dummies(trump['insult'])
trump = (
    trump
    .merge(insult_dummies, left_index=True, right_index=True)
    .drop(columns='insult')
    .fillna('null')
)

# Replace target names with integer class labels for the classifier.
target_encoder = LabelEncoder()
trump['target'] = target_encoder.fit_transform(trump['target'])

In [None]:
# Predict the (label-encoded) target of a tweet from its text using
# bag-of-words counts -> TF-IDF weights -> linear SVM.

# Fixed seed so the split, the fitted model and both scores are reproducible.
RANDOM_STATE = 42

X = trump['tweet']
y = trump['target']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE)

# Fit the vocabulary on the training text only, then reuse it for the test text.
cv = CountVectorizer()
X_train = cv.fit_transform(X_train)
X_test = cv.transform(X_test)

# Same rule for the IDF weights: learned from training data, applied to test data.
tfidf = TfidfTransformer()
X_train = tfidf.fit_transform(X_train)
X_test = tfidf.transform(X_test)

model = LinearSVC(random_state=RANDOM_STATE)
model.fit(X_train, y_train)
print('Model score:    ', model.score(X_test, y_test))

# Cross-validate on the training split. The earlier version cross-validated on
# the 20% test split, which both shrinks the folds and reuses the hold-out data
# for model fitting. A fresh estimator is passed so `model` stays as fitted above.
# NOTE: the TF-IDF weights were learned on the whole training split, so the folds
# are not fully independent; a Pipeline would be needed to remove that.
cv_scores = cross_val_score(LinearSVC(random_state=RANDOM_STATE), X_train, y_train)
print('Cross val score:', cv_scores.mean())

# Democrats and Republicans

In [None]:
# Load the second dataset used in this notebook (tweets by party politicians).
data = pd.read_csv('../input/democratvsrepublicantweets/ExtractedTweets.csv')
# Preview the first rows (rendered as the cell output).
data.head()

### Description of features
* **Party** - Political party of politician
* **Handle** - Politician's Twitter handle
* **Tweet** - Corpus of tweet

# Number of tweets per party

In [None]:
# Share of rows belonging to each party.
party_tally = Counter(data['Party'])
df = pd.DataFrame({
    'Party': list(party_tally.keys()),
    'Number of tweets': list(party_tally.values()),
})

fig = px.pie(df, names='Party', values='Number of tweets')
legend_title = dict(text='Party', font=dict(size=18))
fig.update_layout(legend_title=legend_title)
fig.show()

# Wordcloud for 10 politicians

In [None]:
# Word clouds for the first five handles (in sorted order) of each party.
# `users` maps party -> list of the handles that were plotted; a later cell reads it.
users = {'Democrat': [], 'Republican': []}

# One row per handle (its first occurrence), ordered by handle. A handle's party
# is taken from that first row. Computing this once replaces the earlier
# per-handle, per-party scan of the whole frame.
first_rows = data.drop_duplicates('Handle').sort_values('Handle')

for party in users:
    chosen = first_rows.loc[first_rows['Party'] == party, 'Handle'].head(5)
    for person in chosen:
        users[party].append(person)

        # Pass the full text to WordCloud. The earlier version de-duplicated the
        # tokens first, which discarded the frequencies that decide word size.
        words = ' '.join(data.loc[data['Handle'] == person, 'Tweet'])
        wordcloud = WordCloud(background_color='white').generate(words)

        fig, ax = plt.subplots(1, 1, figsize=(10, 10))
        ax.imshow(wordcloud)
        ax.set_title(person + ' ' + party, size=30)
        ax.axis('off')
        plt.show()
        # Release the figure explicitly; this loop creates up to ten of them.
        plt.close(fig)

# Number of words in tweets per politician

In [None]:
# Total number of words tweeted by each selected politician, one pie chart per party.
# Reads `users` (party -> list of handles) built by the word-cloud cell.
#
# The totals are built per party from `users` itself. The earlier version used a
# fixed 2x5 grid of zeros, which fails when a party has fewer than five handles,
# and found the party with `party_counts.index(values)`, which picks the wrong
# party when both lists of totals are equal.
party_counts = []
for party, handles in users.items():
    values = []
    for user in handles:
        user_tweets = data.loc[data['Handle'] == user, 'Tweet']
        # Whitespace-aware split so repeated spaces are not counted as words.
        values.append(sum(len(tweet.split()) for tweet in user_tweets))
    party_counts.append(values)

    df = pd.DataFrame({'User': handles, 'Number of words in tweets': values})
    fig = px.pie(df, names='User', values='Number of words in tweets')
    fig.update_layout(legend_title=dict(text=party, font=dict(size=18)))
    fig.show()

## Thank you for reading this notebook.
## If you enjoyed this notebook and found it helpful, please give it an upvote and provide feedback, as it would help me make more of these.