## Importing Libraries

In [None]:
import re
import string
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import collections
import seaborn as sns
from ast import literal_eval
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

## Set Parameters

In [None]:
# Default size for every matplotlib figure created in this notebook.
plt.rcParams['figure.figsize'] = (16, 8)
# Switch on seaborn's default theme (kept after the rcParams line, as before).
sns.set()
# Stop-word set handed to WordCloud further down.
sw = set(STOPWORDS)

## Load Data

In [None]:
# Read the raw tweet export into `df` and preview its first rows.
df = pd.read_csv(filepath_or_buffer="../input/trumps-legacy/Trumps Legcy.csv")
df.head(n=5)

## Clean Tweets

#### Emoji Pattern

In [None]:
# Matches runs of emoji / pictographic symbols so they can be stripped from text.
# Every range is deliberately narrow: a single span running from U+24C2 up to
# U+1F251 would also match CJK, Hangul and most other non-Latin scripts.
emoji_pattern = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (regional indicator symbols)
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002702-\U000027B0"  # dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U000024C2"             # circled latin capital letter M
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "\U0001F200-\U0001F251"  # enclosed ideographic supplement
    "]+",
    flags=re.UNICODE,
)

#### Cleaning

In [None]:
# Built once so the stop-word set is not reconstructed for every tweet.
_ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))


def clean_tweets(tweet):
    """Return a lower-cased, space-joined version of *tweet* without noise tokens.

    Steps, in order:
      1. remove colons and one specific mis-encoded character sequence,
      2. replace every run of non-ASCII characters with a single space,
      3. strip anything ``emoji_pattern`` still matches,
      4. tokenize the cleaned text and drop English stop words and tokens that
         are found in ``string.punctuation``.

    Args:
        tweet: Raw tweet text. Anything that is not a ``str`` (for example a
            missing value) yields an empty string.

    Returns:
        The remaining tokens joined by single spaces, lower-cased.
    """
    if not isinstance(tweet, str):
        return ''

    tweet = re.sub(r':', '', tweet)
    # NOTE(review): removing ':' also removes the colon of emoticons such as
    # ':)' — confirm this is intended before relying on emoticon matching.
    tweet = re.sub(r'‚Ä¶', '', tweet)  # presumably a mis-decoded ellipsis; verify
    tweet = re.sub(r'[^\x00-\x7F]+', ' ', tweet)

    # Step 2 already replaced all non-ASCII text, so this is a safety net only.
    tweet = emoji_pattern.sub(r'', tweet)

    # Tokenize the CLEANED text; tokenizing the raw tweet would discard the
    # substitutions above.
    kept_tokens = [
        token
        for token in word_tokenize(tweet)
        # The stop-word list is lower-case, so compare case-insensitively.
        if token.lower() not in _ENGLISH_STOP_WORDS
        and token not in string.punctuation
    ]
    return ' '.join(kept_tokens).lower()

# Clean every tweet in place; the function is passed directly, no wrapper needed.
df['text'] = df['text'].apply(clean_tweets)

In [None]:
# Preview the first five rows after cleaning the `text` column.
df.head(n=5)

## Utilizing External Lexicon Data

In [None]:
# Load the tab-separated DepecheMood lexicon.
emo_words_new = pd.read_csv('../input/emolex/DepecheMood.tsv', delimiter='\t')

# Only the numeric columns take part in the row-wise maximum; including a text
# column makes `max(axis=1)` fail on pandas versions that no longer drop
# non-numeric columns silently.
# (presumably one numeric score column per emotion — verify against the file)
emotion_scores = emo_words_new.select_dtypes(include='number')

# Flag the cell(s) equal to each row's maximum, then "dot" the boolean frame
# with the column names: True keeps the name, False contributes ''. A row with
# tied maxima therefore gets the tied column names concatenated.
is_row_max = emotion_scores.eq(emotion_scores.max(axis=1), axis=0)
emo_words_new['EMOTIONS'] = list(is_row_max.dot(emotion_scores.columns))
emo_words_new.head()

In [None]:
# Install the xlrd package into the notebook environment.
# NOTE(review): recent xlrd releases read only legacy .xls files, while the
# workbooks loaded below are .xlsx — confirm this install is still needed.
!pip install xlrd

In [None]:
# Install openpyxl into the notebook environment.
# (presumably the engine pandas uses for the .xlsx files read below — verify)
!pip install openpyxl

In [None]:
# Load the NRC EmoLex workbook, keep only rows whose `association` flag is 1,
# then discard the flag column since it is constant afterwards.
emo_words = (
    pd.read_excel('../input/emolex/NRC EmoLex.xlsx')
    .loc[lambda frame: frame['association'] == 1]
    .drop(columns=['association'])
)

In [None]:
# Read the positive and negative word lists, in that order.
pos_words, neg_words = map(
    pd.read_excel,
    (
        '/kaggle/input/sentiment-lexicons/pos-words.xlsx',
        '/kaggle/input/sentiment-lexicons/neg-words.xlsx',
    ),
)

In [None]:
# Keep only the EmoLex rows whose emotion label is a polarity (not an emotion).
filters = ['positive', 'negative']
emolex_sents = emo_words.loc[emo_words['emotion'].isin(filters)]
emolex_sents.head(n=5)

In [None]:
# One polarity label per word in each external list.
n_pos = ['positive'] * len(pos_words)
n_neg = ['negative'] * len(neg_words)

# Give both word lists the same (word, emotion) layout as `emolex_sents`.
temp_pos_df = pd.DataFrame({'word': list(pos_words.words), 'emotion': n_pos})
temp_neg_df = pd.DataFrame({'word': list(neg_words.words), 'emotion': n_neg})

# Stack positives on top of negatives, then append both below the EmoLex rows.
temp_final = pd.concat([temp_pos_df, temp_neg_df])
emolex_sents = pd.concat([emolex_sents, temp_final])

emolex_sents.head(n=5)

## Extracting Positive, Negative Words From Tweets

In [None]:
# Holds the lexicon object the lookup was built from, plus the lookup itself.
_POLARITY_CACHE = {'source': None, 'lookup': {}}


def _polarity_lookup():
    """Return a ``word -> emotion`` dict built from the current ``emolex_sents``.

    The dict is rebuilt only when ``emolex_sents`` refers to a different object
    than the one used last time; in-place edits of the same frame are not
    detected. For a word listed several times, the first row wins.
    """
    if _POLARITY_CACHE['source'] is not emolex_sents:
        lookup = {}
        # Column position 1 holds the emotion label read by the original lookup.
        for word, emotion in zip(emolex_sents.word, emolex_sents.iloc[:, 1]):
            lookup.setdefault(word, emotion)
        _POLARITY_CACHE['source'] = emolex_sents
        _POLARITY_CACHE['lookup'] = lookup
    return _POLARITY_CACHE['lookup']


def pos_neg_words(text):
    """Split the whitespace-separated words of *text* by lexicon polarity.

    Args:
        text: A string; it is split on whitespace and each piece is looked up
            verbatim (no case folding) in ``emolex_sents``.

    Returns:
        A ``(pos, neg)`` tuple of lists holding, in order of appearance and
        with repeats, the words labelled 'positive' and 'negative'.
    """
    lookup = _polarity_lookup()
    pos = []
    neg = []
    for word in text.split():
        # One dict probe replaces a full list scan plus a boolean-mask filter.
        emo = lookup.get(word)
        if emo == 'positive':
            pos.append(word)
        elif emo == 'negative':
            neg.append(word)
    return pos, neg

In [None]:
texts = []

# Run the lexicon lookup once per tweet, then unzip the (pos, neg) pairs.
polarity_pairs = [pos_neg_words(text) for text in df.text]
pos = [positives for positives, _ in polarity_pairs]
neg = [negatives for _, negatives in polarity_pairs]

# Store each tweet's word lists next to its text.
df['positive_words'] = pos
df['negative_words'] = neg

In [None]:
# Preview the first five rows, now including the two word-list columns.
df.head(n=5)

## Extracting Emotions From Tweets

In [None]:
# Happy Emoticons
emoticons_happy = set([
    ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
    ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
    '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
    'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
    '<3'
    ])
 
# Sad Emoticons
emoticons_sad = set([
    ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
    ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
    ':c', ':{', '>:\\', ';('
    ])

In [None]:
# Holds the lexicon object the lookup was built from, plus the lookup itself.
_EMOTION_CACHE = {'source': None, 'lookup': {}}


def _emotion_lookup():
    """Return a ``word -> emotion`` dict built from the current ``emo_words_new``.

    The dict is rebuilt only when ``emo_words_new`` refers to a different
    object than the one used last time; in-place edits of the same frame are
    not detected. For a word listed several times, the first row wins.
    """
    if _EMOTION_CACHE['source'] is not emo_words_new:
        lookup = {}
        # The last column is the one the original per-word lookup read.
        for word, emotion in zip(emo_words_new.WORDS, emo_words_new.iloc[:, -1]):
            lookup.setdefault(word, emotion)
        _EMOTION_CACHE['source'] = emo_words_new
        _EMOTION_CACHE['lookup'] = lookup
    return _EMOTION_CACHE['lookup']


def emotion_of_text(text):
    """Return the distinct emotions found in *text*.

    Each whitespace-separated word is looked up verbatim in ``emo_words_new``;
    a hit contributes that entry's emotion label, lower-cased. Words found in
    ``emoticons_happy`` / ``emoticons_sad`` contribute 'happy' / 'sad'.

    Args:
        text: A string to scan.

    Returns:
        A sorted list of unique emotion labels (empty when nothing matched).
        Sorting makes the result reproducible from run to run.
    """
    lookup = _emotion_lookup()
    emo = set()
    for word in text.split():
        # One dict probe replaces a full list scan plus a boolean-mask filter.
        if word in lookup:
            emo.add(lookup[word].lower())
        if word in emoticons_happy:
            emo.add('happy')
        if word in emoticons_sad:
            emo.add('sad')
    return sorted(emo)

#### Uncomment the code in the cell below to extract emotions. To save time, I already ran it on a personal machine and load the resulting .csv instead.

In [None]:
# emo = []

# for text in df.text:
#     emo.append(emotion_of_text(text))

# df['emotions_in_tweet'] = emo

In [None]:
# Replace `df` with the precomputed export and preview its first rows.
df = pd.read_csv(filepath_or_buffer='../input/trumpp/trump.csv')
df.head(n=5)

## 20 Most Common Positive Words

In [None]:
the_list = list(df.text)
p_words = list(df.positive_words)

# Each cell holds a list serialised as text; parse it and pool all the words.
flat_list = []
for serialized_words in p_words:
    flat_list.extend(literal_eval(serialized_words))

# Count the pooled words and chart the 20 most frequent as horizontal bars.
p_words_count = collections.Counter(flat_list)
top_positive = p_words_count.most_common(20)
df2 = pd.DataFrame(top_positive, columns=['word', 'frequency'])
df2.plot.barh(x='word', figsize=(16, 8))

## 20 Most Common Negative Words

In [None]:
n_words = list(df.negative_words)

# Each cell holds a list serialised as text; parse it and pool all the words.
flat_list = []
for serialized_words in n_words:
    flat_list.extend(literal_eval(serialized_words))

# Count the pooled words and chart the 20 most frequent as horizontal bars.
n_words_count = collections.Counter(flat_list)
top_negative = n_words_count.most_common(20)
df3 = pd.DataFrame(top_negative, columns=['word', 'frequency'])
df3.plot.barh(x='word', figsize=(16, 8))

## Emotions Distribution

In [None]:
# Tally how many tweets carry each emotion label. Every cell of
# `emotions_in_tweet` is a list serialised as text, hence the literal_eval.
e_words_count = collections.Counter()
for emo_list in df.emotions_in_tweet:
    e_words_count.update(literal_eval(emo_list))

plt.pie(
    [float(v) for v in e_words_count.values()],
    labels=list(e_words_count),
    autopct=None,
    startangle=140,
    # One offset per wedge: `explode` must match the number of values, so it
    # is sized from the data instead of assuming a fixed number of emotions.
    explode=[0.02] * len(e_words_count),
)
plt.axis('equal')
plt.show()

## Words Behind Most Dominant Emotion (Inspired)

In [None]:
# Collect the polarity words of every tweet tagged 'inspired' and build a word
# cloud from them. `neg_p` / `pos_p` total the distinct negative / positive
# words per matching tweet.
# NOTE(review): the last three columns are read by position; presumably they
# are positive_words, negative_words, emotions_in_tweet — confirm against the
# loaded csv.
neg_p = 0
pos_p = 0
# Despite its name, this holds the words of 'inspired' tweets.
angry_words = []
for _, row in df.iterrows():
    # `.iloc` makes the positional access explicit; plain `row[-1]` on a
    # label-indexed Series relies on a deprecated integer fallback.
    if 'inspired' not in literal_eval(row.iloc[-1]):
        continue
    # Parse each serialised list once and reuse it for count and collection.
    negative_words = literal_eval(row.iloc[-2])
    positive_words = literal_eval(row.iloc[-3])
    neg_p += len(set(negative_words))
    angry_words.append(negative_words)
    pos_p += len(set(positive_words))
    angry_words.append(positive_words)

# Flatten the per-tweet lists into one space-separated string for WordCloud.
angry_words = ' '.join([item for sublist in angry_words for item in sublist])
wordcloud = WordCloud(
    width=600,
    height=600,
    background_color='white',
    stopwords=sw,
    min_font_size=10,
).generate(angry_words)

#### Findings:
* 62% Positive Words
* 38% Negative Words

In [None]:
# plot the WordCloud image                        
# Draw the word cloud on a square, axis-free canvas with no padding.
cloud_figure = plt.figure(figsize=(8, 8), facecolor=None)
cloud_axes = cloud_figure.gca()
cloud_axes.imshow(wordcloud)
cloud_axes.axis("off")
cloud_figure.tight_layout(pad=0)

plt.show()