In [2]:
from nltk import download
from html.parser import HTMLParser
import re
from nltk.corpus import stopwords
import string
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from itertools import groupby
import pandas as pd
import json
import numpy as np
import ast
from collections import Counter
import glob
import os
import warnings
from sklearn.feature_extraction.text import CountVectorizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.decomposition import PCA, LatentDirichletAllocation, NMF
from sklearn.feature_extraction.text import TfidfVectorizer

#download('punkt')
#download('wordnet')

In [3]:
warnings.filterwarnings('ignore')

In [4]:
def extract_words(str_of_words):
    """ 
    return the list of words in the string
    """
    new_word_list = []
    word_list = word_tokenize(str_of_words)
    word_list = [word.lower() for word in word_list if word[0].isalpha() and len(word)>1]
        
    return word_list

list_text = 'RT @NRA: #IDontTrustPeopleThat want to ban all guns, repeal #2A, and/or infringe on the right of the people to keep and bear Arms. #NRA #Deâ€¦'

#extract_words(list_text)

In [5]:
# remove stop words
def remove_stop_words(list_words):
    new_words = []
    punctuation = list(string.punctuation)
    stop = stopwords.words("english")+punctuation+['rt', 'via']

    for word in list_words:
        if word not in stop:
            new_words.append(word)
    return new_words

#remove_stop_words(['i', 'luv', 'my'])

In [6]:
def lemmatize_words(list_words):
    """ 
    count number of easy words from the list of words
    """
    lemmatized_words = []
    lemmatizer = WordNetLemmatizer()
    for word in list_words:
        word_n = lemmatizer.lemmatize(word, 'n')
        word_v = lemmatizer.lemmatize(word, 'v')
        lemmatized_words.append(word_n)
        if (word != word_n and word_n not in word_v and word_v not in word_n):
            lemmatized_words.append(word_v)
    return lemmatized_words

print(lemmatize_words(['dont', 'trust', 'people', 'want', 'ban', 'guns', 'repeal', 'and/or', 'infringe', 'right', 'people', 'keep', 'bear', 'arms', 'deâ€¦', 'breitbart', 'news', 'kaladious', 'kids', 'hijacked', 'media/', 'politicians', 'realizing', 'powerful']))

['dont', 'trust', 'people', 'want', 'ban', 'gun', 'repeal', 'and/or', 'infringe', 'right', 'people', 'keep', 'bear', 'arm', 'deâ€¦', 'breitbart', 'news', 'kaladious', 'kid', 'hijacked', 'media/', 'politician', 'realizing', 'powerful']


In [7]:
def perform_stemming(list_words):
    ps = PorterStemmer()
    word_list = [ps.stem(word) for word in list_words]
    return word_list

#print(perform_stemming(['dont', 'trust', 'people', 'want', 'ban', 'guns', 'repeal', 'and/or', 'infringe', 'right', 'people', 'keep', 'bear', 'arms', 'deâ€¦', 'breitbart', 'news', 'kaladious', 'kids', 'hijacked', 'media/', 'politicians', 'realizing', 'powerful']))

In [8]:
# standardize words
def standardize_words(word_list):
    standarized_words = []
    for word in word_list:
        word_l= [x[0] for x in groupby(word)]
        word_l = ('').join(word_l)
        standarized_words.append(word_l)
    return standarized_words

In [9]:
RE_EMOJI = re.compile('[\U00010000-\U0010ffff]', flags=re.UNICODE)
# remove emojis
def strip_emoji(text):
    return RE_EMOJI.sub(r'', text)

#print(strip_emoji('baba black sheep 🙄🤔💗'))

In [10]:
df_tweets = pd.read_csv('Cleaned_Tweets.csv')

In [11]:
df_tweets.shape

(87403, 9)

In [12]:
df_tweets.head()

Unnamed: 0,created_at,entities,retweet_count,retweeted,text,keyword,hashtags,location,screen_name
0,2018-05-25,"{'hashtags': [{'indices': [51, 61], 'text': 'H...",0.0,False,@PressSec @Rambobiggs @TheSlyStallone just ano...,antigun,"['Hollywood', 'AntiGun', 'Anti2ndAmendment', '...",in the middle of nowhere USA,Hiwayman64
1,2018-05-25,"{'hashtags': [{'indices': [17, 25], 'text': 'a...",0.0,False,Chew on this you #antigun #liberals #shooting ...,antigun,"['antigun', 'liberals', 'shooting', 'DavidHogg']",,Infidel_Sniper
2,2018-05-25,"{'hashtags': [{'indices': [49, 65], 'text': 'I...",0.0,False,Shannon Watts of @Everytown Is a Left-Wing Fra...,antigun,['IAmForGunRights'],"Ohio, USA",MachewsMAGA
3,2018-05-25,"{'hashtags': [], 'symbols': [], 'urls': [], 'u...",2.0,False,RT @MachewsMAGA: Dying man's lawsuit claims Mo...,antigun,0,Eden,emalvini
4,2018-05-25,"{'hashtags': [], 'symbols': [], 'urls': [{'dis...",2.0,False,Dying man's lawsuit claims Monsanto covered up...,antigun,0,"Ohio, USA",MachewsMAGA


In [13]:
df_tweets.columns

Index(['created_at', 'entities', 'retweet_count', 'retweeted', 'text',
       'keyword', 'hashtags', 'location', 'screen_name'],
      dtype='object')

In [14]:
df_keyword_counts = pd.DataFrame(pd.value_counts(df_tweets.keyword)).reset_index(level=0)
df_keyword_counts.columns = ['keyword', 'count']
df_keyword_counts.to_csv('keywords_counts.csv')

In [15]:
df_tweet_totals = df_tweets.groupby(['created_at', 
    'keyword'])['text'].count().reset_index().sort_values(by = ['text'], 
                                                                    ascending = False)

df_tweet_totals.to_csv('Tweet_Totals.csv')

In [16]:
df_tweets.describe()

Unnamed: 0,retweet_count
count,87402.0
mean,15.177307
std,352.142868
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,60160.0


In [17]:
text_data = df_tweets['text']
print('before, number of nan:', text_data.isnull().sum())
df_tweets.text.fillna('', inplace=True)
print('after, number of nan:',text_data.isnull().sum())

list_text = list(df_tweets.text)


before, number of nan: 1
after, number of nan: 0


In [18]:
def preprocessing(list_text):
    word_list = []
    i=0
    for tweet in list_text:
        # remove urls
        tweet = re.sub(r"http\S+", "", tweet)
        
        df_tweets.at[i,'text'] =  tweet
        i=i+1
        #remove hashtags
        tweet = re.sub(r"#\S+", '', tweet)
        #remove mentions
        tweet = re.sub(r"@\S+", '', tweet)
        #tweet = " ".join(re.findall('[A-Z][^A-Z]*', tweet))
        tweet = re.sub('/', '', tweet)
        #remove emojis
        tweet = strip_emoji(tweet)
        # convert to lowercase and list of words
        word_list.extend(extract_words(tweet))

    #remove stop words    
    word_list = remove_stop_words(word_list)

    #standardize words
    #std_words = standardize_words(word_list)
    # stemming    
    lemmatized_words = lemmatize_words(word_list)
    return lemmatized_words

In [19]:
# parse html
# split attached words - DONE
# remove urls - DONE
# remove stop words - DONE
# remove emojis - DONE
# slang lookup
# standardizing words - DONE
# lemmatize/stemming - DONE
lemmatized_words = preprocessing(list_text)

In [20]:
df_tweets.head()

Unnamed: 0,created_at,entities,retweet_count,retweeted,text,keyword,hashtags,location,screen_name
0,2018-05-25,"{'hashtags': [{'indices': [51, 61], 'text': 'H...",0.0,False,@PressSec @Rambobiggs @TheSlyStallone just ano...,antigun,"['Hollywood', 'AntiGun', 'Anti2ndAmendment', '...",in the middle of nowhere USA,Hiwayman64
1,2018-05-25,"{'hashtags': [{'indices': [17, 25], 'text': 'a...",0.0,False,Chew on this you #antigun #liberals #shooting ...,antigun,"['antigun', 'liberals', 'shooting', 'DavidHogg']",,Infidel_Sniper
2,2018-05-25,"{'hashtags': [{'indices': [49, 65], 'text': 'I...",0.0,False,Shannon Watts of @Everytown Is a Left-Wing Fra...,antigun,['IAmForGunRights'],"Ohio, USA",MachewsMAGA
3,2018-05-25,"{'hashtags': [], 'symbols': [], 'urls': [], 'u...",2.0,False,RT @MachewsMAGA: Dying man's lawsuit claims Mo...,antigun,0,Eden,emalvini
4,2018-05-25,"{'hashtags': [], 'symbols': [], 'urls': [{'dis...",2.0,False,Dying man's lawsuit claims Monsanto covered up...,antigun,0,"Ohio, USA",MachewsMAGA


### Hashtag Counts

In [21]:
l_hashtags = []

for entity in df_tweets.entities:
    d = ast.literal_eval(entity)
    if d.get('hashtags', False):
        list_hashtags = [dict_ht['text'].lower() for dict_ht in d['hashtags']]
        l_hashtags.extend(list_hashtags)

In [22]:
# most common hashtags
counts = Counter(l_hashtags)

df = pd.DataFrame(counts.most_common(20), columns=['Hashtag', 'Count'])
df.to_csv('hashtag_counts.csv')

In [23]:
hashtag_df = pd.DataFrame.from_dict(list(dict(counts).items()))
hashtag_df.columns = ['keyword', 'count']
sorted_hashtag_df = hashtag_df.sort_values(by='count', ascending=False)

### Word Counts

In [24]:
w_counts = Counter(lemmatized_words)
df = pd.DataFrame(w_counts.most_common(100), columns=['Word', 'Count'])
df.to_csv('word_counts.csv')

### Retweets

In [25]:
df = pd.read_csv('Tweets_retweet_count.csv')
df.head()

Unnamed: 0,Unnamed: 31,contributors,coordinates,created_at,entities,extended_entities,favorite_count,favorited,geo,id,...,quoted_status_id,quoted_status_id_str,retweet_count,retweeted,retweeted_status,source,text,truncated,user,withheld_in_countries
0,,,,5/25/2018 4:30,"{'hashtags': [{'indices': [51, 61], 'text': 'H...",,0.0,False,,9.9987e+17,...,,,0.0,False,,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",@PressSec @Rambobiggs @TheSlyStallone just ano...,False,"{'contributors_enabled': False, 'created_at': ...",
1,,,,5/25/2018 4:16,"{'hashtags': [{'indices': [17, 25], 'text': 'a...",,0.0,False,,9.99867e+17,...,1e+18,1e+18,0.0,False,,"<a href=""http://twitter.com/download/iphone"" r...",Chew on this you #antigun #liberals #shooting ...,True,"{'contributors_enabled': False, 'created_at': ...",
2,,,,5/25/2018 1:58,"{'hashtags': [{'indices': [49, 65], 'text': 'I...",,0.0,False,,9.99832e+17,...,,,0.0,False,,"<a href=""http://twitter.com/download/iphone"" r...",Shannon Watts of @Everytown Is a Left-Wing Fra...,True,"{'contributors_enabled': False, 'created_at': ...",
3,,,,5/25/2018 1:54,"{'hashtags': [], 'symbols': [], 'urls': [], 'u...",,0.0,False,,9.99831e+17,...,,,2.0,False,"{'contributors': None, 'coordinates': None, 'c...","<a href=""http://twitter.com"" rel=""nofollow"">Tw...",RT @MachewsMAGA: Dying man's lawsuit claims Mo...,False,"{'contributors_enabled': False, 'created_at': ...",
4,,,,5/25/2018 1:54,"{'hashtags': [], 'symbols': [], 'urls': [], 'u...",,0.0,False,,9.99831e+17,...,,,2.0,False,"{'contributors': None, 'coordinates': None, 'c...","<a href=""http://twitter.com/download/iphone"" r...",RT @MachewsMAGA: Dying man's lawsuit claims Mo...,False,"{'contributors_enabled': False, 'created_at': ...",


In [26]:
df.text.isnull().sum()
df.text.fillna('', inplace=True)

In [27]:
df_rt = df[df.text.str.contains('^RT')]
df_rt.head()

Unnamed: 0,Unnamed: 31,contributors,coordinates,created_at,entities,extended_entities,favorite_count,favorited,geo,id,...,quoted_status_id,quoted_status_id_str,retweet_count,retweeted,retweeted_status,source,text,truncated,user,withheld_in_countries
3,,,,5/25/2018 1:54,"{'hashtags': [], 'symbols': [], 'urls': [], 'u...",,0.0,False,,9.99831e+17,...,,,2.0,False,"{'contributors': None, 'coordinates': None, 'c...","<a href=""http://twitter.com"" rel=""nofollow"">Tw...",RT @MachewsMAGA: Dying man's lawsuit claims Mo...,False,"{'contributors_enabled': False, 'created_at': ...",
4,,,,5/25/2018 1:54,"{'hashtags': [], 'symbols': [], 'urls': [], 'u...",,0.0,False,,9.99831e+17,...,,,2.0,False,"{'contributors': None, 'coordinates': None, 'c...","<a href=""http://twitter.com/download/iphone"" r...",RT @MachewsMAGA: Dying man's lawsuit claims Mo...,False,"{'contributors_enabled': False, 'created_at': ...",
10,,,,5/24/2018 17:53,"{'hashtags': [{'indices': [27, 35], 'text': 'S...",,0.0,False,,9.9971e+17,...,,,3.0,False,"{'contributors': None, 'coordinates': None, 'c...","<a href=""http://twitter.com"" rel=""nofollow"">Tw...",RT @Info4america: Anti-Gun #Senator Kevin De L...,False,"{'contributors_enabled': False, 'created_at': ...",
17,,,,5/24/2018 14:31,"{'hashtags': [{'indices': [44, 48], 'text': 'D...",,0.0,False,,9.99659e+17,...,,,1.0,False,"{'contributors': None, 'coordinates': None, 'c...","<a href=""http://twitter.com"" rel=""nofollow"">Tw...",RT @Lastdocpa: @AuthorKimberley Perhaps the #D...,False,"{'contributors_enabled': False, 'created_at': ...",
21,,,,5/24/2018 3:40,"{'hashtags': [{'indices': [35, 43], 'text': 'a...",,0.0,False,,9.99495e+17,...,9.99e+17,9.99e+17,1.0,False,"{'contributors': None, 'coordinates': None, 'c...","<a href=""http://stopmadness.com/"" rel=""nofollo...",RT @LibtardsRCrazy: Off the rocker #antigun #l...,False,"{'contributors_enabled': False, 'created_at': ...",


In [28]:
rt_count = df_rt[['text', 'keyword']].groupby(['text', 'keyword']).size().reset_index()
rt_count.columns = ['text', 'keyword', 'count']
rt_count.sort_values(by = ['count'], ascending = False, inplace = True)

In [29]:
rt_count.head(20).to_csv('retweets.csv')

In [30]:
rt_count_1 = df_rt[['keyword']].groupby(['keyword']).size().reset_index()
rt_count_1.columns = ['keyword', 'count']
rt_count_1.sort_values(by = ['count'], ascending = False, inplace = True)

In [31]:
rt_count_1.to_csv('retweets_by_keywords.csv')

### Trump Tweets

In [123]:
df_trump_tweets = pd.read_csv('Trump_Tweets.csv')
text_data = df_trump_tweets['text']
text_data.fillna('', inplace=True)

list_text = list(text_data)

trump_words = preprocessing(list_text)

In [125]:
t_counts = Counter(trump_words)

df = pd.DataFrame(t_counts.most_common(100), columns=['T_Words', 'Count'])
df.to_csv('t_word_count.csv')

### LDA

In [177]:
def generate_df(model, feature_names, n_top_words, keyword):
    topics = []
    for topic_idx, topic in enumerate(model.components_):
        topic = [" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]])]
        topic.append(topic_idx + 1)
        topics.append(topic)
    df = pd.DataFrame(topics)
    df.columns = ["topic", "topic_numer"]
    df["keyword"] = keyword
    return df

def lda_model(df, keyword, n_topic = 5, n_word = 5, max_features = 1000):
    '''model for latent dirichlect allocation'''
    lda = LatentDirichletAllocation(n_components=n_topic, max_iter=10, learning_method='online', learning_offset=10., random_state=42)
    tfid = TfidfVectorizer(max_df=0.95, min_df=3, max_features = max_features)
    tfidf_text = tfid.fit_transform(df[df.keyword == keyword].text)
    lda_text = lda.fit(tfidf_text)
    tfidf_feature_names = tfid.get_feature_names()
    lda_df = generate_df(lda_text, tfidf_feature_names, n_word, keyword)
    return lda_df

In [178]:
lda_nra_convention = lda_model(df_tweets, 'nraconvention', 4, 4)
print(lda_nra_convention)

                        topic  topic_numer        keyword
0                in the of it            1  nraconvention
1  at the realdonaldtrump you            2  nraconvention
2         remember was do the            3  nraconvention
3     nraconvention rt the to            4  nraconvention


In [179]:
lda_gunviolence = lda_model(df_tweets, 'gunviolence', 4, 4)
print(lda_gunviolence)

                                  topic  topic_numer      keyword
0  gunviolence santafe guncontrolnow rt            1  gunviolence
1                        the and to for            2  gunviolence
2  nra realdonaldtrump gunreformnow gop            3  gunviolence
3                          the to of in            4  gunviolence


In [180]:
lda_antigun = lda_model(df_tweets, 'antigun', 4, 4)
print(lda_antigun)

                           topic  topic_numer  keyword
0  this nra antigun 2ndamendment            1  antigun
1                to about the as            2  antigun
2             to that so antigun            3  antigun
3     2adefenders is and antigun            4  antigun


In [181]:
lda_school_shooting = lda_model(df_tweets, 'IfIdieInASchoolShooting', 4, 4)
print(lda_school_shooting)

                                    topic  topic_numer  \
0                       never to get will            1   
1       my it and ifidieinaschoolshooting            2   
2      ifidieinaschoolshooting my will be            3   
3  the is ifidieinaschoolshooting hashtag            4   

                   keyword  
0  IfIdieInASchoolShooting  
1  IfIdieInASchoolShooting  
2  IfIdieInASchoolShooting  
3  IfIdieInASchoolShooting  


In [182]:
lda_guncontrol = lda_model(df_tweets, 'guncontrol', 4, 4)
print(lda_guncontrol)

                                           topic  topic_numer     keyword
0  guncontrol guncontrolnow rt santafehighschool            1  guncontrol
1      realdonaldtrump santafe guncontrol enough            2  guncontrol
2                         the school in shooting            3  guncontrol
3                                  to the you we            4  guncontrol


In [183]:
lda_nra = lda_model(df_tweets, 'nra', 4, 4)
print(lda_nra)

               topic  topic_numer keyword
0     the to you and            1     nra
1     nra 2a gun new            2     nra
2    nra in the here            3     nra
3  maga rt trump nra            4     nra


In [184]:
lda_df = pd.concat([lda_nra_convention, lda_gunviolence, lda_antigun, lda_school_shooting, lda_guncontrol, lda_nra])
lda_df.to_csv('lda_data.csv')