# Check (That Tweet) Yo Self 
## Prioritizing Tweets to Fact Check
###### Part 9: Cluster NLP EDA
This notebook looks at the top words and phrases for each cluster.

Import necessary libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
import time
import warnings
import regex as re
import seaborn as sns
import re
import statistics

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.linear_model import Ridge 
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

from nltk.sentiment.vader import SentimentIntensityAnalyzer
warnings.filterwarnings('ignore')
np.random.seed(824)
from bs4 import BeautifulSoup 

# Import stopwords.
from nltk.corpus import stopwords # Import the stopword list
import nltk

from tweetscrape.users_scrape import TweetScrapperUser

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

plt.style.use('fivethirtyeight')

Read in csv

In [2]:
# Load the tweets; the preview below shows `user_group_db` and `user_group` columns.
tweet = pd.read_csv('../data/user_cluster_tweets.csv')

In [3]:
# Preview the first five rows of the loaded frame.
tweet.head()

Unnamed: 0,id,time,author,author_id,associated_tweet,text,links,hashtags,mentions,reply_count,...,ratio,has_url,has_location,has_bio,len_bio,ratio_num_user,emotional_range,user_group_db,user_group,target
0,1254190074595553281,2020-04-25 16:26:30,Iam_helenna,215204985,1254190074595553281,"Today, we have 1182 cases in Nigeria with 35 d...",[],[''],[''],37,...,1.357576,0,0,1,147,0.0,0.0195,0,0,289
1,1253828209075990531,2020-04-24 16:28:34,KerryeHill,2807727004,1253697753479331840,There's no such thing as a medical disinfectan...,[],[''],[''],1,...,0.241706,0,0,1,89,0.0,0.014,0,0,3
2,1253460644294283265,2020-04-23 16:08:00,Lmt48430438,1232381432988930049,1253460644294283265,Waiting to see how many people drink disinfect...,[],[''],['@DarcysCartoon'],1,...,0.357143,0,0,0,3,0.727273,0.0,1,2,1
3,1254194987945865217,2020-04-25 16:46:01,iamshollyyoung,3096323025,1254194987945865217,Today I know there's no result Nigeria can not...,['https://t.co/RRuHGBH1SI'],['#Covid_19'],[''],1,...,0.470899,1,1,1,103,0.0,0.0216,2,4,55
4,1253835841685934081,2020-04-24 16:58:54,toddcusuman,588727638,1253835841685934081,New York rapper Fred the Godson dies at 35 aft...,['https://t.co/rXOi5YEoZl'],[''],[''],0,...,0.077656,0,0,0,3,0.0,0.0,1,2,0


Apply additional cleaning to the tweet text: remove custom stopwords that occur very frequently but carry no contextual value.

In [4]:
# Stop words that scikit-learn applies for stop_words='english', as a list.
sklearn_stopwords = list(CountVectorizer(stop_words='english').get_stop_words())

# Extra terms to drop on top of scikit-learn's list.
# 'don' and 've' are likely contraction leftovers ("don't", "I've") once
# non-letters are replaced by spaces during cleaning.
custom_stopwords = [
    'com', 'twitter', 'pic', 'http',
    'isolation', 'pandemic', 'covid', 'quarantine', 'vaccine', 'coronavirus',
    'lysol', 'ingest', 'inject', 'disinfectant', 'bleach',
    'don', 've',
]

# Combined list used when cleaning tweets: scikit-learn's words first,
# then the custom additions.
personal_stopwords = [*sklearn_stopwords, *custom_stopwords]

In [5]:
def tweet_to_words(raw_tweet):
    """Reduce one raw tweet to a space-separated string of meaningful words.

    Steps: strip HTML markup, replace every non-letter with a space,
    lower-case, split on whitespace, drop every word found in
    ``personal_stopwords`` and join the remainder with single spaces.

    Parameters
    ----------
    raw_tweet : str
        Raw tweet text. Any non-string value (for example the NaN pandas uses
        for a missing cell) is treated as an empty tweet.

    Returns
    -------
    str
        The cleaned words joined by single spaces; ``""`` when nothing is left.
    """
    # A missing value arrives as a float NaN, not a str; there is no text to
    # clean, so return an empty string instead of handing it to the parser.
    if not isinstance(raw_tweet, str):
        return ""

    # 1. Remove HTML. The parser is named explicitly so the result does not
    #    depend on which optional parser (lxml, html5lib) happens to be installed.
    tweet_text = BeautifulSoup(raw_tweet, "html.parser").get_text()

    # 2. Replace anything that is not an ASCII letter with a space.
    letters_only = re.sub("[^a-zA-Z]", " ", tweet_text)

    # 3. Lower-case and split into individual words.
    words = letters_only.lower().split()

    # 4. A set gives constant-time membership tests for the stop-word filter.
    stops = set(personal_stopwords)

    # 5. Keep only the words that are not stop words.
    meaningful_words = [w for w in words if w not in stops]

    # 6. Join the words back into one space-separated string.
    return " ".join(meaningful_words)

In [6]:
# Row count of the tweet frame; also used by the progress messages of the cleaning loop.
total_tweets = len(tweet.index)
print(f'There are {total_tweets} tweets related to coronavirus.')

There are 33199 tweets related to coronavirus.


In [7]:
# Cleaned text for every tweet, in the same row order as `tweet`.
clean_tweets = []

print("Cleaning and parsing twitter data...")

# Number the tweets from 1 so the progress messages need no off-by-one arithmetic.
for position, raw_text in enumerate(tweet['text_links_removed'], start=1):

    # Clean this tweet and keep the result.
    clean_tweets.append(tweet_to_words(raw_text))

    # Report progress after every 5000th tweet.
    if position % 5000 == 0:
        print(f'Tweet {position} of {total_tweets}.')

Cleaning and parsing twitter data...
Tweet 5000 of 33199.
Tweet 10000 of 33199.
Tweet 15000 of 33199.
Tweet 20000 of 33199.
Tweet 25000 of 33199.
Tweet 30000 of 33199.


In [8]:
# Attach the cleaned text as a new `clean_text` column (values are assigned by row position).
tweet = tweet.assign(clean_text = clean_tweets)

In [9]:
# Per-cluster averages, ordered by mean `target`.
# numeric_only=True is required because the frame also holds text columns
# (author, text, clean_text, ...); pandas 2.x raises TypeError on those
# instead of silently dropping them as older releases did.
tweet.groupby(['user_group_db']).mean(numeric_only=True).sort_values(['target'])

Unnamed: 0_level_0,id,author_id,associated_tweet,reply_count,favorite_count,retweet_count,not_english,hashtag_count,mention_count,word_count,...,big_feelings,ratio,has_url,has_location,has_bio,len_bio,ratio_num_user,emotional_range,user_group,target
user_group_db,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.253942e+18,4.277929e+17,1.253449e+18,0.380268,2.052272,0.273605,0.260551,0.175653,0.14575,25.222745,...,0.2621,1.127193,0.0,0.0,0.0,3.0,0.139479,0.019135,2.0,2.706146
2,1.253957e+18,5.748085e+17,1.253366e+18,0.904011,5.859303,1.370809,0.278667,0.402367,0.202498,26.662722,...,0.267239,2.81375,1.0,1.0,1.0,48.174227,0.106509,0.018795,4.0,8.134122
4,1.253938e+18,5.092492e+17,1.253147e+18,0.587104,7.983032,2.361425,0.271795,0.326923,0.187783,26.228507,...,0.26214,1.995957,0.0,1.0,1.0,87.115385,0.107851,0.019189,4.0,10.931561
0,1.253943e+18,4.373013e+17,1.252977e+18,0.851322,16.968023,3.998599,0.268508,0.263674,0.147034,25.194177,...,0.263564,1.621993,0.0,0.0,1.0,86.773359,0.09209,0.019169,0.0,21.817944
3,1.253964e+18,2.590995e+17,1.253206e+18,1.709344,23.609235,5.484609,0.284257,0.422501,0.1772,25.865568,...,0.267071,11.895564,1.0,0.0,1.0,105.836012,0.033074,0.019445,1.0,30.803187
-1,1.253928e+18,2.153111e+17,1.253079e+18,6.722018,114.163952,27.337888,0.291443,0.291645,0.177614,25.036784,...,0.276124,723.734861,0.459275,0.304256,0.58907,71.158697,0.090221,0.01864,3.658434,148.223857


In [10]:
# One subset of `tweet` per value of `user_group_db`; label -1 is kept as `db_outlier`.
db_outlier = tweet.loc[tweet['user_group_db'].eq(-1)]
db_0 = tweet.loc[tweet['user_group_db'].eq(0)]
db_1 = tweet.loc[tweet['user_group_db'].eq(1)]
db_2 = tweet.loc[tweet['user_group_db'].eq(2)]
db_3 = tweet.loc[tweet['user_group_db'].eq(3)]
db_4 = tweet.loc[tweet['user_group_db'].eq(4)]

In [11]:
def top_trends(df, column, ngram_min, ngram_max, top_word_count):
    """Rank the most frequent n-grams in a text column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text to analyse.
    column : str
        Name of the column in ``df`` with one document per row.
    ngram_min, ngram_max : int
        Inclusive lower and upper n-gram lengths to count.
    top_word_count : int
        Number of highest-ranked n-grams to return.

    Returns
    -------
    pandas.DataFrame
        Indexed by n-gram, with a single ``Frequency`` column holding the mean
        number of occurrences per document, in descending order.

    Notes
    -----
    Prints the vocabulary size followed by a blank line as a side effect.
    """
    # English stop words are removed, and any term that appears in more than
    # 25% of the documents is ignored (max_df=0.25).
    cvec = CountVectorizer(stop_words='english', min_df=1, max_df=0.25,
                           ngram_range=(ngram_min, ngram_max))
    term_mat = cvec.fit_transform(df[column])

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back to the old name on older releases.
    if hasattr(cvec, 'get_feature_names_out'):
        feature_names = cvec.get_feature_names_out()
    else:
        feature_names = cvec.get_feature_names()
    print(f'Number of unique items: {len(feature_names)}')
    print()

    # Average directly on the sparse matrix. Calling .toarray() would allocate
    # a dense documents x vocabulary array, which for n-gram vocabularies of
    # several hundred thousand terms runs to tens of gigabytes.
    n_docs = term_mat.shape[0]
    term_totals = np.asarray(term_mat.sum(axis=0)).ravel()
    frequency = pd.Series(term_totals / n_docs, index=feature_names)

    top_words = (frequency
                 .sort_values(ascending=False)
                 .head(top_word_count)
                 .to_frame(name='Frequency'))
    return top_words

### Grabbing Phrases for Entire Dataframe

In [17]:
# Top 20 single words across all tweets; saved so the round-up section can reload them.
tweet_one = top_trends(tweet, 'clean_text', 1, 1, 20)
tweet_one.to_csv('../data/tweet_one.csv')

Number of unique items: 39641



In [19]:
# Top 20 two-word phrases across all tweets, written to CSV.
tweet_two = top_trends(tweet, 'clean_text', 2, 2, 20)
tweet_two.to_csv('../data/tweet_two.csv')

Number of unique items: 255173



In [21]:
# Top 20 three-word phrases across all tweets, written to CSV.
tweet_three = top_trends(tweet, 'clean_text', 3, 3, 20)
tweet_three.to_csv('../data/tweet_three.csv')

Number of unique items: 288481



In [23]:
# Top 20 four-word phrases across all tweets; saved so the round-up section can reload them.
tweet_four = top_trends(tweet, 'clean_text', 4, 4, 20)
tweet_four.to_csv('../data/tweet_four.csv')

Number of unique items: 265656



### Phrases for Outlier Group

In [25]:
# Top 20 single words for the outlier rows (user_group_db == -1); reloaded in the round-up section.
outlier_one = top_trends(db_outlier, 'clean_text', 1, 1, 20)
outlier_one.to_csv('../data/outlier_one.csv')

Number of unique items: 7063



In [27]:
# Top 20 two-word phrases for the outlier rows (user_group_db == -1), written to CSV.
outlier_two = top_trends(db_outlier, 'clean_text', 2, 2, 20)
outlier_two.to_csv('../data/outlier_two.csv')

Number of unique items: 17765



In [29]:
# Top 20 three-word phrases for the outlier rows (user_group_db == -1), written to CSV.
outlier_three = top_trends(db_outlier, 'clean_text', 3, 3, 20)
outlier_three.to_csv('../data/outlier_three.csv')

Number of unique items: 17210



In [31]:
# Top 20 four-word phrases for the outlier rows (user_group_db == -1); reloaded in the round-up section.
outlier_four = top_trends(db_outlier, 'clean_text', 4, 4, 20)
outlier_four.to_csv('../data/outlier_four.csv')

Number of unique items: 15627



### Phrases for the top-priority group when identifying fake-news tweets (group 3)

In [33]:
# Top 20 single words for group 3 (user_group_db == 3); reloaded in the round-up section.
concern_one = top_trends(db_3, 'clean_text', 1, 1, 20)
concern_one.to_csv('../data/concern_one.csv')

Number of unique items: 18409



In [35]:
# Top 20 two-word phrases for group 3 (user_group_db == 3), written to CSV.
concern_two = top_trends(db_3, 'clean_text', 2, 2, 20)
concern_two.to_csv('../data/concern_two.csv')

Number of unique items: 68603



In [37]:
# Top 20 three-word phrases for group 3 (user_group_db == 3), written to CSV.
concern_three = top_trends(db_3, 'clean_text', 3, 3, 20)
concern_three.to_csv('../data/concern_three.csv')

Number of unique items: 68803



In [39]:
# Top 20 four-word phrases for group 3 (user_group_db == 3); reloaded in the round-up section.
concern_four = top_trends(db_3, 'clean_text', 4, 4, 20)
concern_four.to_csv('../data/concern_four.csv')

Number of unique items: 62680



In [7]:
# Per-cluster averages again, ordered by mean `target`, as a reference for
# the priority ordering used in the sections that follow.
# numeric_only=True is required because the frame also holds text columns;
# pandas 2.x raises TypeError on those instead of silently dropping them.
tweet.groupby(['user_group_db']).mean(numeric_only=True).sort_values(['target'])

Unnamed: 0_level_0,id,author_id,associated_tweet,reply_count,favorite_count,retweet_count,not_english,hashtag_count,mention_count,word_count,...,big_feelings,ratio,has_url,has_location,has_bio,len_bio,ratio_num_user,emotional_range,user_group,target
user_group_db,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.253942e+18,4.277929e+17,1.253449e+18,0.380268,2.052272,0.273605,0.260551,0.175653,0.14575,25.222745,...,0.2621,1.127193,0.0,0.0,0.0,3.0,0.139479,0.019135,2.0,2.706146
2,1.253957e+18,5.748085e+17,1.253366e+18,0.904011,5.859303,1.370809,0.278667,0.402367,0.202498,26.662722,...,0.267239,2.81375,1.0,1.0,1.0,48.174227,0.106509,0.018795,4.0,8.134122
4,1.253938e+18,5.092492e+17,1.253147e+18,0.587104,7.983032,2.361425,0.271795,0.326923,0.187783,26.228507,...,0.26214,1.995957,0.0,1.0,1.0,87.115385,0.107851,0.019189,4.0,10.931561
0,1.253943e+18,4.373013e+17,1.252977e+18,0.851322,16.968023,3.998599,0.268508,0.263674,0.147034,25.194177,...,0.263564,1.621993,0.0,0.0,1.0,86.773359,0.09209,0.019169,0.0,21.817944
3,1.253964e+18,2.590995e+17,1.253206e+18,1.709344,23.609235,5.484609,0.284257,0.422501,0.1772,25.865568,...,0.267071,11.895564,1.0,0.0,1.0,105.836012,0.033074,0.019445,1.0,30.803187
-1,1.253928e+18,2.153111e+17,1.253079e+18,6.722018,114.163952,27.337888,0.291443,0.291645,0.177614,25.036784,...,0.276124,723.734861,0.459275,0.304256,0.58907,71.158697,0.090221,0.01864,3.658434,148.223857


### 2nd priority concern (group 0)

In [13]:
# Top 5 single words for group 0 (user_group_db == 0); reloaded in the round-up section.
priority_2_one = top_trends(db_0, 'clean_text', 1, 1, 5)
priority_2_one.to_csv('../data/priority_2_one.csv')

Number of unique items: 24268



In [14]:
# Top 5 four-word phrases for group 0 (user_group_db == 0); reloaded in the round-up section.
priority_2_four = top_trends(db_0, 'clean_text', 4, 4, 5)
priority_2_four.to_csv('../data/priority_2_four.csv')

Number of unique items: 129028



### 3rd priority concern (group 4)

In [15]:
# Top 5 single words for group 4 (user_group_db == 4); reloaded in the round-up section.
priority_3_one = top_trends(db_4, 'clean_text', 1, 1, 5)
priority_3_one.to_csv('../data/priority_3_one.csv')

Number of unique items: 6455



In [16]:
# Top 5 four-word phrases for group 4 (user_group_db == 4); reloaded in the round-up section.
priority_3_four = top_trends(db_4, 'clean_text', 4, 4, 5)
priority_3_four.to_csv('../data/priority_3_four.csv')

Number of unique items: 14883



### 4th priority concern (group 2)

In [17]:
# Top 5 single words for group 2 (user_group_db == 2); reloaded in the round-up section.
priority_4_one = top_trends(db_2, 'clean_text', 1, 1, 5)
priority_4_one.to_csv('../data/priority_4_one.csv')

Number of unique items: 6511



In [18]:
# Top 5 four-word phrases for group 2 (user_group_db == 2); reloaded in the round-up section.
priority_4_four = top_trends(db_2, 'clean_text', 4, 4, 5)
priority_4_four.to_csv('../data/priority_4_four.csv')

Number of unique items: 13706



### 5th priority concern (group 1)

In [19]:
# Top 5 single words for group 1 (user_group_db == 1); reloaded in the round-up section.
priority_5_one = top_trends(db_1, 'clean_text', 1, 1, 5)
priority_5_one.to_csv('../data/priority_5_one.csv')

Number of unique items: 9799



In [20]:
# Top 5 four-word phrases for group 1 (user_group_db == 1); reloaded in the round-up section.
priority_5_four = top_trends(db_1, 'clean_text', 4, 4, 5)
priority_5_four.to_csv('../data/priority_5_four.csv')

Number of unique items: 33284



# Round em up!
Putting the phrases for each group back together into one DataFrame per phrase length.

In [12]:
# Reload the one-word and four-word tables written by the cells above.
# The n-gram index written by to_csv comes back as an ordinary column named
# 'Unnamed: 0', which the rename cells below relabel.
outlier_one = pd.read_csv('../data/outlier_one.csv')
outlier_four = pd.read_csv('../data/outlier_four.csv')
concern_one = pd.read_csv('../data/concern_one.csv')
concern_four = pd.read_csv('../data/concern_four.csv')
priority_2_one = pd.read_csv('../data/priority_2_one.csv')
priority_2_four = pd.read_csv('../data/priority_2_four.csv')
priority_3_one = pd.read_csv('../data/priority_3_one.csv')
priority_3_four = pd.read_csv('../data/priority_3_four.csv')
priority_4_one = pd.read_csv('../data/priority_4_one.csv')
priority_4_four = pd.read_csv('../data/priority_4_four.csv')
priority_5_one = pd.read_csv('../data/priority_5_one.csv')
priority_5_four = pd.read_csv('../data/priority_5_four.csv')

# Whole-dataset tables (one-word and four-word).
all_tweets_one = pd.read_csv('../data/tweet_one.csv')
all_tweets_four = pd.read_csv('../data/tweet_four.csv')

In [13]:
# The outlier and group-3 files were saved with 20 rows; keep the first five
# so they line up with the priority_* tables, which were saved with five.
outlier_one = outlier_one.head()
outlier_four = outlier_four.head()
concern_one = concern_one.head()
concern_four = concern_four.head()

In [14]:
# Relabel the n-gram column (read back from CSV as 'Unnamed: 0') with the
# name of the group each one-word table belongs to.
outlier_one = outlier_one.rename({'Unnamed: 0': 'Outlier'}, axis='columns')
concern_one = concern_one.rename({'Unnamed: 0': '1st Priority'}, axis='columns')
priority_2_one = priority_2_one.rename({'Unnamed: 0': '2nd Priority'}, axis='columns')
priority_3_one = priority_3_one.rename({'Unnamed: 0': '3rd Priority'}, axis='columns')
priority_4_one = priority_4_one.rename({'Unnamed: 0': '4th Priority'}, axis='columns')
priority_5_one = priority_5_one.rename({'Unnamed: 0': '5th Priority'}, axis='columns')


In [15]:
# Relabel the n-gram column (read back from CSV as 'Unnamed: 0') with the
# name of the group each four-word table belongs to.
outlier_four = outlier_four.rename({'Unnamed: 0': 'Outlier'}, axis='columns')
concern_four = concern_four.rename({'Unnamed: 0': '1st Priority'}, axis='columns')
priority_2_four = priority_2_four.rename({'Unnamed: 0': '2nd Priority'}, axis='columns')
priority_3_four = priority_3_four.rename({'Unnamed: 0': '3rd Priority'}, axis='columns')
priority_4_four = priority_4_four.rename({'Unnamed: 0': '4th Priority'}, axis='columns')
priority_5_four = priority_5_four.rename({'Unnamed: 0': '5th Priority'}, axis='columns')

In [16]:
# Display the trimmed, relabelled outlier table to check the rename.
outlier_one

Unnamed: 0,Outlier,Frequency
0,trump,0.207042
1,people,0.140305
2,just,0.071466
3,like,0.069364
4,virus,0.05938


In [17]:
# Side-by-side table of each group's top single words, one column per group.
# .assign builds a new frame, so columns are never written onto a selection of
# `outlier_one` (the pattern that triggers pandas' SettingWithCopyWarning).
# Columns are matched on the row index of each source table.
top_one_word = outlier_one[['Outlier']].assign(**{
    '1st Priority': concern_one['1st Priority'],
    '2nd Priority': priority_2_one['2nd Priority'],
    '3rd Priority': priority_3_one['3rd Priority'],
    '4th Priority': priority_4_one['4th Priority'],
    '5th Priority': priority_5_one['5th Priority'],
})

In [32]:
# Save the comparison table; index=False leaves the row index out of the file.
top_one_word.to_csv('../data/clusters_top_one_word.csv', index = False)

In [34]:
# Display the one-word comparison table.
top_one_word

Unnamed: 0,Outlier,1st Priority,2nd Priority,3rd Priority,4th Priority,5th Priority
0,trump,people,people,people,people,people
1,people,trump,trump,trump,trump,trump
2,just,just,just,just,just,just
3,like,like,like,like,like,said
4,virus,time,said,drink,virus,like


Above are the top words for each cluster.

In [19]:
# Side-by-side table of each group's top four-word phrases, one column per group.
# .assign builds a new frame, so columns are never written onto a selection of
# `outlier_four` (the pattern that triggers pandas' SettingWithCopyWarning).
# Columns are matched on the row index of each source table.
top_four_word = outlier_four[['Outlier']].assign(**{
    '1st Priority': concern_four['1st Priority'],
    '2nd Priority': priority_2_four['2nd Priority'],
    '3rd Priority': priority_3_four['3rd Priority'],
    '4th Priority': priority_4_four['4th Priority'],
    '5th Priority': priority_5_four['5th Priority'],
})

In [33]:
# Save the comparison table; index=False leaves the row index out of the file.
top_four_word.to_csv('../data/clusters_top_four_word.csv', index = False)

In [35]:
# Display the four-word comparison table.
top_four_word

Unnamed: 0,Outlier,1st Priority,2nd Priority,3rd Priority,4th Priority,5th Priority
0,stop panic end total,stop panic end total,like injection inside cleaning,way like injection inside,group peddling cure wrote,stop panic end total
1,data stop panic end,data stop panic end,way like injection inside,like injection inside cleaning,people actually exactly caution,way like injection inside
2,cure wrote trump week,cure wrote trump week,minute way like injection,minute way like injection,meant people actually exactly,data stop panic end
3,group peddling cure wrote,group peddling cure wrote,stop panic end total,minute minute way like,leader group peddling cure,like injection inside cleaning
4,leader group peddling cure,need personal mug check,data stop panic end,knocks minute minute way,hot cups hot coffee,minute way like injection


These are the top four-word phrases for each cluster.

In [21]:
# NOTE(review): this re-reads the same file already loaded into `all_tweets_one`
# at the start of this section; it only resets the variable to the full saved table.
all_tweets_one = pd.read_csv('../data/tweet_one.csv')

In [24]:
# Keep the first ten rows of the saved one-word table.
all_tweets_one = all_tweets_one.head(10)

In [27]:
# Keep the first ten rows of the saved four-word table.
all_tweets_four = all_tweets_four.head(10)

In [37]:
# Relabel the n-gram column, which read_csv named 'Unnamed: 0'.
all_tweets_one = all_tweets_one.rename(columns = { 'Unnamed: 0' : 'Top Words'})

In [38]:
# Relabel the n-gram column, which read_csv named 'Unnamed: 0'.
all_tweets_four = all_tweets_four.rename(columns = { 'Unnamed: 0' : 'Top Groups of Words'})

In [39]:
# Overall top words next to overall top four-word phrases.
# .assign builds a new frame, so the column is never written onto a selection
# of `all_tweets_one` (the pattern that triggers pandas' SettingWithCopyWarning).
# The two columns are matched on row index.
top_tweet_stats = all_tweets_one[['Top Words']].assign(
    **{'Top Groups of Words': all_tweets_four['Top Groups of Words']}
)

In [40]:
# Display the overall top words and four-word phrases.
top_tweet_stats

Unnamed: 0,Top Words,Top Groups of Words
0,people,stop panic end total
1,trump,data stop panic end
2,just,way like injection inside
3,like,like injection inside cleaning
4,said,minute way like injection
5,drink,knocks minute minute way
6,know,minute minute way like
7,virus,cure wrote trump week
8,president,peddling cure wrote trump
9,think,group peddling cure wrote


These are the top overall words and phrases for all tweets.