# Introduction to Artificial Intelligence Final Project
## By Noah Segal-Gould and Tanner Cohan

### To implement:
[This article](http://brandonrose.org/clustering)

[Modified version of above](https://github.com/toyota790/Twitter_PanamaPapers_Analysis)

#### Import libraries

In [1]:
import pandas as pd
from collections import Counter
from glob import glob
from os.path import basename, splitext
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.externals import joblib

#### Create lists of file names for all Twitter account CSVs

In [2]:
house_accounts_filenames = glob("house/*.csv")

In [3]:
senate_accounts_filenames = glob("senate/*.csv")

#### Create lists of all dataframes for all CSVs

In [4]:
house_accounts_dataframes = [pd.read_csv(filename).assign(account="@" + splitext(basename(filename))[0]) 
                             for filename in house_accounts_filenames]

In [5]:
senate_accounts_dataframes = [pd.read_csv(filename).assign(account="@" + splitext(basename(filename))[0])
                              for filename in senate_accounts_filenames]

#### Find which 10 Tweets were most Retweeted and Favorited in each list of dataframes

In [6]:
most_retweets_house_accounts_dataframes = [df.sort_values('Retweets').tail(25)
                                           for df in house_accounts_dataframes]
#[df.iloc[[df['Retweets'].idxmax()]] 

In [7]:
most_favorites_house_accounts_dataframes = [df.sort_values('Favorites').tail(25)
                                            for df in house_accounts_dataframes]
#[df.iloc[[df['Favorites'].idxmax()]]

In [8]:
most_retweets_senate_accounts_dataframes = [df.sort_values('Retweets').tail(25)
                                            for df in senate_accounts_dataframes]
#[df.iloc[[df['Retweets'].idxmax()]]

In [9]:
most_favorites_senate_accounts_dataframes = [df.sort_values('Favorites').tail(25)
                                             for df in senate_accounts_dataframes]
#[df.iloc[[df['Favorites'].idxmax()]]

#### Create dataframes of the most Retweeted and Favorited Tweets for each account

In [10]:
most_retweets_congress_dataframe = pd.concat(most_retweets_house_accounts_dataframes + most_retweets_senate_accounts_dataframes).reset_index(drop=True)

In [11]:
most_favorites_congress_dataframe = pd.concat(most_favorites_house_accounts_dataframes + most_favorites_senate_accounts_dataframes).reset_index(drop=True)

#### Show the Retweets dataframe

In [12]:
most_retweets_congress_dataframe.sort_values('Retweets').tail()

Unnamed: 0,Text,Date,Favorites,Retweets,Tweet ID,account
11726,"Hey Republicans, don't worry, that burn is cov...",2017-03-24 19:53:43,310324,143726,845363015222542336,@SenatorMenendez
11276,It's a shame the White House has become an adu...,2017-10-08 15:13:43,419380,148639,917045348820049920,@SenBobCorker
10624,".@realDonaldTrump, you are embarrassing our co...",2017-08-15 22:06:45,562982,199780,897580346379829250,@SenSanders
10625,.@realDonaldTrump They did. It wasn't.pic.twit...,2017-02-25 13:56:12,518361,206269,835488569850494976,@SenSanders
10626,"President Trump, you made a big mistake. By tr...",2017-01-21 22:15:24,972101,452896,822930622926745602,@SenSanders


#### Show the Favorites dataframe

In [13]:
most_favorites_congress_dataframe.sort_values('Favorites').tail()

Unnamed: 0,Text,Date,Favorites,Retweets,Tweet ID,account
11726,"Hey Republicans, don't worry, that burn is cov...",2017-03-24 19:53:43,310324,143726,845363015222542336,@SenatorMenendez
11276,It's a shame the White House has become an adu...,2017-10-08 15:13:43,419380,148639,917045348820049920,@SenBobCorker
10624,.@realDonaldTrump They did. It wasn't.pic.twit...,2017-02-25 13:56:12,518361,206269,835488569850494976,@SenSanders
10625,".@realDonaldTrump, you are embarrassing our co...",2017-08-15 22:06:45,562982,199780,897580346379829250,@SenSanders
10626,"President Trump, you made a big mistake. By tr...",2017-01-21 22:15:24,972101,452896,822930622926745602,@SenSanders


#### Combine all House of Representatives' accounts, all Senators' accounts, and then combine them together into all Congress accounts

In [14]:
house_dataframe = pd.concat(house_accounts_dataframes)

In [15]:
senate_dataframe = pd.concat(senate_accounts_dataframes)

In [16]:
congress_dataframe = pd.concat([house_dataframe, senate_dataframe]).reset_index(drop=True)

#### Remove columns with missing values

In [17]:
congress_dataframe.dropna(inplace=True)

#### Print some statistics

In [18]:
print("Total number of Tweets for all accounts: " + str(len(congress_dataframe)))
print("Total number of accounts: " + str(len(set(congress_dataframe["account"]))))
print("Total number of house members: " + str(len(set(house_dataframe["account"]))))
print("Total number of senators: " + str(len(set(senate_dataframe["account"]))))

Total number of Tweets for all accounts: 1614705
Total number of accounts: 524
Total number of house members: 424
Total number of senators: 100


#### Get NLTK English stopwords

In [19]:
stopwords = stopwords.words('english')

#### Instantiate SnowballStemmer as stemmer

In [20]:
stemmer = SnowballStemmer("english")

#### Load NLTK's Tweet Tokenizer

In [21]:
tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)

#### Define a function to remove hashtags, mentions, and URLs

In [22]:
def clean_word(word):
    return all([("#" not in word), 
                ("@" not in word), 
                ("." not in word), 
                (word.isalpha()), 
                (word not in stopwords)])

#### Define a function to return the list of stemmed words and the list of tokens which have been stripped of non-alphabetical characters and stopwords

In [23]:
def tokenize_and_stem(text):
    tokens = tokenizer.tokenize(text)
    filtered_tokens = [word for word in tokens if clean_word(word)]
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

def tokenize_only(text):
    tokens = tokenizer.tokenize(text)
    filtered_tokens = [word.lower() for word in tokens if clean_word(word)]
    return filtered_tokens

#### Define a function for getting lists of stemmed and tokenized Tweets

In [24]:
def get_stemmed_and_tokenized_dict(tweets):
    stemmed = []
    tokenized = []
    for tweet in tweets:
        stemmed.extend(tokenize_and_stem(tweet))
        tokenized.extend(tokenize_only(tweet))
    return {"Stemmed": stemmed, "Tokenized": tokenized}

#### Apply function to Tweets

In [25]:
%time stemmed_and_tokenized_dict = get_stemmed_and_tokenized_dict(most_favorites_congress_dataframe["Text"])

CPU times: user 8.19 s, sys: 94.3 ms, total: 8.28 s
Wall time: 8.41 s


#### Create a dataframe of stemmed and tokenized words

In [26]:
vocab_frame = pd.DataFrame({'words': stemmed_and_tokenized_dict["Tokenized"]}, 
                           index = stemmed_and_tokenized_dict["Stemmed"])

In [27]:
print("There are " + str(vocab_frame.shape[0]) + " items in vocab_frame")

There are 133507 items in vocab_frame


In [28]:
vocab_frame.head()

Unnamed: 0,words
congrat,congrats
trump,trump
make,making
deal,deal
get,gets


#### Set up TF-IDF vectorizer from Scikit Learn and also apply the vectorizer to the Tweets

In [29]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.5, sublinear_tf=True,
                                   min_df=0.001, stop_words='english',
                                   use_idf=True, tokenizer=tokenize_and_stem)

In [30]:
%time tfidf_matrix = tfidf_vectorizer.fit_transform(most_favorites_congress_dataframe["Text"])

CPU times: user 5.8 s, sys: 64.5 ms, total: 5.86 s
Wall time: 5.92 s


In [31]:
print(tfidf_matrix.shape)

(13077, 1398)


In [32]:
terms = tfidf_vectorizer.get_feature_names()

In [33]:
dist = 1 - cosine_similarity(tfidf_matrix)

#### Set up K-Means clustering

In [34]:
num_clusters = 5

In [35]:
km = KMeans(n_clusters=num_clusters)

In [36]:
%time km.fit(tfidf_matrix)

CPU times: user 15.3 s, sys: 195 ms, total: 15.5 s
Wall time: 16.1 s


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=5, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [37]:
clusters = km.labels_.tolist()

#### Alternatively use K-Means++

In [38]:
#num_clusters = 5

In [39]:
#km = KMeans(n_clusters=num_clusters, init='k-means++', max_iter=100, n_init=1, verbose=1)

In [40]:
#%time km.fit(tfidf_matrix)

In [41]:
#clusters = km.labels_.tolist()

#### Save/load the model

In [42]:
joblib.dump(km, 'doc_cluster.pkl')

['doc_cluster.pkl']

In [43]:
#km = joblib.load('doc_cluster.pkl')
#clusters = km.labels_.tolist()

#### Create new dataframe for easy access of accounts which apply to clusters

In [44]:
tweets = {'Account': most_favorites_congress_dataframe["account"], 
          'Text': most_favorites_congress_dataframe["Text"], 
          'cluster': clusters}

In [45]:
frame = pd.DataFrame(tweets, columns = ['Account', 'Text', 'cluster'])

#### Number of Tweets per cluster (clusters from 0 to 4)

In [46]:
frame['cluster'].value_counts()

4    8693
3    1306
1    1303
0    1086
2     689
Name: cluster, dtype: int64

#### Show the top 10 keywords and the accounts for each cluster

In [47]:
top_n_words = 10

In [48]:
top_n_accounts = 5

In [49]:
print("Top terms and accounts per cluster:")
print()
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 
for i in range(num_clusters):
    print("Cluster %s words: " % str(i+1) + ", ".join([vocab_frame.loc[terms[ind].split(' ')].values.tolist()[0][0] 
                                                      for ind in order_centroids[i, :top_n_words]]))
    print("Cluster " + str(i+1) + " top " + str(top_n_accounts) + " accounts: ", end='')
    print(", ".join([account for account, value in Counter(frame[frame["cluster"] == i]["Account"]).most_common(top_n_accounts)]))
    print()

Top terms and accounts per cluster:

Cluster 1 words: american, people, deserve, tax, health, millions, care, act, voted, today
Cluster 1 top 5 accounts: @RepMcGovern, @RepWalorski, @MikeKellyPA, @RepBarragan, @Jim_Jordan

Cluster 2 words: voted, house, floor, passed, today, gun, white, senator, act, colleagues
Cluster 2 top 5 accounts: @RepJohnDelaney, @RepTomGraves, @RepEdRoyce, @RepJBridenstine, @RepJohnLarson

Cluster 3 words: thank, today, served, work, service, honoring, great, happy, day, office
Cluster 3 top 5 accounts: @MaxineWaters, @SanfordBishop, @RepJenniffer, @RepChuck, @RepDennisRoss

Cluster 4 words: president, trump, great, mr, today, meet, news, statement, making, happy
Cluster 4 top 5 accounts: @RepAlGreen, @RepAdamSchiff, @SenSanders, @RepDonaldPayne, @RepKarenBass

Cluster 5 words: today, need, proud, work, stand, support, family, joining, w, tax
Cluster 5 top 5 accounts: @RepRWilliams, @RepAndyBiggsAZ, @RepBrianFitz, @RepRaskin, @SenAlexander

