## Part 1

### Import Statements

In [1]:
import numpy as np
import pandas as pd
import csv
import nltk
import re
import os
import gensim
from gensim.models import Word2Vec
from nltk.data import find
import codecs
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [2]:
stopwords = nltk.corpus.stopwords.words('english')

### Stemmer for stemming words

In [3]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

### Data Consolidation

In [4]:
data = pd.read_csv('Improved Hillary Tweets.csv')
data1 = pd.read_csv('Improved Trump Tweets.csv')
data.append(data1)

Unnamed: 0.1,Unnamed: 0,clean_tweets,favorite,followers,friends,hash_tags,ids,inreplyto,listed,location,retweet,screen_name,state_abbs,text
0,0,No privatization of SS I've paid in I want min...,0,368,1629,[],,,5,"Davenport, IA",0,curish,IA,"No privatization of SS; I've paid in, I want m..."
1,1,@timkaine is right Trump would increase debt m...,0,7033,3906,"['#debates', '#debates']",,,343,"Washington, DC",2,BudgetHawks,DC,.@timkaine is right. Trump would increase debt...
2,2,Hillary Clinton's uninformed know that Hillary...,0,505,529,['#BasementDwellers'],,,115,"Barrington, RI",260,Debbierg51,RI,"RT @Ian56789: Hillary Clinton's ""uninformed"" #..."
3,3,Hillary Clinton ’ s economic plan would send o...,0,43,196,['#BigLeagueTruth'],,,0,,427,sandrapelton,,RT @mike_pence: Hillary Clinton’s economic pla...
4,4,Do you want a You're Hired president in Hillar...,0,25,263,[],,,0,"Utah, United States",13,seb_mar,UT,"RT @Fusion: ""Do you want a You're Hired presid..."
5,5,@HillaryClinton s proud of Clinton Foundation ...,0,723,1887,['#Trump'],,,63,"Tustin, CA",4,MattM3502,CA,RT @XavierBecerra: .@HillaryClinton's proud of...
6,6,CLINTONS STOLE 94 OF THE DONATIONS TO THAT COU...,0,3181,2961,[],,,142,,1,KayWalker10,,RT @LAYNALANGUAGE: @KayWalker10 @cfoundationsr...
7,7,Pence claimed Clinton's plans would raise the ...,0,414,793,[],,,5,Michigan,5,Cycle4,MI,RT @KevinMKruse: Pence claimed Clinton's plans...
8,8,If past voting records are fair game how about...,0,98,104,['#VPDebate'],,,2,Texas,0,RobbieNava,TX,"If past voting records are fair game, how abou..."
9,9,"there are 359,000 more black voters in battleg...",0,213,34,[],,,51,,282,kevtalmadge,,"RT @JoyAnnReid: ""...there are 359,000 more bla..."


### Removing Invalid States and Creating A Dictionary with key = state name and value = all tweets for the state

In [5]:
states=set(list(data.state_abbs))
states=list(states)
states=[str(state) for state in states]
states.remove('nan')

In [6]:
dictdata={}
for state in states:
    statedata=data.loc[data['state_abbs'] == state]
    listoftweets=list(statedata['clean_tweets'])
    dictdata[state] = ' '.join(listoftweets)

In [7]:
dictdata['AK']

"Bill and Hillary Clinton Foundation Received 81 Million In Donations Through Swiss Bank Under Investigation Hillary Clinton ’ s economic plan would send our economy into a tailspin When Hillary Clinton became Secretary of State Chris Stevens Ty Woods Sean Smith Glen Doherty were still alive … One thing's clear and Hillary have hardworking Americans backs The VP's office is in good hands – Joe Can't wait for Hillary supporters who have a problem with Trump's demeanor attempt to defend Tim Kaine's obnoxious … Its as hard to watch Kaine as it is to watch hillary Its as hard to watch Kaine as it is to watch hillary FLASHBACK Hillary Clinton's Failed Foreign Policy ➡ ️ SYRIA FLASHBACK Hillary Clinton's Failed Foreign Policy ➡ ️ SYRIA FLASHBACK Hillary Clinton's Failed Foreign Policy ➡ ️ SYRIA FLASHBACK Hillary Clinton's Failed Foreign Policy ➡ ️ SYRIA"

### Keeping Only Nouns Within Tweets For The Purpose of Clustering

In [8]:
for key in list(dictdata.keys()):
    noundata=dictdata[key]
    tokens=nltk.word_tokenize(noundata)
    noundata=[word[0] for word in nltk.pos_tag(tokens) if word[1] == 'NN']
    noundata=' '.join(noundata)
    dictdata[key]=noundata

In [9]:
statename=[]
tweetdata=[]
for key in dictdata.keys():
    statename.append(key)
    tweetdata.append(dictdata[key])

Note: The code beyond this point is a modified implementation of K-means clustering as made available to general public by Brandon Rose. The original implementation may be found at brandonrose.org/clustering

### Tokenization

In [10]:
def tokenize(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        pattern = r'''(?x)  # allow verbose regexps
        (?:[A-Z]\.)+        # abbreviations
        |\w+(?:[-']\w+)*    # words with optional internal hyphens
        |\$?\d+(?:\.\d+)?   # currency
        |\.\.\.             # elipses
        |[.,;"'?()-_`]      # these are separate tokens
        '''
        if re.search(pattern, token):
            filtered_tokens.append(token)
            
    return filtered_tokens

### Splitting data into paragraphs, to be used as entities for k-means clustering (after vectorization using TF-IDF).

In [11]:
totalvocab_tokenized = []
for i in tweetdata:
    allwords_tokenized = tokenize(i)
    totalvocab_tokenized.extend(allwords_tokenized)

In [12]:
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_tokenized)

### TF-IDF

In [13]:
tfidf_vectorizer = TfidfVectorizer(min_df=3, stop_words='english',tokenizer=tokenize, ngram_range=(1,3))
tfidf_matrix = tfidf_vectorizer.fit_transform(tweetdata)

In [14]:
terms = tfidf_vectorizer.get_feature_names()

### We perform K-Means clustering with number of clusters as 5.

In [15]:
cluster = KMeans(n_clusters=5)
cluster.fit(tfidf_matrix)
clusters = cluster.labels_.tolist()

In [16]:
paragraphs = { 'state': statename, 'tweet': tweetdata, 'cluster': clusters}

frame = pd.DataFrame(paragraphs, index = [clusters] , columns = ['state', 'cluster'])

In [17]:
frame['cluster'].value_counts()

2    25
1     8
4     7
3     6
0     6
Name: cluster, dtype: int64

#### Output for number of clusters (k) = 5

In [18]:
from __future__ import print_function

print("Top terms per cluster:")
print()
order_centroids = cluster.cluster_centers_.argsort()[:, ::-1] 
clusterstate={}
for i in range(5):
    print("Cluster %d words:" % i, end='')
    
    for ind in order_centroids[i, :50]:
        print(' %s' % vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0], end=',')
    print() #add whitespace
    print() #add whitespace
    
    print("Cluster %d states:" % i, end='')
    clusterstate[i] = frame.ix[i]['state'].values.tolist()
    for title in frame.ix[i]['state'].values.tolist():
        print(' %s' % title, end='')
    print() #add whitespace
    print() #add whitespace
    
print()
print()

Top terms per cluster:

Cluster 0 words: thing, thing, world, thing, world, world, video, candidate, video, camp, pander, poll, drone, mike_pence, lack, thing, camp, thing, camp, lack, mouth, yeah, likability, puppet, hurricane, strike, smugness, puppet, election, email, email, server, ad, campaign, agreement, time, lead, trump, economy, mess, server, email, plan, economy, plan, tailspin, plan, dealer, policy, extension,

Cluster 0 states: DE WA WY CO NV NM

Cluster 1 words: interview, hacker, make, hacker, hacker, make, difference, memo, memo, make, difference, difference, world, intersection, poll, president, plan, campaign, thing, head, spokesman, campaign, poll, economy, poll, h, thing, guy, indictment, night, economy, tailspin, plan, party, debate, moderator, moderator, plan, leader, interrupting, interrupting, night, college, interrupting, loving, moderator, commish, commish, commish, mercy,

Cluster 1 states: IL OH AZ MS DC NE NH SC

Cluster 2 words: campaign, indictment, plan, 

In [19]:
clusterstate

{0: ['DE', 'WA', 'WY', 'CO', 'NV', 'NM'],
 1: ['IL', 'OH', 'AZ', 'MS', 'DC', 'NE', 'NH', 'SC'],
 2: ['MO',
  'ND',
  'CA',
  'NJ',
  'IN',
  'CT',
  'RI',
  'OR',
  'HI',
  'FL',
  'AL',
  'KS',
  'ID',
  'NY',
  'NC',
  'TX',
  'PR',
  'IA',
  'AK',
  'ME',
  'KY',
  'LA',
  'VT',
  'PA',
  'UT'],
 3: ['SD', 'MD', 'AR', 'MN', 'MT', 'MA'],
 4: ['TN', 'GA', 'WI', 'MI', 'OK', 'VA', 'WV']}

### Representing Data in Tabular Format for Visualization using Tableau

In [20]:
clusterdata = {}
for keys in clusterstate:
    for items in clusterstate[keys]:
        clusterdata[items] = keys

In [21]:
clusterresults = pd.DataFrame([key,clusterdata[key]] for key in clusterdata)

In [22]:
clusterresults

Unnamed: 0,0,1
0,SC,1
1,TN,4
2,GA,4
3,MO,2
4,SD,3
5,ND,2
6,CA,2
7,NJ,2
8,CT,2
9,IN,2


In [23]:
clusterresults.to_csv('clusters.csv')