In [1]:
import pandas as pd
import numpy as np
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import sklearn.cluster as sk_cluster
import sklearn.feature_extraction.text as sk_text
import sklearn.metrics as metrics
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import dendrogram, linkage      

%matplotlib inline
pd.set_option('display.max_colwidth', -1)

In [2]:
cleaned_data = pd.read_csv('cleaned_data.csv')
clinton_trump_texts = pd.read_table('clinton_trump_user_classes.txt', names = ['UserID', 'Class'], sep='\t')

In [3]:
clinton_trump_texts_agg = cleaned_data.groupby('UserID')['Hash'].apply(" ".join)
clinton_trump_sklearn = pd.DataFrame({'User_id':clinton_trump_texts_agg.index,'All_hashtag':clinton_trump_texts_agg.values})

In [4]:
vectorizer = sk_text.TfidfVectorizer(stop_words='english',max_features=10, min_df=1, max_df=300)
matrix = vectorizer.fit_transform(clinton_trump_sklearn.All_hashtag.values)
tfidf = matrix.toarray()

In [5]:
df = pd.DataFrame(matrix.todense(),index = clinton_trump_texts_agg.index, columns = vectorizer.get_feature_names())

In [6]:
merged_df = pd.merge(df,clinton_trump_texts, on='UserID')

## K-means

In [7]:
%%time
kmeans = sk_cluster.KMeans(n_clusters=2, n_init=10, max_iter=300)
kmeans_label = kmeans.fit_predict(tfidf)

CPU times: user 110 ms, sys: 11.5 ms, total: 122 ms
Wall time: 72.6 ms


In [8]:
print ('\nPrinting cluster assignment:')
print(kmeans_label)


Printing cluster assignment:
[0 0 0 ... 0 0 0]


In [9]:
print('\n')
error = kmeans.inertia_       #SSE; Sum of squared distances of samples to their closest cluster center.
print ("The total error of the clustering is: ", error)



The total error of the clustering is:  316.4374802419599


In [10]:
truth = np.array(merged_df.Class.values)
truth

array([1, 0, 0, ..., 1, 1, 1])

In [11]:
confusion = metrics.confusion_matrix(truth, kmeans.labels_, labels=[0, 1])   # first row: 0 ; second row: 1
print(confusion)

[[4516   88]
 [4216  101]]


In [12]:
metrics.precision_score(truth, kmeans.labels_, average='weighted') # weighted: the average precision of all clusters is returned


0.5255079771788584

In [13]:
metrics.recall_score(truth, kmeans.labels_, average='weighted')  # weighted: the average recall of all clusters is returned


0.5175428763591525

In [14]:
metrics.f1_score(truth, kmeans.labels_, average= 'weighted')     # weighted: the average f1 of all clusters is returned


0.37121996096205084

## Agglomerative Clustering

### MAX based

In [24]:
%%time
ag1 = sk_cluster.AgglomerativeClustering(linkage = 'complete', n_clusters = 2) 

CPU times: user 26 µs, sys: 58 µs, total: 84 µs
Wall time: 90.1 µs


In [25]:
ag_labels = ag1.fit_predict(tfidf)

In [26]:
print ('\nPrinting cluseter assignment:')
ag_labels


Printing cluseter assignment:


array([0, 0, 0, ..., 0, 0, 0])

In [27]:
confusion = metrics.confusion_matrix(truth, ag1.labels_, labels=[0, 1])   # first row: 0 ; second row: 1
print(confusion)

[[4517   87]
 [4219   98]]


In [29]:
metrics.precision_score(truth, ag1.labels_, average='weighted') # weighted: the average precision of all clusters is returned


0.5231889279669798

In [30]:
metrics.recall_score(truth, ag1.labels_, average='weighted')  # weighted: the average recall of all clusters is returned


0.5173186862459366

In [31]:
metrics.f1_score(truth, ag1.labels_, average= 'weighted')     # weighted: the average f1 of all clusters is returned


0.37056687156553664

### SSE based

In [36]:
ag2 = sk_cluster.AgglomerativeClustering(linkage = 'ward', n_clusters = 2)   

In [37]:
ag_labels2 = ag2.fit_predict(merged_df)

In [38]:
print ('\nPrinting cluseter assignment:')
ag_labels2 


Printing cluseter assignment:


array([1, 1, 1, ..., 0, 0, 0])

In [40]:
confusion = metrics.confusion_matrix(truth, ag2.labels_, labels=[0, 1])   # first row: 0 ; second row: 1
print(confusion)

[[2220 2384]
 [1291 3026]]


In [42]:
metrics.precision_score(truth, ag2.labels_, average='weighted') # weighted: the average precision of all clusters is returned


0.5969901995123185

In [43]:
metrics.recall_score(truth, ag2.labels_, average='weighted')  # weighted: the average recall of all clusters is returned


0.5880506669655868

In [44]:
metrics.f1_score(truth, ag2.labels_, average= 'weighted')     # weighted: the average f1 of all clusters is returned


0.583453062904208

## Task 1.2

### For k-means, look at the two centers (centroids) and print the top-30 hashtags/handles with the highest tfidf values.