In [None]:
import pandas as pd
import numpy as np
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import sklearn.cluster as sk_cluster
import sklearn.feature_extraction.text as sk_text
import sklearn.metrics as metrics
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import dendrogram, linkage      

%matplotlib inline
# Show full cell contents in DataFrame output. None means "no limit"; the older
# sentinel -1 is deprecated since pandas 1.0 and rejected by later releases.
pd.set_option('display.max_colwidth', None)

In [None]:
# Cleaned data set; later cells read its 'UserID' and 'Hash' columns.
cleaned_data = pd.read_csv('cleaned_data.csv')

# Ground-truth class per user. The file is tab-separated and the column names
# are supplied here, so its first line is read as data rather than as a header.
clinton_trump_texts = pd.read_csv(
    'clinton_trump_user_classes.txt',
    sep='\t',
    names=['UserID', 'Class'],
)

In [None]:
# Collapse the data to one "document" per user: every 'Hash' string belonging
# to that user, joined with single spaces.
hashtags_by_user = cleaned_data.groupby('UserID')['Hash']
clinton_trump_texts_agg = hashtags_by_user.apply(" ".join)

# Same content as a two-column frame (user id, concatenated hashtags).
clinton_trump_sklearn = pd.DataFrame({
    'User_id': clinton_trump_texts_agg.index,
    'All_hashtag': clinton_trump_texts_agg.values,
})

In [None]:
# TF-IDF over the per-user hashtag documents.
# NOTE(review): max_features=4 keeps only four terms, yet the task text further
# down asks for the top-30 terms per centroid and for k up to 20 when clustering
# terms. This looks like a debugging leftover; confirm the intended value.
tfidf_options = dict(stop_words='english', max_features=4, min_df=1, max_df=300)
vectorizer = sk_text.TfidfVectorizer(**tfidf_options)

# Sparse (documents x terms) matrix, plus a dense copy used by the clustering cells.
matrix = vectorizer.fit_transform(clinton_trump_sklearn['All_hashtag'].values)
tfidf = matrix.toarray()

In [None]:
# TF-IDF weights as a DataFrame: one row per user (index taken from the
# per-user aggregation), one column per vocabulary term.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# it exists and fall back to the old name on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

df = pd.DataFrame(matrix.toarray(), index=clinton_trump_texts_agg.index, columns=feature_names)

In [None]:
# Attach the ground-truth 'Class' to each user's TF-IDF row.
# This is an inner join (the default), so a user missing from either side is dropped.
# NOTE(review): if any user is dropped, merged_df has fewer rows than tfidf and the
# label arrays compared later will differ in length - verify the row counts match.
merged_df = df.merge(clinton_trump_texts, on='UserID')

## Task 1.1

### Let’s apply clustering and compare the clustering result against a known ground truth. In the file “clinton_trump_user_classes.txt”, we have the ground truth “class” membership for each user id in the data. Class 0 corresponds to Trump followers, while class 1 corresponds to Clinton followers.

### Run the k-means algorithm (K=2) and the two different variations of the agglomerative clustering algorithm (MAX-based and SSE-based).

### Compute the confusion matrix, precision, recall, and F-measure for (1) k-means, (2) MAX-based agglomerative clustering, and (3) SSE-based agglomerative clustering. Compare their performance and include your conclusions in your report. See sample code in lab 4.

## K-means

In [None]:
%%time
kmeans = sk_cluster.KMeans(n_clusters=2, n_init=10, max_iter=300)
kmeans_label = kmeans.fit_predict(tfidf)

In [None]:
# Show the cluster id assigned to every user.
print('\nPrinting cluster assignment:', kmeans_label, sep='\n')

In [None]:
# inertia_ is the SSE: sum of squared distances of samples to their closest cluster center.
error = kmeans.inertia_

print('\n')
print("The total error of the clustering is: ", error)

In [None]:
# Ground-truth class of every user, in merged_df row order.
truth = np.array(merged_df['Class'].values)
truth

In [None]:
# Cluster ids are arbitrary: k-means may number the Trump followers' cluster 1
# and the Clinton followers' cluster 0. Of the two possible id -> class mappings,
# use the one that agrees better with the ground truth; otherwise the scores
# below can describe the inverted assignment.
kmeans_pred = kmeans.labels_
if np.mean(kmeans_pred == truth) < 0.5:
    kmeans_pred = 1 - kmeans_pred

# Rows are the true classes 0 and 1, columns the (aligned) predicted classes 0 and 1.
confusion = metrics.confusion_matrix(truth, kmeans_pred, labels=[0, 1])
print(confusion)

# Precision, recall and F-measure, each averaged over classes weighted by class support.
for score in (metrics.precision_score, metrics.recall_score, metrics.f1_score):
    print(score(truth, kmeans_pred, average='weighted'))

## Agglomerative Clustering

### MAX based

In [None]:
%%time
ag1 = sk_cluster.AgglomerativeClustering(linkage = 'complete', n_clusters = 2) 

In [None]:
# Fit on the per-user TF-IDF vectors and keep the resulting cluster id per user.
ag_labels = ag1.fit(tfidf).labels_

In [None]:
# Heading via print, then the label array as the cell's rich output.
print("\nPrinting cluseter assignment:")
ag_labels

In [None]:
# Cluster ids from agglomerative clustering are arbitrary, so first pick the
# id -> class mapping that agrees better with the ground truth; otherwise the
# scores below can describe the inverted assignment.
ag1_pred = ag1.labels_
if np.mean(ag1_pred == truth) < 0.5:
    ag1_pred = 1 - ag1_pred

# Rows are the true classes 0 and 1, columns the (aligned) predicted classes 0 and 1.
confusion = metrics.confusion_matrix(truth, ag1_pred, labels=[0, 1])
print(confusion)

# Precision, recall and F-measure, each averaged over classes weighted by class support.
for score in (metrics.precision_score, metrics.recall_score, metrics.f1_score):
    print(score(truth, ag1_pred, average='weighted'))

### SSE based

In [None]:
# SSE-based agglomerative clustering = Ward linkage, cut at two clusters.
ag2 = sk_cluster.AgglomerativeClustering(n_clusters=2, linkage='ward')

In [None]:
# Cluster the same per-user TF-IDF vectors used for k-means and complete linkage.
# merged_df must not be used here: it also holds the 'UserID' and ground-truth
# 'Class' columns, which would leak the answer into the features and let the
# id values take part in the distances.
ag_labels2 = ag2.fit_predict(tfidf)

In [None]:
# Heading via print, then the Ward-linkage label array as the cell's rich output.
print("\nPrinting cluseter assignment:")
ag_labels2

In [None]:
# Cluster ids from agglomerative clustering are arbitrary, so first pick the
# id -> class mapping that agrees better with the ground truth; otherwise the
# scores below can describe the inverted assignment.
ag2_pred = ag2.labels_
if np.mean(ag2_pred == truth) < 0.5:
    ag2_pred = 1 - ag2_pred

# Rows are the true classes 0 and 1, columns the (aligned) predicted classes 0 and 1.
confusion = metrics.confusion_matrix(truth, ag2_pred, labels=[0, 1])
print(confusion)

# Precision, recall and F-measure, each averaged over classes weighted by class support.
for score in (metrics.precision_score, metrics.recall_score, metrics.f1_score):
    print(score(truth, ag2_pred, average='weighted'))

## Task 1.2

### For k-means, look at the two centers (centroids) and print the top-30 hashtags/handles with the highest tfidf values.

In [None]:
# One row per cluster, one column per vocabulary term.
centroids = kmeans.cluster_centers_
print('Cluster Centroids', centroids, sep='\n')

In [None]:
# Per centroid: term indices ordered by ascending weight (argsort along the last axis) ...
asc_order_centroids = np.argsort(kmeans.cluster_centers_)
# ... and the same indices reversed, so the heaviest terms come first.
des_order_centroids = np.fliplr(asc_order_centroids)

In [None]:
# Vocabulary in column order. get_feature_names() was removed in scikit-learn 1.2;
# use its replacement when present and convert to a plain list of str so the
# printed output looks the same as before.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = vectorizer.get_feature_names_out().tolist()
else:
    terms = vectorizer.get_feature_names()

print("All the terms:")
print(terms, '\n')

# For every centroid, list up to 30 terms with the highest TF-IDF weight.
# Iterating over the rows works for any number of clusters, not just two.
for i, ranked_terms in enumerate(des_order_centroids):
    print("Cluster:", i)
    for ind in ranked_terms[:30]:
        print(terms[ind])
    print()

In [None]:
# Inspect the ascending-order term indices computed for each centroid.
asc_order_centroids

## Task 1.3

### Show the two respective word clouds of the two centers (centroids) by using hashtags/handles and their tfidf values. Hint: Use function fit_words() that comes with wordcloud

# 2. Clustering of Hashtags/handles

## Task 2.1

### First, you apply the k-means algorithm. Create a plot of the SSE error of the k-means algorithm as a function of the number of clusters, for k up to 20, in order to determine the optimal number of clusters.

In [None]:
# Cluster the hashtags/handles: after transposing, each row is one term
# described by its TF-IDF weight for every user.
hashtag_vectors = tfidf.transpose()

# The task asks for k up to 20, but k-means cannot use more clusters than there
# are samples, so cap the range at the number of terms actually available.
max_k = min(20, hashtag_vectors.shape[0])
k_values = range(1, max_k + 1)

# error[k] holds the SSE for k clusters; index 0 is unused.
error = np.zeros(max_k + 1)
for k in k_values:
    # Fixed seed so the curve is identical on every run.
    kmeans = sk_cluster.KMeans(n_clusters=k, n_init=10, max_iter=500, random_state=42)
    kmeans.fit(hashtag_vectors)
    error[k] = kmeans.inertia_

plt.plot(k_values, error[1:])
plt.xlabel('Number of clusters')
plt.ylabel('SSE')

## Task 2.2

### Run the k-means algorithm for the optimal number of clusters you identified in the last task. Print some hashtags/handles in each cluster. From the hashtags/handles in each cluster, try to deduce what is the topic it concerns. Include your conclusions in your report.

In [None]:
# K-means with six clusters; random_state is fixed so the assignment is the
# same on every run.
# NOTE(review): the task text above asks for clusters of hashtags/handles, and the
# SSE cell clusters tfidf.transpose(), but this cell clusters tfidf (one row per
# user). It is left unchanged because the vectorizer is built with max_features=4,
# so the transposed matrix would have fewer rows than the six clusters requested,
# and the evaluation cell below compares these labels with per-user ground truth.
# Confirm the intended input.
kmeans = sk_cluster.KMeans(n_clusters=6, n_init=10, max_iter=300, random_state=42)
kmeans_label = kmeans.fit_predict(tfidf)

In [None]:
# Show the cluster id assigned to every clustered row.
print('\nPrinting cluster assignment:', kmeans_label, sep='\n')

In [None]:
# inertia_ is the SSE: sum of squared distances of samples to their closest cluster center.
error = kmeans.inertia_

print('\n')
print("The total error of the clustering is: ", error)

In [None]:
# Ground-truth class of every user, in merged_df row order.
truth = np.array(merged_df['Class'].values)
truth

In [None]:
# Score the k-means result of this section. Recall and F1 previously read
# ag2.labels_ (the Ward clustering from Task 1.1), so the three scores described
# two different models; all of them now use kmeans.labels_.
# NOTE(review): this k-means run uses six clusters while the ground truth has two
# classes, so the confusion matrix restricted to labels 0 and 1 ignores cluster
# ids 2-5 - confirm that this comparison is intended.
confusion = metrics.confusion_matrix(truth, kmeans.labels_, labels=[0, 1])   # first row: 0 ; second row: 1
print(confusion)

# 'weighted': per-class scores averaged with each class weighted by its support.
print(metrics.precision_score(truth, kmeans.labels_, average='weighted'))
print(metrics.recall_score(truth, kmeans.labels_, average='weighted'))
print(metrics.f1_score(truth, kmeans.labels_, average='weighted'))
