In [1]:
import pandas as pd
import numpy as np
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import sklearn.cluster as sk_cluster
import sklearn.feature_extraction.text as sk_text
import sklearn.metrics as metrics
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import dendrogram, linkage      

# Render matplotlib figures inline, in the output area of the cell that draws them.
%matplotlib inline
# Never truncate long strings when DataFrames are displayed.
# `None` is the supported "no limit" value; the old `-1` sentinel was
# deprecated in pandas 1.0 and is rejected by newer releases.
pd.set_option('display.max_colwidth', None)

In [2]:
# Load the cleaned dataset and the user -> class table.
# The class file is tab-separated and is read with explicitly supplied
# column names ('UserID', 'Class').
cleaned_data = pd.read_csv('cleaned_data.csv')
clinton_trump_texts = pd.read_table(
    'clinton_trump_user_classes.txt',
    sep='\t',
    names=['UserID', 'Class'],
)

In [3]:
# Build one "document" per user: every Hash string of that user, joined by spaces.
clinton_trump_texts_agg = (
    cleaned_data
    .groupby('UserID')['Hash']
    .apply(lambda tags: " ".join(tags))
)

# The same data as a flat two-column frame (user id, concatenated hashtag text).
clinton_trump_sklearn = pd.DataFrame({
    'User_id': clinton_trump_texts_agg.index,
    'All_hashtag': clinton_trump_texts_agg.values,
})

In [4]:
# TF-IDF over each user's concatenated hashtag text.
#   max_features=10 -> vocabulary capped at 10 terms
#   min_df=1        -> a term needs to occur in at least one document
#   max_df=300      -> integer: ignore terms occurring in more than 300 documents
vectorizer = sk_text.TfidfVectorizer(
    stop_words='english',
    max_features=10,
    min_df=1,
    max_df=300,
)
matrix = vectorizer.fit_transform(clinton_trump_sklearn['All_hashtag'].values)

# Dense ndarray copy of the sparse result; the clustering cells consume this.
tfidf = matrix.toarray()

In [5]:
# Wrap the TF-IDF matrix in a DataFrame: one row per user, one column per term.
# `get_feature_names()` was removed in scikit-learn 1.2, so prefer its
# replacement `get_feature_names_out()` and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# `toarray()` gives a plain ndarray; `todense()` returns the legacy np.matrix type.
df = pd.DataFrame(
    matrix.toarray(),
    index=clinton_trump_texts_agg.index,
    columns=feature_names,
)

In [6]:
# Attach each user's class label to their TF-IDF row. This is an inner join
# on UserID, so only users present on both sides are kept.
merged_df = df.merge(clinton_trump_texts, how='inner', on='UserID')

## K-means

In [None]:
%%time
kmeans = sk_cluster.KMeans(n_clusters=2, n_init=10, max_iter=300)
kmeans_label = kmeans.fit_predict(tfidf)

In [None]:
# Show the cluster index k-means assigned to each row of `tfidf`.
print('\nPrinting cluster assignment:', kmeans_label, sep='\n')

In [None]:
# inertia_ is the SSE: the sum of squared distances of the samples to their
# closest cluster center.
error = kmeans.inertia_

print('\n')
print("The total error of the clustering is: ", error)

In [None]:
# Ground-truth class label per user, taken from the merged frame.
truth = np.array(merged_df['Class'].values)

# The clusterings are fitted on `tfidf`, while these labels come from
# `merged_df`. The inner merge can drop or duplicate users, which would
# misalign labels and predictions, so fail early if the row counts differ.
# (Equal length is a necessary check, not a full proof of alignment.)
assert truth.shape[0] == tfidf.shape[0], (
    "merged_df has %d rows but tfidf has %d; ground truth and cluster "
    "labels would not line up" % (truth.shape[0], tfidf.shape[0])
)
truth

In [None]:
# Rows are the true classes (0, then 1); columns are the k-means cluster ids.
# NOTE(review): cluster ids are arbitrary, so cluster 0 is not guaranteed to
# correspond to class 0 — confirm the mapping before reading the diagonal.
confusion = metrics.confusion_matrix(
    y_true=truth,
    y_pred=kmeans.labels_,
    labels=[0, 1],
)
print(confusion)

In [None]:
# 'weighted': precision is computed per label, then averaged with each label
# weighted by its number of true instances.
metrics.precision_score(
    y_true=truth,
    y_pred=kmeans.labels_,
    average='weighted',
)


In [None]:
# 'weighted': recall is computed per label, then averaged with each label
# weighted by its number of true instances.
metrics.recall_score(
    y_true=truth,
    y_pred=kmeans.labels_,
    average='weighted',
)


In [None]:
# 'weighted': F1 is computed per label, then averaged with each label
# weighted by its number of true instances.
metrics.f1_score(
    y_true=truth,
    y_pred=kmeans.labels_,
    average='weighted',
)


## Agglomerative Clustering

### MAX based (complete linkage)

In [7]:
%%time
ag = sk_cluster.AgglomerativeClustering(linkage = 'complete', n_clusters = 2) 

Wall time: 0 ns


In [11]:
# Fit on the dense TF-IDF matrix and keep the per-row cluster labels.
# NOTE(review): the recorded run of this cell ended in MemoryError —
# presumably there are too many rows for this method; confirm, and consider
# clustering a sample instead.
ag_labels = ag.fit(tfidf).labels_

MemoryError: 

In [10]:
# Show the complete-linkage cluster assignment.
# The message spelling is corrected ("cluseter" -> "cluster") so it matches
# the wording used for the k-means assignment.
print('\nPrinting cluster assignment:')
ag_labels


Printing cluseter assignment:


NameError: name 'ag_labels' is not defined

In [None]:
ag = linkage(merged_df, method='complete')

In [None]:
dendrogram(ag)  

In [None]:
# Rows are the true classes (0, then 1); columns are the cluster ids.
# Use the label array returned by fit_predict instead of going through `ag`,
# so this cell does not depend on what `ag` currently refers to.
confusion = metrics.confusion_matrix(truth, ag_labels, labels=[0, 1])
print(confusion)

In [None]:
# `truth` holds the class labels and `ag_labels` the complete-linkage
# assignment (the names `true` and `ag1` are not defined in this notebook).
# 'weighted': per-label precision averaged by each label's support.
metrics.precision_score(truth, ag_labels, average='weighted')


In [None]:
# `truth` holds the class labels and `ag_labels` the complete-linkage
# assignment (the names `true` and `ag1` are not defined in this notebook).
# 'weighted': per-label recall averaged by each label's support.
metrics.recall_score(truth, ag_labels, average='weighted')


In [None]:
# `truth` holds the class labels and `ag_labels` the complete-linkage
# assignment (the names `true` and `ag1` are not defined in this notebook).
# 'weighted': per-label F1 averaged by each label's support.
metrics.f1_score(truth, ag_labels, average='weighted')


### SSE based (Ward linkage)

In [None]:
# Ward ("SSE") linkage: at each step, merge the two clusters whose union
# gives the smallest increase in within-cluster variance.
ag2 = sk_cluster.AgglomerativeClustering(n_clusters=2, linkage='ward')

In [None]:
# Fit on the TF-IDF feature columns only. `merged_df` also contains the user
# id and the ground-truth `Class`; clustering on those would leak the label
# and treat an identifier as a feature.
ag_labels2 = ag2.fit_predict(
    merged_df.drop(columns=['UserID', 'Class'], errors='ignore')
)

In [None]:
# Show the Ward-linkage cluster assignment.
# The message spelling is corrected ("cluseter" -> "cluster") so it matches
# the wording used for the k-means assignment.
print('\nPrinting cluster assignment:')
ag_labels2

In [None]:
ag2 = linkage(merged_df, method='ward') 

In [None]:
dendrogram(ag2)  

In [None]:
# Rows are the true classes (0, then 1); columns are the cluster ids.
# `truth` is the ground-truth array (the name `true` is not defined in this
# notebook). The label array returned by fit_predict is used directly, so
# this cell does not depend on what `ag2` currently refers to.
confusion = metrics.confusion_matrix(truth, ag_labels2, labels=[0, 1])
print(confusion)

In [None]:
# `truth` holds the class labels and `ag_labels2` the Ward-linkage assignment
# (the name `true` is not defined in this notebook).
# 'weighted': per-label precision averaged by each label's support.
metrics.precision_score(truth, ag_labels2, average='weighted')


In [None]:
# `truth` holds the class labels and `ag_labels2` the Ward-linkage assignment
# (the name `true` is not defined in this notebook).
# 'weighted': per-label recall averaged by each label's support.
metrics.recall_score(truth, ag_labels2, average='weighted')


In [None]:
# `truth` holds the class labels and `ag_labels2` the Ward-linkage assignment
# (the name `true` is not defined in this notebook).
# 'weighted': per-label F1 averaged by each label's support.
metrics.f1_score(truth, ag_labels2, average='weighted')


## Task 1.2

### For k-means, look at the two centers (centroids) and print the top-30 hashtags/handles with the highest tfidf values.

In [None]:
# Prints the sparse TF-IDF matrix returned by the vectorizer.
# NOTE(review): this does not yet list the top-30 terms per k-means centroid
# that the heading above asks for — TODO.
print(matrix)