In [41]:
import warnings
warnings.filterwarnings('ignore')

from scipy.spatial.distance import cosine
import pandas as pd

# Load the TF-IDF table; the first spreadsheet column is used as the row index.
tfidf_vectors = pd.read_excel('tfidf_data.xlsx', index_col=0)

# Rearrange the columns into sorted label order so later cells see a
# deterministic column layout regardless of how the spreadsheet was written.
ordered_columns = sorted(tfidf_vectors.columns)
tfidf_vectors = tfidf_vectors.reindex(columns=ordered_columns)

In [42]:
# Seed one centroid per topic (6 topics = 6 centroids). Each centroid column
# starts out as a copy of one hand-picked document column for that topic.
SEED_DOCUMENTS = {
    'sports_centroid': '9901_sports.txt-tfidf',
    'food_centroid': '9902_food.txt-tfidf',
    'tech_centroid': '9903_tech.txt-tfidf',
    'science_centroid': '9904_science.txt-tfidf',
    'business_centroid': '9905_business.txt-tfidf',
    'politics_centroid': '9906_politics.txt-tfidf',
}
for centroid_name, seed_column in SEED_DOCUMENTS.items():
    tfidf_vectors[centroid_name] = tfidf_vectors[seed_column]

centroid_names = list(SEED_DOCUMENTS)

# Only columns whose label contains 'tfidf' are documents; this filter skips
# the centroid columns added above and any other non-document column.
document_columns = [column for column in tfidf_vectors.columns if 'tfidf' in str(column)]

# Calculate the cosine distance between each document vector and each
# centroid, rounded to 10 decimals. All rows are collected first and the
# DataFrame is built once, instead of enlarging it row by row with .loc
# (which re-copies the frame on every insert and can leave object-dtype columns).
distance_rows = {
    str(column): [
        round(cosine(tfidf_vectors[column], tfidf_vectors[centroid_name]), 10)
        for centroid_name in centroid_names
    ]
    for column in document_columns
}

# Distance matrix: one row per document, one column per centroid.
distance_matrix = pd.DataFrame.from_dict(distance_rows, orient='index', columns=centroid_names)

#print for debugging
# distance_matrix.idxmin(axis=1)
# distance_matrix
# tfidf_vectors

Unnamed: 0,sports_centroid,food_centroid,tech_centroid,science_centroid,business_centroid,politics_centroid
0101_sports.txt-tfidf,0.978032,0.988298,0.978044,0.986262,0.992155,0.974214
0102_sports.txt-tfidf,0.979647,0.987683,0.985832,0.993829,0.992554,0.984454
0103_sports.txt-tfidf,0.971782,0.997671,0.988554,0.997469,0.985469,0.982265
0104_sports.txt-tfidf,0.976885,0.988731,0.984712,0.997317,0.993665,0.986285
0105_food.txt-tfidf,0.971328,0.986045,0.987628,0.987135,0.988805,0.986240
...,...,...,...,...,...,...
9902_food.txt-tfidf,0.996915,0.000000,0.987351,0.996724,0.992254,0.974793
9903_tech.txt-tfidf,0.995198,0.987351,0.000000,0.992853,0.982680,0.968343
9904_science.txt-tfidf,0.994479,0.996724,0.992853,0.000000,0.982778,0.988074
9905_business.txt-tfidf,0.984745,0.992254,0.982680,0.982778,0.000000,0.986981


In [43]:
# Create one (initially empty) cluster per topic.
sports_cluster = []
food_cluster = []
tech_cluster = []
science_cluster = []
business_cluster = []
politics_cluster = []

# Topic keyword -> the cluster list that collects documents whose nearest
# centroid label contains that keyword.
keyword_to_cluster = (
    ('sports', sports_cluster),
    ('food', food_cluster),
    ('tech', tech_cluster),
    ('science', science_cluster),
    ('business', business_cluster),
    ('politics', politics_cluster),
)

# Add each document to the cluster of its closest centroid, i.e. the column
# holding the smallest distance in that document's row.
for doc in distance_matrix.index:
    nearest = str(distance_matrix.loc[doc].idxmin())
    for keyword, members in keyword_to_cluster:
        if keyword in nearest:
            members.append(str(doc))
        

In [44]:
# Debugging aid: uncomment one of the names below to display it
# sports_cluster
# food_cluster
# tech_cluster
# science_cluster
# business_cluster
# politics_cluster
# distance_matrix


['0102_sports.txt-tfidf',
 '0103_sports.txt-tfidf',
 '0104_sports.txt-tfidf',
 '0105_food.txt-tfidf',
 '0106_sports.txt-tfidf',
 '0107_sports.txt-tfidf',
 '0210_sports.txt-tfidf',
 '0504_business.txt-tfidf',
 '9901_sports.txt-tfidf']

In [45]:
# Recalculate each centroid from the documents currently assigned to its cluster.
# Note: the new centroid is the row-wise SUM of the member columns, not their
# mean. Scaling a vector by a positive constant does not change its cosine
# distance to other vectors, so the cosine-based assignment is unaffected.
cluster_members = (
    ('sports_centroid', sports_cluster),
    ('food_centroid', food_cluster),
    ('tech_centroid', tech_cluster),
    ('science_centroid', science_cluster),
    ('business_centroid', business_cluster),
    ('politics_centroid', politics_cluster),
)
for centroid_name, members in cluster_members:
    tfidf_vectors[centroid_name] = tfidf_vectors[members].sum(axis=1)

#print for debugging
# tfidf_vectors

Unnamed: 0,0101_sports.txt-tfidf,0102_sports.txt-tfidf,0103_sports.txt-tfidf,0104_sports.txt-tfidf,0105_food.txt-tfidf,0106_sports.txt-tfidf,0107_sports.txt-tfidf,0108_sports.txt-tfidf,0109_sports.txt-tfidf,0110_food.txt-tfidf,...,9904_science.txt-tfidf,9905_business.txt-tfidf,9906_politics.txt-tfidf,idf,sports_centroid,food_centroid,tech_centroid,science_centroid,business_centroid,politics_centroid
aapl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00000,0.0,21.74766,0.0,0.00000,21.74766,0.0,0.00000,0.00000
aaron,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00000,0.0,38.12397,0.0,0.00000,25.41598,0.0,0.00000,12.70799
ab,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00000,0.0,21.74766,0.0,0.00000,0.00000,0.0,0.00000,21.74766
abalone,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00000,0.0,21.74766,0.0,21.74766,0.00000,0.0,0.00000,0.00000
abandon,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00000,0.0,21.74766,0.0,0.00000,21.74766,0.0,0.00000,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zoom,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00000,0.0,21.74766,0.0,0.00000,0.00000,0.0,0.00000,21.74766
zucchini,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00000,0.0,21.74766,0.0,21.74766,0.00000,0.0,0.00000,0.00000
zuckerberg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,9.32692,0.0,46.63460,0.0,0.00000,37.30768,0.0,9.32692,0.00000
zuckerbergfacebookvirtual,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00000,0.0,21.74766,0.0,0.00000,21.74766,0.0,0.00000,0.00000


In [62]:
# Export only the centroid columns to Excel.
# tfidf_vectors itself is deliberately left untouched: the "second iteration"
# cell that follows still needs the per-document columns, and dropping them
# in place here would leave it with nothing to measure when the notebook is
# run top to bottom. str() keeps the membership test safe for non-string labels.
centroid_columns = [column for column in tfidf_vectors.columns if 'centroid' in str(column)]
tfidf_vectors[centroid_columns].to_excel('centroids.xlsx')

In [46]:
# Second iteration: recompute the distance matrix against the current centroids.
centroid_names = [
    'sports_centroid',
    'food_centroid',
    'tech_centroid',
    'science_centroid',
    'business_centroid',
    'politics_centroid',
]

# Only columns whose label contains 'tfidf' are documents; this filter skips
# the centroid columns and any other non-document column.
document_columns = [column for column in tfidf_vectors.columns if 'tfidf' in str(column)]

# Recalculate the cosine distance between each document vector and each
# centroid, rounded to 10 decimals. All rows are collected first and the
# DataFrame is built once, instead of enlarging it row by row with .loc
# (which re-copies the frame on every insert and can leave object-dtype columns).
distance_rows = {
    str(column): [
        round(cosine(tfidf_vectors[column], tfidf_vectors[centroid_name]), 10)
        for centroid_name in centroid_names
    ]
    for column in document_columns
}

# Distance matrix: one row per document, one column per centroid.
distance_matrix = pd.DataFrame.from_dict(distance_rows, orient='index', columns=centroid_names)

#print for debugging        
# distance_matrix

Unnamed: 0,sports_centroid,food_centroid,tech_centroid,science_centroid,business_centroid,politics_centroid
0101_sports.txt-tfidf,0.900068,0.946695,0.899452,0.952950,0.968958,0.604966
0102_sports.txt-tfidf,0.714919,0.958793,0.938940,0.977811,0.982997,0.931988
0103_sports.txt-tfidf,0.682399,0.963886,0.945361,0.982056,0.968777,0.937643
0104_sports.txt-tfidf,0.630783,0.959594,0.928944,0.977831,0.975610,0.932573
0105_food.txt-tfidf,0.503312,0.916600,0.916966,0.957048,0.962911,0.924593
...,...,...,...,...,...,...
9902_food.txt-tfidf,0.979249,0.742870,0.951226,0.973771,0.982942,0.956508
9903_tech.txt-tfidf,0.970018,0.964304,0.760536,0.965264,0.971760,0.928194
9904_science.txt-tfidf,0.983491,0.975888,0.970338,0.720610,0.981761,0.969391
9905_business.txt-tfidf,0.974263,0.975262,0.955841,0.985615,0.599003,0.966623


In [47]:
# Recreate the clusters: start again from one empty list per topic.
sports_cluster = []
food_cluster = []
tech_cluster = []
science_cluster = []
business_cluster = []
politics_cluster = []

# Topic keyword -> the cluster list that collects documents whose nearest
# centroid label contains that keyword.
keyword_to_cluster = (
    ('sports', sports_cluster),
    ('food', food_cluster),
    ('tech', tech_cluster),
    ('science', science_cluster),
    ('business', business_cluster),
    ('politics', politics_cluster),
)

# Add each document to the cluster of its closest centroid, i.e. the column
# holding the smallest distance in that document's row.
for doc in distance_matrix.index:
    nearest = str(distance_matrix.loc[doc].idxmin())
    for keyword, members in keyword_to_cluster:
        if keyword in nearest:
            members.append(str(doc))

In [48]:
# sports_cluster
# food_cluster
# tech_cluster
# science_cluster
# business_cluster
# politics_cluster

['0102_sports.txt-tfidf',
 '0103_sports.txt-tfidf',
 '0104_sports.txt-tfidf',
 '0105_food.txt-tfidf',
 '0106_sports.txt-tfidf',
 '0107_sports.txt-tfidf',
 '0210_sports.txt-tfidf',
 '0504_business.txt-tfidf',
 '9901_sports.txt-tfidf']

In [60]:
def _single_column_frame(members, label):
    """Wrap one cluster's document names in a one-column DataFrame named `label`."""
    return pd.DataFrame(members, columns=[label])


# One single-column DataFrame per cluster.
sports_df = _single_column_frame(sports_cluster, 'sports_cluster')
food_df = _single_column_frame(food_cluster, 'food_cluster')
tech_df = _single_column_frame(tech_cluster, 'tech_cluster')
science_df = _single_column_frame(science_cluster, 'science_cluster')
business_df = _single_column_frame(business_cluster, 'business_cluster')
politics_df = _single_column_frame(politics_cluster, 'politics_cluster')

# Place the clusters side by side (one column each) and export to Excel.
cluster_frames = [sports_df, food_df, tech_df, science_df, business_df, politics_df]
all_df = pd.concat(cluster_frames, axis=1)
all_df.to_excel('clusters.xlsx')