In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Number of clusters to build (one per entry in `labels`).
k = 3

# Cluster index -> human-readable name used when reporting assignments.
labels = dict(enumerate(("Left", "Center", "Right")))

# TF-IDF matrix exported to CSV (presumably one row per article -- confirm
# against the script that produced the file).
df = pd.read_csv("tfidf_matrix_content.csv")

In [3]:
def cosine_sim(v1, v2):
    """Return the cosine similarity between two 1-D vectors.

    Args:
        v1: First vector (array-like of numbers).
        v2: Second vector, same length as ``v1``.

    Returns:
        ``dot(v1, v2) / (||v1|| * ||v2||)``. If either vector has zero
        norm the similarity is undefined; 0.0 is returned in that case
        instead of NaN so that callers comparing similarities (argmax)
        are not poisoned by NaN values.
    """
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denom == 0:
        # An all-zero vector has no direction; treat it as similar to nothing.
        return 0.0
    return np.dot(v1, v2) / denom

def assign_clusters(df, clusters):
    """Assign every row of ``df`` to the cluster whose center is most similar.

    Each cluster's ``'points'`` list is emptied first, then every row is
    appended (as a NumPy array) to the cluster with the highest cosine
    similarity to its ``'center'``. One line per row is printed reporting
    the chosen cluster's name from the module-level ``labels`` mapping.

    Args:
        df: DataFrame whose rows are the vectors to cluster.
        clusters: Dict keyed by consecutive ints ``0..n-1``; each value is
            a dict with a ``'center'`` vector and a ``'points'`` list.

    Returns:
        The same ``clusters`` dict, mutated in place.
    """
    for c in clusters.values():
        c['points'].clear()

    for idx in range(df.shape[0]):
        curr_row = df.iloc[idx].to_numpy()

        # Compare the raw similarities. Rounding them first would turn
        # near-equal scores into exact ties, and argmax resolves ties by
        # always picking the lowest cluster index.
        sims = [
            cosine_sim(curr_row, clusters[i]['center'])
            for i in range(len(clusters))
        ]
        best_idx = int(np.argmax(sims))
        clusters[best_idx]['points'].append(curr_row)

        print(f'Row {idx} was classified as {labels[best_idx]}')
    return clusters

def update_clusters(clusters):
    """Move each cluster's center to the mean of its assigned points.

    Args:
        clusters: Dict whose values are dicts with a ``'center'`` vector
            and a ``'points'`` list of equally sized vectors.

    Returns:
        The same ``clusters`` dict, mutated in place. A cluster with no
        assigned points keeps its previous center.
    """
    # Iterate over the clusters actually passed in rather than a global
    # cluster count, so the function works for any number of clusters.
    for cluster in clusters.values():
        points = np.array(cluster['points'])
        if points.shape[0] > 0:
            cluster['center'] = points.mean(axis=0)

    return clusters

def pred_cluster(df, clusters):
    """Predict the cluster index for every row of ``df``.

    Args:
        df: DataFrame whose rows are the vectors to classify.
        clusters: Dict keyed by consecutive ints ``0..n-1``; each value is
            a dict holding a ``'center'`` vector.

    Returns:
        A list with one int per row: the index of the cluster whose center
        has the highest cosine similarity to that row.
    """
    pred = []
    for i in range(df.shape[0]):
        row = df.iloc[i].to_numpy()
        sims = [
            cosine_sim(row, clusters[j]['center'])
            for j in range(len(clusters))
        ]
        # Cosine similarity is larger for closer vectors, so the best
        # cluster is the argmax (argmin would pick the least similar one).
        pred.append(int(np.argmax(sims)))
    return pred

In [None]:
clusters = {}
# Row ranges (start inclusive, end exclusive) holding the left, center and
# right articles, in that order.
ranges = [(0, 47), (47, 95), (95, 134)]

# Seed each cluster with one randomly chosen row from its own class range.
for i, (start, end) in enumerate(ranges):
    # sample() already draws from NumPy's global RNG when random_state is
    # omitted; passing np.random.seed() only supplied None while reseeding
    # the global generator as a side effect.
    sampled_row = df.iloc[start:end].sample(n=1).iloc[0]
    clusters[i] = {
        'center': sampled_row.values,
        'points': []
    }

# Number of assign/update rounds to run.
itr = 1
while itr > 0:
    clusters = assign_clusters(df, clusters)
    clusters = update_clusters(clusters)
    pred = pred_cluster(df, clusters)
    itr -= 1

# Caveat: cluster centers do not necessarily describe the classes.
# Example: if a bunch of left articles are placed in the right cluster,
# the recomputed right center no longer describes right-leaning articles.
count0, count1, count2 = pred.count(0), pred.count(1), pred.count(2)

# Per-class accuracy: share of each class range predicted as that class.
# The denominator is the true size of the range (end - start).
(l_start, l_end), (c_start, c_end), (r_start, r_end) = ranges
left_acc = round((pred[l_start:l_end].count(0) / (l_end - l_start)) * 100, 1)
center_acc = round((pred[c_start:c_end].count(1) / (c_end - c_start)) * 100, 1)
right_acc = round((pred[r_start:r_end].count(2) / (r_end - r_start)) * 100, 1)

print(f'Left Classification Accuracy:   {left_acc}%   Total Counts: {count0}')
print(f'Center Classification Accuracy: {center_acc}%  Total Counts: {count1}')
print(f'Right Classification Accuracy:  {right_acc}%  Total Counts: {count2}')

Row 0 was classified as Left
Row 1 was classified as Left
Row 2 was classified as Right
Row 3 was classified as Right
Row 4 was classified as Right
Row 5 was classified as Right
Row 6 was classified as Left
Row 7 was classified as Left
Row 8 was classified as Right
Row 9 was classified as Right
Row 10 was classified as Right
Row 11 was classified as Left
Row 12 was classified as Right
Row 13 was classified as Right
Row 14 was classified as Right
Row 15 was classified as Center
Row 16 was classified as Right
Row 17 was classified as Center
Row 18 was classified as Left
Row 19 was classified as Left
Row 20 was classified as Center
Row 21 was classified as Center
Row 22 was classified as Left
Row 23 was classified as Center
Row 24 was classified as Right
Row 25 was classified as Right
Row 26 was classified as Left
Row 27 was classified as Left
Row 28 was classified as Right
Row 29 was classified as Left
Row 30 was classified as Center
Row 31 was classified as Left
Row 32 was classified as

Analysis:

In this part, I am performing K-means clustering on both the bag of words and TF-IDF vector models separately. Each vector can be classified as left, center, or right leaning, with rows [0, 46] classified as left, [47, 94] classified as center, and [95, 133] classified as right. I begin the classification by initializing three cluster centers, with one center assigned to a random left-aligned vector, another to a random center-aligned vector, and the last to a random right-aligned vector. I then iterate through each row, calculate the cosine similarity between the row and each of the cluster centers, and group the row into the cluster with the highest similarity.

Issues:

The issue with this approach is the assumption that articles with similar political orientations use the same kinds of words. However, imagine that a large number of right-leaning articles share words with the vector chosen as the left cluster center and differ from the vector chosen as the right cluster center. In this case, a large number of right-leaning vectors would end up in the left cluster. When the center of that cluster is recalculated, the new center no longer describes a left-leaning vector. Therefore, the bag of words and TF-IDF vector models are not an accurate measure for unsupervised clustering.