In [269]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
import pandas as pd 

style.use('ggplot')

class K_Means:
    def __init__(self, k =3, tolerance = 0.0001, max_iterations = 500):
        self.k = k
        self.tolerance = tolerance
        self.max_iterations = max_iterations

    def fit(self, data):

        self.centroids = {}

        #initialize the centroids, the first 'k' elements in the dataset will be our initial centroids
        for i in range(self.k):
            self.centroids[i] = data[["N", "ya"]].values[i]

        #begin iterations
        for i in range(self.max_iterations):
            self.classes = {}
            for i in range(self.k):
                self.classes[i] = []

            #find the distance between the point and cluster; choose the nearest centroid
            i = 1
            for index, row in data.iterrows():
                #print("features", features)
                distances = [np.linalg.norm(row[["N", "ya"]].values - self.centroids[centroid]) for centroid in self.centroids]
                classification = distances.index(min(distances))
                self.classes[classification].append(row)
                i += 1

            previous = dict(self.centroids)

            #average the cluster datapoints to re-calculate the centroids
            for classification in self.classes:
                self.centroids[classification] = np.average([row[["N", "ya"]].values for row in self.classes[classification]], axis = 0)

            isOptimal = True

            for centroid in self.centroids:

                original_centroid = previous[centroid]
                curr = self.centroids[centroid]

                if len(original_centroid) > 0:
                    if np.sum((curr - original_centroid)/original_centroid * 100.0) > self.tolerance:
                        isOptimal = False

            #break out of the main loop if the results are optimal, ie. the centroids don't change their positions much(more than our tolerance)
            if isOptimal:
                break

    def pred(self, data):
        distances = [np.linalg.norm(data[["N", "ya"]].values - self.centroids[centroid]) for centroid in self.centroids]
        classification = distances.index(min(distances))
        return classification


In [270]:
def pre_processing(df):
    
    stop_words = set(stopwords.words('english'))
    
    df['text'] = df['text'].str.lower().replace("[^A-Za-z\s]", "")
    
    #df['text'] = df['text'].apply(lambda x: [x if x not in stopwords.words('english')])

    #df['text'] = df['text'].apply(lambda x: [item for item in x if len(nltk.pos_tag(item)) > 0 and nltk.pos_tag(item)[0] in target_tags])
    
    return df
    

In [271]:
def pre_processing_noun_counts(df):
    
    txt = df['text'].apply(nltk.word_tokenize)
    count = txt.apply(lambda x: 10 * len([word for word,pos in nltk.pos_tag(x) if (pos == 'NN' or pos == 'NNP' or pos == 'NNS' or pos == 'NNPS')])) 
    df['N'] = count 
    

    count = txt.apply(lambda x: 10 * len([word for word,pos in nltk.pos_tag(x) if pos.startswith('J')]))
    df['ADJ'] = count 
    
    
    df["sentenceLength"] = len(txt)
     
    return df

In [272]:


def main():
    df = pd.read_csv("DigitalDemocracy/committee_utterances.tsv", sep="\t")

    df = df.iloc[2:4000]
    
    df = pre_processing(df)
    df = pre_processing_noun_counts(df)
    #print(df.head())
    #return
        
    #df["hi"] = df["text"].str.len()
    df["ya"] = df["text"].str.len()
    
    labels = df['c_name'].value_counts()
    print(labels)
    #print(df.head())
    
    #df = df[["hi", "ya"]]
    
    
    #dataset = df.astype(float).values.tolist()
    X = df #returns a numpy array
    km = K_Means(len(labels))
    km.fit(X)
    
     # Plotting starts here
    colors = 10*["r", "g", "c", "b", "k"]

    print(km.centroids)
    for centroid in km.centroids:
        plt.scatter(km.centroids[centroid][0], km.centroids[centroid][1], s = 130, marker = "x")

    for classification in km.classes:
        color = colors[classification]
        for features in km.classes[classification]:
            plt.scatter(features[0], features[1], color = color,s = 30)
            
            
    purity_total_rows = len(df)
    purity_max_sum = 0

    rows = {}
    sum_f1 = 0

    print('length', len(km.classes))
    for cluster in range(0, len(km.classes)):
        max_label = ""
        max_label_amt = 0
        row = []
        for label in labels.keys():
            items_per_label = len([x for x in km.classes[cluster] if x['c_name'] == label])
            print(items_per_label)
            row.append(items_per_label)
            if(items_per_label > max_label_amt):
                max_label_amt = items_per_label
                max_label = label
        
        #precision = max_label_amt / len(km.classes[cluster])
        #recall = max_label_amt / labels[max_label]
        purity_max_sum += max(row) #Add maximum label value present in cluster
        print(purity_max_sum)
        #f1 = (2 * precision * recall) / (precision + recall)
        
        #sum_f1 += f1
    
    #average_f1 = sum_f1/len(km.classes)

    
    #print("Average F1", average_f1) #F1 Avg Calculation
    print("Purity", purity_max_sum/purity_total_rows) #Purity Calculation
    
   
   

if __name__ == "__main__":
    main()

2       [limited, water, resources, are, all, challeng...
3       [and, we, 've, invited, expert, witnesses, ,, ...
4       [drought, is, certainly, a, critical, issue, f...
5       [so, ,, i, appreciate, you, being, here, today...
6       [with, that, ,, we, will, go, ahead, and, begi...
7               [no, ,, we, 'll, get, right, into, it, .]
8       [okay, ,, okay, ,, very, good, ., okay, ., i, ...
9       [good, morning, ,, and, thank, you, mr., peria...
10      [to, get, to, the, crux, of, this, discussion,...
11      [and, we, need, to, prepare, ,, better, prepar...
12      [it, 's, important, to, note, that, these, imp...
13      [and, i, 'm, sure, they, can, tell, a, much, b...
14      [doctor, lund, here, with, me, today, ,, with,...
15      [this, has, resulted, in, a, direct, cost, to,...
16      [there, 's, estimates, that, nearly, 30, milli...
17      [livestock, producers, alike, have, also, expe...
18      [and, are, the, sole, producer, of, 14, commod...
19      [so, ,

ZeroDivisionError: float division by zero