In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
# NOTE(review): the name `cosine_similarity` bound here is rebound by the
# `sklearn.metrics.pairwise` import further down in this cell, so the torch
# function is never reachable through this name afterwards. This import looks
# redundant — confirm nothing else needs torch and consider removing it.
from torch import cosine_similarity
from preprocess import preprocessing
import json
import pandas as pd
import matplotlib.pyplot as plt
# This import rebinds `cosine_similarity`; it is the binding in effect once the
# cell has run, and it must stay after the torch import to keep winning.
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import MDS
from sklearn.decomposition import TruncatedSVD

In [24]:
def prep_code_to_publication_year(merged_file='data/merged_form.csv'):
    """Build a lookup from story code to publication year.

    Reads ``merged_file`` as CSV and pairs every value of its ``Story Code``
    column with the ``publication_year`` value of the same row.

    Args:
        merged_file: Path of the CSV file to read. Defaults to
            ``'data/merged_form.csv'``, the path this function always used.

    Returns:
        dict mapping story code -> publication year. When a story code
        appears in several rows, the last row wins.

    Raises:
        KeyError: if the CSV lacks a ``Story Code`` or ``publication_year``
            column.
    """
    df = pd.read_csv(merged_file)

    # Pair the two columns directly instead of looping over `iterrows()`:
    # `iterrows()` builds one Series per row (slow) and coerces the row to a
    # common dtype, which can turn integer years into floats when the frame
    # also holds float columns.
    return dict(zip(df['Story Code'], df['publication_year']))

def prep_cluster_result(num_clusters=3, random_state=42):
    """Cluster the preprocessed stories on TF-IDF features and report top terms.

    Steps, in order:
      1. Call ``preprocessing()`` and use the second value it returns, a
         mapping of story code -> plain text.
      2. Fit a ``TfidfVectorizer`` (1- to 3-grams, ``min_df=0.3``,
         ``max_df=0.7``) on the texts.
      3. Compute the pairwise distance matrix ``1 - cosine_similarity``.
      4. Fit ``KMeans`` on the TF-IDF matrix.
      5. Write ``cluster_result.json`` mapping each story code to
         ``{'publication_year': <int>, 'group': <cluster label>}``.
      6. Print and collect the 10 highest-weighted centroid terms of each
         cluster.

    Args:
        num_clusters: Number of k-means clusters. Defaults to 3, the value
            that used to be hard-coded.
        random_state: Seed handed to ``KMeans`` so that cluster labels (and
            therefore the JSON file and the top-term lists) are the same on
            every run. Pass ``None`` for unseeded behaviour.

    Returns:
        Tuple ``(tfidf_matrix, dist, clusters_top_words)``, where
        ``clusters_top_words[i]`` is the list of top terms for cluster ``i``.

    Raises:
        KeyError: if a clustered story code is missing from the mapping
            returned by ``prep_code_to_publication_year()``.
    """
    # Only the plain-text mapping is needed; the first returned value is unused.
    _, code_to_plain_text = preprocessing()
    print("-----Start calculating tf-idf score-----")
    story_codes = list(code_to_plain_text.keys())
    corpus = list(code_to_plain_text.values())
    vectorizer = TfidfVectorizer(
        max_df=0.7,
        min_df=0.3,
        ngram_range=(1, 3)
    )

    tfidf_matrix = vectorizer.fit_transform(corpus)
    # `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer `get_feature_names_out()` and fall back only on old releases.
    # `.tolist()` yields plain Python strings rather than numpy string scalars.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_words = vectorizer.get_feature_names_out().tolist()
    else:
        feature_words = vectorizer.get_feature_names()
    print(len(feature_words))
    print(tfidf_matrix.shape)
    dist = 1 - cosine_similarity(tfidf_matrix)

    # Seed k-means so cluster ids are reproducible between runs, and pin
    # `n_init` to 10 (the long-standing default) because newer scikit-learn
    # releases changed that default.
    km = KMeans(n_clusters=num_clusters, n_init=10, random_state=random_state)
    km.fit(tfidf_matrix)
    clusters = km.labels_.tolist()

    code_to_publication_year = prep_code_to_publication_year()
    code_to_cluster = {
        story_code: {
            'publication_year': int(code_to_publication_year[story_code]),
            'group': cluster,
        }
        for story_code, cluster in zip(story_codes, clusters)
    }

    with open("cluster_result.json", 'w', encoding="utf-8") as file:
        json.dump(code_to_cluster, file)
    print("-----End dumping the cluster result json file.-----")

    print("Top terms per cluster:")

    # For every centroid, feature indices sorted by descending weight.
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    clusters_top_words = []
    for i in range(num_clusters):
        top_words = [feature_words[ind] for ind in order_centroids[i, :10]]
        print(f"Cluster {i} words:")
        # The heading used to be printed with nothing after it; show the terms.
        print(", ".join(top_words))
        clusters_top_words.append(top_words)
    return tfidf_matrix, dist, clusters_top_words

In [25]:
# Run the clustering pipeline and keep only the per-cluster top terms (the
# third returned value). Index the tuple instead of unpacking into `_`:
# assigning `_` at notebook top level rebinds IPython's last-output variable.
clusters_top_words = prep_cluster_result()[2]

-----Start doing preprocessing-----
These stories don't have a reveal border ['ASH01', 'CBSH10', 'LMSY03', 'MC03', 'GPM01', 'OSH05', 'TSOTR15', 'TEV02_02', 'TEV02_01']

These stories are not annotated in input_form.csv {'TEV02': 'TEV02 - Cassie Côté(1).txt', 'TCD03': 'TCD03 - Wen W..txt', 'TCD02': 'TCD02 - Wen W..txt', 'TCD01': 'TCD01 - Wen W..txt'}
Note, TEV02 are annotated, but seperated into TEV02_01 and TEV02_02 in input_form.csv.
However, in plain_texts, TEV02 are two identical files. Since they are identical and they don't have a reveval border sentence, here we exclude them as well.

These stories are not in data/plain_texts ['CKS21'] 

-----End Data Processing----- 
 
 
-----Start paritioning the story by reveal border sentence-----
These cases really do not have reveal border, due to typo or other reasons ['OMIC04', 'ASH09', 'OMIC03', 'CKS53', 'PVDS41', 'CBSH05'] 

The number of files is correct.
-----End partitioning and normalization-----

-----Start calculating tf-idf sco

In [26]:
# Display the top-term lists collected for clusters 0 and 1.
clusters_top_words[0], clusters_top_words[1]

(['mrs',
  'lady',
  'miss',
  'of her',
  'wife',
  'herself',
  'girl',
  'to her',
  'that she',
  'husband'],
 ['machine',
  'thinking',
  'miss',
  'detective',
  'said the',
  'dr',
  'mrs',
  'girl',
  'the girl',
  'professor'])

In [27]:
# Display the top-term list collected for cluster 2.
clusters_top_words[2]

['doctor',
 'said he',
 'mrs',
 'miss',
 'cried',
 'upon the',
 'train',
 'dr',
 'father',
 'round']