In [1]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import nltk
# Fetch the NLTK resources used below: the stop-word list and the Punkt
# sentence/word tokenizer models that word_tokenize relies on.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the 'punkt_tab' resource instead of the pickled
# 'punkt' models; download both so the notebook runs on either version.
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Process Data

In [2]:
# Load the corpus as a list of documents rather than one big string:
# TfidfVectorizer.fit_transform expects an iterable of raw documents and
# raises ValueError when handed a single str (which is what f.read() returns).
# NOTE(review): assumes plot.txt stores one plot per line - confirm against the file.
with open('/content/drive/MyDrive/Colab Notebooks/plot.txt','r',encoding = 'cp1252') as f:
    # Strip surrounding whitespace and skip blank lines so no empty documents are created.
    plots = [line.strip() for line in f if line.strip()]

In [3]:
def remove_noise(text,stop_words = frozenset(stopwords.words('english'))):
    """Tokenize ``text`` and return its cleaned, lowercase tokens.

    Each token produced by ``word_tokenize`` has every run of characters
    other than ``A-Z``, ``a-z`` and ``0-9`` replaced by a space and is then
    split on whitespace, so a token such as ``"'s"`` or ``"well-known"``
    yields clean pieces (``"s"``; ``"well"``, ``"known"``) instead of a
    single string containing spaces. Pieces of length 1 or less and pieces
    found in ``stop_words`` are dropped.

    Parameters
    ----------
    text : str
        Raw document text.
    stop_words : collection of str
        Lowercase words to discard. Defaults to NLTK's English stop words,
        held in a frozenset so the default is immutable and lookups are O(1).

    Returns
    -------
    list of str
        The retained tokens, lowercased, in their original order.
    """
    cleaned_tokens = []
    for token in word_tokenize(text):
        # Lowercasing first is safe: the pattern treats both cases alike.
        normalized = re.sub('[^A-Za-z0-9]+', ' ', token).lower()
        # Split so that no emitted token carries the substituted whitespace;
        # otherwise ' s' (from "'s") would pass the length and stop-word checks.
        for piece in normalized.split():
            if len(piece) > 1 and piece not in stop_words:
                cleaned_tokens.append(piece)
    return cleaned_tokens

In [4]:
# Import TfidfVectorizer class from sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

# Configure the TF-IDF vectorizer: tokenize with remove_noise, keep at most
# 50 features, and bound term document-frequency with min_df=0.1 / max_df=0.75.
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=remove_noise,
    max_features=50,
    min_df=0.1,
    max_df=0.75,
)

# Learn the vocabulary from plots and build the document-term TF-IDF matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(plots)

In [5]:
# Show the sparse matrix summary (shape and number of stored elements).
tfidf_matrix

<250x50 sparse matrix of type '<class 'numpy.float64'>'
	with 3406 stored elements in Compressed Sparse Row format>

### K-Means Clustering

In [7]:
import warnings
warnings.filterwarnings('ignore')
from scipy.cluster.vq import kmeans,vq

# Generate cluster centers through the kmeans function
num_clusters = 2
top_n = 5          # number of highest-weighted terms to show per cluster
kmeans_seed = 42   # fixed seed so the random centroid initialisation is reproducible

# toarray() gives a plain ndarray; todense() returns the discouraged np.matrix.
# The `seed` keyword requires SciPy >= 1.7.
cluster_centers, distortion = kmeans(tfidf_matrix.toarray(), num_clusters, seed=kmeans_seed)

# Generate terms from the tfidf_vectorizer object.
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back only on older releases.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    terms = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    terms = tfidf_vectorizer.get_feature_names()

# Iterate over the centroids actually returned: kmeans can drop empty
# clusters and return fewer than num_clusters rows.
for i, center in enumerate(cluster_centers):
    # Rank the terms by their weight in this centroid and print the top ones
    center_terms = dict(zip(terms, center))
    sorted_terms = sorted(center_terms, key=center_terms.get, reverse=True)
    print(f"Top {top_n} Terms of Cluster Center {i} ==> ",sorted_terms[:top_n],"\n")

Top 5 Terms of Cluster Center 0 ==>  ['father', 'back', 'one', 'tells', 'money'] 

Top 5 Terms of Cluster Center 1 ==>  ['police', 'man', 'killed', 'wife', 'young'] 



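> Judging by the top terms, Cluster 1 appears to be related to crime and legal affairs (police, killed), while Cluster 0 appears to be related to family matters such as father–son relationships (father, money).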