In [2]:
#Python Packages
import re
import string
import glob

#Data manipulation
import pandas as pd
import numpy as np

#Data visualisation
import matplotlib.pyplot as plt

#Warning
import warnings
warnings.filterwarnings("ignore")

#Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer as tfidf

#Clustering
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

#Scoring
from sklearn.metrics import adjusted_rand_score

#NLTK
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

#scipy
from scipy.cluster.vq import kmeans

In [3]:
pdf = pd.read_csv("../dataset/pdf_data_textract.tsv")

# Training PDF Data

In [4]:
#Cleaning data
pdf = pd.read_csv("../dataset/pdf_data_textract.tsv")
pdf.drop(["Unnamed: 0"],axis=1, inplace=True)
pdf = pdf["file_texts"].dropna()
pdf = pdf.replace(r"\\n", ' ', regex=True).str.replace('\d+', ' ')

In [41]:
pdf[0]

'b\'Mary Hare Report of the Board of Gouernors and Accourits^\\xe \\x \\x  for the Year Endec{] ~Juiy    S&cAir\\\'m^.\\\'hr\\\\& f\\\\rhire> of d&sf Mlcir&n and tjotm^. people-  \\x cCONTENTS Page  Reference and administrative details of the charity, its trustees and advisers     Report of the Board of Governors   -   Independent auditor\\\'s report   -   Consolidated statement of financial activities     Consolidated balance sheet     Company balance sheet     Consolidated cash flow statement     Notes to the financial statements   -   Special note - Martin Mays-Smith (  -  ) Martin Mays-Smith was a funny, generous, thoughtful, wise and caring man who selflessly and tirelessly put his great talent and experience at the service of Mary Hare as a governor for many years. As Chair of Finance, his extraordinary ability to combine mischievous asides with a gimlet eye for detail and consequences was a remarkable force for stability and progress. Above all, Martin cared - he cared for the s

# Test PDF Data

In [5]:
test = pd.read_csv("../dataset/pdf_test.tsv")
test.drop(["Unnamed: 0"],axis=1, inplace=True)
test = test["file_texts"].dropna()
test = test.replace(r"\\n", ' ', regex=True).str.replace('\d+', ' ')

# Stopwords

In [6]:
def clean_docs(doc):
    stop = stopwords.words("english")

In [7]:
def remove_noise(text, stop_words=[]):
    tokens = word_tokenize(text)
    cleaned_tokens = []
    for token in tokens:
        re.sub("[^A-Za-z0-9]+","",token)
        if len(token) > 1 and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

In [8]:
def remove_stops(text,stops):
    text = re.sub(r"[^A-Za-z0-9]+","",text)
    words = text.split()
    final = []
    for word in words:
        final.append(word)
    final = " ".join(final)
    final = final.translate(str.maketrans("","",string.punctuation))
    final = "".join([i for i in final if not i.isdigit()])
    while "  " in final:
        final = final.replace("  "," ")

# Bag of words

In [None]:
def tokenize_all()

In [51]:

    
corpus = nltk.sent_tokenize(pdf[0])

corpuses = [nltk.sent_tokenize(text) for text in pdf]

3408

In [40]:
for i in range(len(corpus)):
    corpus [i] = corpus [i].lower()
    corpus [i] = re.sub(r'\W',' ',corpus [i])
    corpus [i] = re.sub(r'\s+',' ',corpus [i])
    print(corpus[i])
print(len(corpus))

b mary hare report of the board of gouernors and accourits xe x x for the year endec juiy s cair m hr f rhire of d sf mlcir n and tjotm 
people x ccontents page reference and administrative details of the charity its trustees and advisers report of the board of governors independent auditor s report consolidated statement of financial activities consolidated balance sheet company balance sheet consolidated cash flow statement notes to the financial statements special note martin mays smith martin mays smith was a funny generous thoughtful wise and caring man who selflessly and tirelessly put his great talent and experience at the service of mary hare as a governor for many years 
as chair of finance his extraordinary ability to combine mischievous asides with a gimlet eye for detail and consequences was a remarkable force for stability and progress 
above all martin cared he cared for the school he cared deeply for the young people we are here to help and he cared for the families who 

In [45]:
wordfreq = {}
for sentence in corpus:
    tokens = nltk.word_tokenize(sentence)
    for token in tokens:
        if len(token) > 3:
            if token not in wordfreq.keys():
                wordfreq[token] = 1
            else:
                wordfreq[token] += 1
            

In [48]:
wordfreq

{'mary': 114,
 'hare': 145,
 'report': 27,
 'board': 12,
 'gouernors': 1,
 'accourits': 1,
 'year': 111,
 'endec': 1,
 'juiy': 1,
 'cair': 1,
 'rhire': 1,
 'mlcir': 1,
 'tjotm': 1,
 'people': 3,
 'ccontents': 1,
 'page': 36,
 'reference': 4,
 'administrative': 4,
 'details': 9,
 'charity': 17,
 'trustees': 9,
 'advisers': 4,
 'governors': 36,
 'independent': 6,
 'auditor': 9,
 'consolidated': 6,
 'statement': 20,
 'financial': 67,
 'activities': 38,
 'balance': 7,
 'sheet': 7,
 'company': 47,
 'cash': 19,
 'flow': 5,
 'notes': 26,
 'statements': 47,
 'special': 6,
 'note': 12,
 'martin': 4,
 'mays': 5,
 'smith': 5,
 'funny': 1,
 'generous': 1,
 'thoughtful': 1,
 'wise': 1,
 'caring': 1,
 'selflessly': 1,
 'tirelessly': 1,
 'great': 2,
 'talent': 1,
 'experience': 5,
 'service': 13,
 'governor': 6,
 'many': 1,
 'years': 12,
 'chair': 1,
 'finance': 8,
 'extraordinary': 1,
 'ability': 2,
 'combine': 1,
 'mischievous': 1,
 'asides': 1,
 'with': 58,
 'gimlet': 1,
 'detail': 1,
 'consequenc

# Vectorizer

In [9]:
vectorizer = tfidf(analyzer="word",
                   stop_words="english",
                   lowercase=True,
                   min_df=0.2, 
                   max_df=0.9, 
                   use_idf=True, 
                   smooth_idf=True,
                   tokenizer=remove_noise)

In [10]:
pdf_matrix = vectorizer.fit_transform(pdf)
X = pdf_matrix.todense()
X_list = X.tolist()
words = vectorizer.get_feature_names()
train_df = pd.DataFrame(X_list, columns=words)

In [11]:
all_keywords = []
for description in X_list:
    x=0
    keywords = []
    for word in description:
        if word > 0:
            keywords.append(words[x])
        x += 1
    all_keywords.append(keywords)

In [12]:
test_vectorizer = tfidf(analyzer="word",
                   stop_words="english", 
                   min_df=0, 
                   max_df=1, 
                   use_idf=True, 
                   smooth_idf=True)

In [13]:
pdf_test = test_vectorizer.fit_transform(test)
X_test = pdf_test.todense()
X_test_list = X_test.tolist()
test_terms = test_vectorizer.get_feature_names()
test_df = pd.DataFrame(X_test_list, test_terms)

# KMeans Tuning

In [14]:
#hyperparameter tuning
n_clusters = 5
n_iters = 100
n_seed = 10

In [15]:
kmeans = KMeans(n_clusters=n_clusters,
               init="k-means++",
               max_iter=n_iters,
               n_init=1, 
               precompute_distances="auto",
               n_jobs=1)

In [16]:
kmeans.fit(train_df)

KMeans(max_iter=100, n_clusters=5, n_init=1, n_jobs=1,
       precompute_distances='auto')

In [17]:
centroids = kmeans.cluster_centers_.argsort()[:,::-1]

In [18]:
for i in range(n_clusters):
    print(f"Cluster {i + 1}:")
    for ind in centroids[i,:10]:
        print(f"{words[ind]}")

Cluster 1:
college
\xa
\xc
scheme
\xe
statements
assets
's
total
pension
Cluster 2:
church
\xe
\xa
accounts
\xc
's
st
total
information
community
Cluster 3:
c\x
\xa
school
church
\xe
accounts
\xc
examination
time
charity\xe
Cluster 4:
\xa
\xc
\xe
statements
total
company
ended
assets
accounts
costs
Cluster 5:
statements
's
company
total
ended
assets
costs
accounting
march
charitable


# Modeling

In [19]:
k_mean_indices = kmeans.fit_transform(pdf_matrix.toarray())

In [20]:
scatter_plot_points = PCA(n_components = pca_num_components).fit_transform(X)

NameError: name 'pca_num_components' is not defined

In [None]:
colors = ["r","b","y","c","m"]

In [None]:
x_axis = [p[0] for p in scatter_plot_points]
y_axis = [p[1] for p in scatter_plot_points]

In [None]:
fig, ax = plt.subplots(figsize=(20,20))
ax.scatter(x_axis,y_axis)

# Prediction

In [None]:
test_label = kmeans.fit_predict(pdf_test)
print(test_label)
print(set(test_label))

In [65]:
train_labels = kmeans.fit_predict(train_df)
print(train_labels)
print(set(train_labels))

[1 3 2 ... 2 4 3]
{0, 1, 2, 3, 4}
