### Importing packages

In [44]:
import json
import string
import pandas as pd
import os
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import PlaintextCorpusReader, stopwords
from nltk import download as download_nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
download_nltk("punkt_tab")
download_nltk("stopwords")

DEBUG = False

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\s-A013-16\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\s-A013-16\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Creating documents corpus

In [45]:
corpus_dir = "./Literature-original"
corpus = PlaintextCorpusReader(corpus_dir, ".*\.txt")
file_names = corpus.fileids()

### Corpus documents preprocessing

In [46]:
documents = {}
for file_name in file_names:
    documents[file_name] = corpus.raw(file_name)

In [47]:
lengths = {}
for file_name in documents:
    lengths[file_name] = {
        "pre": len(word_tokenize(documents[file_name]))
    }

In [48]:
ps = PorterStemmer()

In [49]:
for file_name in documents:
    documents[file_name] = documents[file_name].lower()
    documents[file_name] = "".join((char for char in documents[file_name] if char not in string.punctuation))
    documents[file_name] = "".join((char for char in documents[file_name] if not char.isdigit()))
    documents[file_name] = " ".join(ps.stem(word) for word in word_tokenize(documents[file_name]))
    documents[file_name] = " ".join((word for word in word_tokenize(documents[file_name]) if word not in list (stopwords.words('english'))))

In [50]:
print(json.dumps(documents, indent=4, ensure_ascii=False))

{
    "Chronicles of Narnia. Prince Caspian.txt": "peter susan edmund luci pevensi magic whisk away british railway station beach near old ruin castl determin ruin cair paravel onc rule king queen narnia discov treasur vault peter sword shield susan bow arrow luci dagger bottl magic cordial store susan horn summon help miss left wood day return england prior visit narnia although onli year ha pass england year pass narniaa children rescu trumpkin dwarf soldier drown trumpkin tell children narnia histori sinc disappear telmarin conquer narnia rule king miraz hi wife queen prunaprismia miraz usurp throne kill hi brother king caspian ix father princ caspian miraz toler right heir princ caspian hi son wa born caspian escap miraz castl aid hi tutor doctor corneliu school lore old narnia gave queen susan horn caspian fled forest wa knock unconsci hi hors bolt awok den talk badger trufflehunt two dwarf nikabrik trumpkin accept caspian king badger dwarv took caspian meet mani creatur old narni

In [51]:
for file_name in documents:
    lengths[file_name]['post'] = len(word_tokenize(documents[file_name]))

In [52]:
lengths = pd.DataFrame.from_dict(lengths, orient="index")

In [53]:
lengths['diff'] = lengths['pre']-lengths['post']
lengths['pct'] = lengths['diff']/lengths['pre']
lengths

Unnamed: 0,pre,post,diff,pct
Chronicles of Narnia. Prince Caspian.txt,657,339,318,0.484018
Chronicles of Narnia. The Horse and His Boy.txt,850,448,402,0.472941
Chronicles of Narnia. The Last Battle.txt,1101,562,539,0.489555
"Chronicles of Narnia. The Lion, the Witch and the Wardrobe.txt",793,389,404,0.509458
Chronicles of Narnia. The Magicians Nephew.txt,1250,622,628,0.5024
Chronicles of Narnia. The Silver Chair.txt,1275,620,655,0.513725
Chronicles of Narnia. The Voyage of the Dawn Treader.txt,1203,595,608,0.505403
Fantastic Beasts and Where to Find Them.txt,765,416,349,0.456209
Fantastic Beasts. The Crimes of Grindelwald.txt,761,440,321,0.421813
Fantastic Beasts. The Secrets of Dumbledore.txt,635,360,275,0.433071


### Create frequesncy matrix

In [54]:
docs = pd.DataFrame.from_dict(documents, orient="index")
docs.columns = ['content']

In [55]:
cv = CountVectorizer()
matrix_tf = cv.fit_transform(docs['content'])

matrix_tf

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 6019 stored elements and shape (23, 2503)>

In [56]:
sparsity_tf = 1-(matrix_tf.getnnz())/(matrix_tf.shape[0]*matrix_tf.shape[1])
sparsity_tf

0.8954472024874498

In [57]:
tv = TfidfVectorizer()
matrix_tfidf = tv.fit_transform(docs['content'])
matrix_tfidf

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 6019 stored elements and shape (23, 2503)>

In [58]:
sparsity_tfidf = 1-(matrix_tfidf.getnnz()/(matrix_tfidf.shape[0]*matrix_tfidf.shape[1]))
sparsity_tfidf

0.8954472024874498

### Directory for results

In [60]:
if not os.path.exists("./wordclouds"):
    os.mkdir("./wordclouds")
if not os.path.exists("./topic_modelling"):
    os.mkdir("./topic_modelling")
if not os.path.exists("./topic_modelling/topics"):
    os.mkdir("./topic_modelling/topics")
if not os.path.exists("./topic_modelling/documents"):
    os.mkdir("./topic_modelling/documents")
if not os.path.exists("./clustering"):
    os.mkdir("./clustering")
if not os.path.exists("./ngrams"):
    os.mkdir("./ngrams")

#### Wordclouds

#### Topic modelling

#### Clustering

#### N-grams