## IMPORTED PACKAGES ##

In [None]:
import os
import json
import string
import pandas as pd
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import PlaintextCorpusReader , stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Create document corpus #

In [None]:
# Root folder of the corpus; every file whose name matches the regex below
# (i.e. ends in ".txt") is exposed through the corpus reader.
corpus_dir = "./Literature-original"
corpus = PlaintextCorpusReader(corpus_dir, r".*\.txt")

# Identifiers of the files picked up by the reader (displayed as cell output).
files_names = corpus.fileids()
files_names


### Corpus documents preprocessing

In [None]:
# Map each corpus file id to its raw, unprocessed text.
documents = {file_name: corpus.raw(file_name) for file_name in files_names}
print(json.dumps(documents, indent=4, ensure_ascii=False))

In [None]:
# Token count of every document before any cleaning, stored under "pre".
lengths = {
    file_name: {"pre": len(word_tokenize(raw_text))}
    for file_name, raw_text in documents.items()
}
print(json.dumps(lengths, indent=4, ensure_ascii=False))

In [None]:
# Porter stemmer instance, reused for every token in the preprocessing cell.
ps = PorterStemmer()

In [None]:
# Normalise every document in place:
#   1. lowercase,
#   2. drop ASCII punctuation and any digit characters,
#   3. Porter-stem each token,
#   4. drop English stopwords.
#
# The stopword list is loaded ONCE into a set. Previously it was re-read and
# copied into a list for every single token, which dominated the run time.
english_stopwords = set(stopwords.words('english'))
punctuation_chars = set(string.punctuation)

# NOTE(review): stopwords are filtered AFTER stemming (original order kept so
# results do not change). Stems such as "thi" (from "this") no longer match
# the stopword list and therefore survive; consider filtering before stemming.
for file_name in documents:
    text = documents[file_name].lower()

    # Single pass removing punctuation and digits (characters are deleted,
    # not replaced by a space, so "well-known" becomes "wellknown").
    text = "".join(
        char for char in text
        if char not in punctuation_chars and not char.isdigit()
    )

    stemmed_text = " ".join(ps.stem(word) for word in word_tokenize(text))

    # The stemmed text is tokenised again, exactly as before, so the token
    # boundaries used for stopword filtering are unchanged.
    documents[file_name] = " ".join(
        word for word in word_tokenize(stemmed_text)
        if word not in english_stopwords
    )
print(json.dumps(documents, indent=4, ensure_ascii=False))

In [None]:
# Token count of every document after cleaning, stored next to "pre" as "post".
for file_name, clean_text in documents.items():
    lengths[file_name]["post"] = len(word_tokenize(clean_text))
print(json.dumps(lengths, indent=4, ensure_ascii=False))



In [None]:
# Turn the nested dict into a table: one row per file, one column per inner
# key ("pre", "post"). Note that this rebinds `lengths` from dict to DataFrame.
lengths = pd.DataFrame.from_dict(lengths, orient="index")

In [None]:
# Number of tokens removed by preprocessing, per document.
lengths["diff"] = lengths["pre"].sub(lengths["post"])
# Share of the original tokens that was removed (a 0-1 fraction, not a percent).
lengths['pct'] = lengths["diff"].div(lengths["pre"])
lengths

### Create frequency matrix


In [None]:
# One row per document (indexed by file id); the single column holds the
# preprocessed text and is named "content" for the vectorizers below.
docs = pd.DataFrame.from_dict(documents, orient="index")
docs.columns = ["content"]
docs

In [None]:
# Fit a default CountVectorizer on the preprocessed texts and build the
# document-term count matrix (rows follow the order of `docs`).
cv = CountVectorizer()
matrix_tf = cv.fit_transform(docs["content"])
matrix_tf

In [None]:
# Sparsity = 1 - (stored non-zero entries / total number of cells).
tf_rows, tf_cols = matrix_tf.shape
tf_density = matrix_tf.getnnz() / (tf_rows * tf_cols)
sparsity_tf = 1 - tf_density
sparsity_tf

In [None]:
# Fit a default TfidfVectorizer on the same preprocessed texts and build the
# TF-IDF weighted document-term matrix.
tv =TfidfVectorizer()
matrix_tfidf = tv.fit_transform(docs["content"])
matrix_tfidf

In [None]:
# Sparsity = 1 - (stored non-zero entries / total number of cells).
tfidf_rows, tfidf_cols = matrix_tfidf.shape
tfidf_density = matrix_tfidf.getnnz() / (tfidf_rows * tfidf_cols)
sparsity_tfidf = 1 - tfidf_density
sparsity_tfidf

### Directories for results



In [None]:
# Output folders for each analysis step. Parents are listed before their
# children for readability, although os.makedirs would create them anyway.
RESULT_DIRS = (
    "./wordclouds",                 # wordclouds
    "./topic_modeling",             # topic modeling
    "./topic_modeling/topics",
    "./topic_modeling/documents",
    "./clustering",                 # clustering
    "./ngrams",                     # ngrams
)

# exist_ok=True makes the cell safely re-runnable and avoids the
# check-then-create race of `if not os.path.exists(...): os.mkdir(...)`.
# If a path exists but is a regular file, this raises FileExistsError
# instead of silently continuing without the directory.
for result_dir in RESULT_DIRS:
    os.makedirs(result_dir, exist_ok=True)


### Wordclouds


### Topic modeling


### Clustering

### N-grams
