# TXT PURIFIER

This notebook was posted by Simon Lindgren // [@simonlindgren](http://www.twitter.com/simonlindgren) // [simonlindgren.com](http://simonlindgren.com)

The following code shows how to purify a list of documents by removing their most common words, revealing the words that truly distinguish the documents from each other.

Required Python packages are `gensim` and `pandas`.

In [None]:
from gensim import corpora, models, similarities
import pandas as pd

## Text input

In [None]:
# Read the input corpus: a txt file with one document per line.
# The `with` block closes the file handle as soon as the lines have been read.
with open("docs.txt", 'r') as f:
    documents = f.readlines() # documents is now a Python list of documents (each keeps its trailing newline)

In [None]:
len(documents) # number of documents loaded (one per line of the input file)

In [None]:
print(documents[0][:100]) # inspect the first 100 characters of the first document

## Stop word removal etc.

In [None]:
# Generic stop-word pass: lowercase each document, split it on whitespace,
# and drop every token that is listed in the stop list.
stoplist = set('your stop words here'.split())
texts = [
    [token for token in raw_doc.lower().split() if token not in stoplist]
    for raw_doc in documents
]

In [None]:
# Second pass with stop words that are specific to this project:
# every already-tokenized document is filtered against this extra list.
stoplist2 = set('another set of stop words here'.split())
texts = [
    [token for token in tokens if token not in stoplist2]
    for tokens in texts
]

In [None]:
# Drop rare words: keep only the tokens that occur at least `min_token_count`
# times, counted across all documents together.
# With the value 3, words that appear only once or twice are removed.
from collections import Counter

min_token_count = 3 # minimum corpus-wide number of occurrences a token needs in order to be kept

# corpus-wide occurrence count for every token
frequency = Counter(token for text in texts for token in text)

texts = [[token for token in text if frequency[token] >= min_token_count] for text in texts]

In [None]:
# Keep only purely alphabetic tokens.
# str.isalpha() is False for any token containing digits, punctuation or
# other non-letter characters, so such tokens are dropped entirely.

texts = [list(filter(str.isalpha, tokens)) for tokens in texts]

In [None]:
# Discard single-character tokens; only words of two or more characters stay.
texts = [[word for word in tokens if len(word) >= 2] for tokens in texts]

In [None]:
print(texts[0][:40]) # see the first 40 tokens of the first tokenized and cleaned document

## tf-idf stuff

In [None]:
# Build a gensim corpus from the cleaned documents so that every word
# can be given a tf-idf score.

dictionary = corpora.Dictionary(texts) # token <-> integer id mapping built from the cleaned documents
corpus = [dictionary.doc2bow(doc) for doc in texts] # one bag-of-words representation per document
tfidf = models.TfidfModel(corpus, id2word = dictionary)
corpus_tfidf = tfidf[corpus] # the corpus with tf-idf weights applied

# Flatten the per-document (token id, tf-idf) pairs into one word -> tf-idf dictionary.
# NOTE: a word that occurs in several documents gets one score per document;
# because dictionary keys are unique, the score from the LAST document that
# contains the word is the one that ends up in `d`.
d = {dictionary.get(token_id): value for doc in corpus_tfidf for token_id, value in doc}

In [None]:
d # display the word -> tf-idf dictionary

In [None]:
# Turn the word -> tf-idf dictionary into a two-column Pandas dataframe,
# ordered from the highest tf-idf score down to the lowest
df = pd.DataFrame(list(d.items()), columns=["word", "tf-idf"])
df = df.sort_values(by="tf-idf", ascending=False)
df

In [None]:
tfidf_threshold = 0.058 # set manually (experiment and iterate)

# keep only the rows whose tf-idf score is strictly above the threshold
above_threshold = df['tf-idf'] > tfidf_threshold
df2 = df.loc[above_threshold]
print('{} are left of {}'.format(len(df2), len(df)))

In [None]:
# Pull the 'word' column out of df2 as a plain Python list:
# these are all the words whose tf-idf is above the threshold.

keep_words = list(df2['word'])

In [None]:
# Keep only the high tf-idf words (those in keep_words) in every document.
# Membership is tested against a set, so each lookup is constant time
# instead of a scan through the whole keep_words list.

keep_word_set = set(keep_words)
texts = [[word for word in text if word in keep_word_set] for text in texts]

## Text output

In [None]:
# 'texts' is a list of token lists at this point;
# joining each token list with single spaces gives back the initial format
# (a list with one string per document)

doc_list = [' '.join(token_list) for token_list in texts]

In [None]:
# remove duplicates, if any, from the doc_list
# The first occurrence of each document is kept and the original document
# order is preserved (a plain set() would return the documents in arbitrary order).

seen = set() # documents already emitted
doc_list2 = []
for doc in doc_list:
    if doc not in seen:
        seen.add(doc)
        doc_list2.append(doc)

print(str(len(doc_list)-len(doc_list2)) + " duplicate documents removed.")
doc_list = doc_list2

In [None]:
# how many documents are left after duplicate removal?
len(doc_list2)

In [None]:
# write the documents as lines to a new txt file
# (a separate file name is used so that the input file 'docs.txt' is not overwritten)
with open('docs_purified.txt', 'w') as outfile:
    for item in doc_list:
        outfile.write("%s\n" % item)