In [67]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from collections import Counter
import string
import re

In [63]:
# Fetch NLTK's stopword corpus; stopwords.words('english') below reads from it.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/muciyu/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [86]:
# List the sample documents. The directory is named 'text subset' (with a
# space), so a glob is used here instead of quoting the path.
!ls data/train_test/files/text*subset

1938.txt 1940.txt 348.txt  381.txt  385.txt  485.txt
1939.txt 347.txt  349.txt  384.txt  483.txt  486.txt


### Read Text + Lowercase All Characters + Remove Numbers and Punctuation

In [112]:
# Load one sample document and normalise it into lowercase word tokens.
file_path = 'data/train_test/files/text subset/'
file = file_path +'384.txt'

# Context manager closes the handle as soon as the text has been read
# (the bare open() used before left the file open for the kernel's lifetime).
with open(file, 'r') as shakes:
    # Drop "-\n" so words hyphenated across a line break are re-joined,
    # then turn the remaining newlines into spaces.
    text = shakes.read().replace("-\n", "").replace("\n", " ")
lowers = text.lower()

# Remove digit runs before tokenizing so numbers never become tokens.
no_num = re.sub(r'\d+', '', lowers)

# Tokens are maximal runs of word characters; punctuation and whitespace
# only act as separators and are discarded.
tokenizer = RegexpTokenizer(r'\w+')
text_tokens = tokenizer.tokenize(no_num)

### Remove Stopwords

In [113]:
# Extra stopwords chosen by hand, applied on top of NLTK's English list.
manual_stopwords = ['www', 'org', 'vol']

# Build the lookup once as a set. The previous form re-created the combined
# list and scanned it linearly for every single token.
stopword_set = set(stopwords.words('english')) | set(manual_stopwords)
filtered = [w for w in text_tokens if w not in stopword_set]

# Keep only tokens longer than two characters.
filtered2 = [c for c in filtered if len(c) > 2]

# Frequency table of the surviving tokens; show the 100 most common.
count = Counter(filtered2)
print (count.most_common(100))

[('vitamin', 106), ('folate', 76), ('acid', 53), ('folic', 46), ('serum', 46), ('status', 38), ('deficiency', 31), ('high', 31), ('fortification', 29), ('anemia', 25), ('cognitive', 23), ('low', 21), ('normal', 21), ('intake', 19), ('homocysteine', 18), ('pmol', 17), ('interaction', 16), ('concentrations', 16), ('impairment', 15), ('nmol', 15), ('clin', 14), ('nutr', 14), ('subjects', 13), ('data', 12), ('nhanes', 12), ('among', 12), ('selhub', 11), ('mean', 11), ('food', 11), ('health', 10), ('nutrition', 10), ('mma', 10), ('effect', 10), ('relation', 9), ('pernicious', 9), ('plasma', 9), ('prevalence', 9), ('participants', 9), ('survey', 8), ('jacques', 7), ('rosenberg', 7), ('category', 7), ('iii', 7), ('suppl', 7), ('thf', 7), ('furthermore', 7), ('results', 7), ('although', 7), ('elderly', 6), ('national', 6), ('methylmalonic', 6), ('function', 6), ('circulating', 6), ('associated', 6), ('blood', 6), ('also', 6), ('liver', 6), ('extract', 6), ('factor', 6), ('neurologic', 6), ('ex

### Stemming

In [114]:
from nltk.stem.porter import *

def stem_tokens(tokens, stemmer):
    """Return a new list with ``stemmer.stem`` applied to each token, in order.

    tokens: iterable of tokens to stem
    stemmer: any object exposing a ``stem(token)`` method
    """
    return [stemmer.stem(token) for token in tokens]

# Stem every filtered token with NLTK's Porter stemmer, then print the
# 100 most frequent stems.
stemmer = PorterStemmer()
stemmed = stem_tokens(tokens=filtered2, stemmer=stemmer)
count = Counter(stemmed)
print(count.most_common(100))

[('vitamin', 108), ('folat', 77), ('acid', 53), ('folic', 46), ('serum', 46), ('statu', 38), ('defici', 37), ('high', 31), ('fortif', 29), ('anemia', 27), ('cognit', 26), ('normal', 22), ('intak', 21), ('low', 21), ('impair', 19), ('interact', 18), ('concentr', 18), ('homocystein', 18), ('pmol', 17), ('nmol', 15), ('increas', 14), ('clin', 14), ('nutr', 14), ('effect', 14), ('subject', 13), ('relat', 12), ('data', 12), ('nhane', 12), ('among', 12), ('food', 12), ('selhub', 11), ('use', 11), ('nutrit', 11), ('mean', 11), ('particip', 11), ('result', 11), ('consequ', 11), ('health', 10), ('mma', 10), ('pernici', 9), ('plasma', 9), ('preval', 9), ('associ', 9), ('supplement', 9), ('suggest', 8), ('survey', 8), ('categori', 8), ('function', 8), ('factor', 8), ('jacqu', 7), ('rosenberg', 7), ('report', 7), ('iii', 7), ('suppl', 7), ('thf', 7), ('furthermor', 7), ('studi', 7), ('although', 7), ('neurolog', 7), ('advers', 6), ('affect', 6), ('elderli', 6), ('observ', 6), ('nation', 6), ('meth

### Putting things together

In [99]:
def gen_token(file, manual_stopwords = ['www', 'org', 'vol']):
    '''
    Read a text file and convert it into a list of cleaned, stemmed tokens.

    INPUT
    file: path of a text file
    manual_stopwords: additional stopwords, set manually
                      (only read here, never mutated)

    Output:
    Preprocessed and tokenized text: a list of tokens that were lowercased,
    stripped of digits, split on non-word characters, filtered against the
    NLTK English stopwords plus manual_stopwords, restricted to tokens
    longer than two characters, and finally Porter-stemmed.
    '''

    #read text file; the context manager guarantees the handle is closed
    with open(file, 'r') as raw:
        # drop "-\n" to re-join words hyphenated across a line break,
        # then flatten the remaining newlines to spaces
        text = raw.read().replace("-\n", "").replace("\n", " ")

    #lower all letters
    lowers = text.lower()

    #remove all numbers
    no_num = re.sub(r'\d+', '', lowers)

    #tokenize text: tokens are maximal runs of word characters
    tokenizer = RegexpTokenizer(r'\w+')
    text_tokens = tokenizer.tokenize(no_num)

    #remove stopwords; build the lookup once instead of once per token
    stopword_set = set(stopwords.words('english')) | set(manual_stopwords)
    filtered = [w for w in text_tokens if w not in stopword_set]
    filtered2 = [c for c in filtered if len(c) > 2]

    #stemming
    stemmer = PorterStemmer()
    stemmed = stem_tokens(filtered2, stemmer)

    # the original body stopped here and implicitly returned None
    return stemmed

('data/train_test/files/text subset/', [], ['1939.txt', '1938.txt', '349.txt', '348.txt', '381.txt', '385.txt', '347.txt', '384.txt', '1940.txt', '486.txt', '485.txt', '483.txt'])


In [101]:
# Display the directory prefix assigned in the "Read Text" cell above.
file_path

'data/train_test/files/text subset/'