In [None]:
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer,porter
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

import string
import pandas as pd
import numpy as np
import re

from IPython.display import display
import matplotlib.pyplot as plt

# Define pre-processing functions for text

In [None]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object exposing ``components_``
        Each row of ``components_`` is treated as one topic's term weights.
    feature_names : sequence of str
        Term for each column index of ``components_``.
    n_top_words : int
        How many terms to list per topic, strongest first.

    For each topic a ``Topic #<idx>:`` header line is printed, followed by a
    comma-separated line of terms. One blank line is printed after all topics.
    """
    for topic_number, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        strongest_first = weights.argsort()[::-1][:n_top_words]
        top_terms = [feature_names[col] for col in strongest_first]
        print("Topic #%d:" % topic_number)
        print(",".join(top_terms))
    print()

def top_words(model, feature_names, n_top_words):
    """Return the highest-weighted terms of every topic as a DataFrame.

    Parameters
    ----------
    model : object exposing ``components_``
        Each row of ``components_`` is treated as one topic's term weights.
    feature_names : sequence of str
        Term for each column index of ``components_``.
    n_top_words : int
        How many terms to keep per topic, strongest first.

    Returns
    -------
    pandas.DataFrame
        Columns ``topic`` (row index within ``components_``) and ``terms``
        (comma-separated top terms), one row per topic.
    """
    topics = []
    terms = []
    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending; the reversed slice keeps the n largest weights.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        topics.append(topic_idx)
        terms.append(','.join(feature_names[i] for i in top_indices))
    # The notebook imports pandas as ``pd``; the bare name ``pandas`` is not
    # bound, so it must not be used here.
    return pd.DataFrame({'topic': topics, 'terms': terms}, columns=['topic', 'terms'])

def noPunct(txt):
    """Return ``txt`` with ASCII punctuation and the curly quotes ’ ” “ removed."""
    unwanted = string.punctuation + '’' + '”' + '“'
    # Keep every character that is not in the unwanted set, in original order.
    return ''.join(ch for ch in txt if ch not in unwanted)

def noNumber(txt):
    """Return ``txt`` with numbers removed.

    A number is a run of digits with an optional decimal point and further
    digits. Any ``$`` signs or whitespace directly before it are removed too.
    """
    number_pattern = re.compile(r'[$\s]*\d+\.?\d*')
    return number_pattern.sub('', txt)

def addNegation(x):
    """Mark the words that follow "not " with a ``not_`` prefix.

    The text is searched case-insensitively for ``"not "`` followed by at
    least one character and then a punctuation mark. The match is greedy, so
    the span runs to the last punctuation mark that can close it. Every
    space-separated piece of that span is prefixed with ``not_``. The leading
    "not " and the closing punctuation mark are dropped from the result.

    Parameters
    ----------
    x : str
        Text to transform.

    Returns
    -------
    str
        ``x`` unchanged when nothing matches. Otherwise ``x`` with every
        occurrence of the matched text replaced by the prefixed words.
    """
    # re.escape keeps every punctuation mark literal inside the character
    # class. Concatenating string.punctuation raw made "\]" parse as an
    # escaped bracket, which silently left the backslash out of the set.
    pattern = r'not (.+)[' + re.escape(string.punctuation) + r']'
    match = re.search(pattern, x, flags=re.IGNORECASE)
    if match is None:
        return x
    negated = ' '.join('not_' + word for word in match.group(1).split(' '))
    return x.replace(match.group(), negated)

def LemmaTokenizer(doc):
    """Tokenize ``doc`` and return its cleaned WordNet lemmas.

    Each token from ``word_tokenize`` is lemmatized, then stripped of
    punctuation (``noPunct``) and numbers (``noNumber``). Tokens that end up
    empty are dropped. Negation marking via ``addNegation`` is not applied.
    """
    lemmatizer = WordNetLemmatizer()
    cleaned = []
    for token in word_tokenize(doc):
        lemma = noNumber(noPunct(lemmatizer.lemmatize(token)))
        if lemma:  # skip tokens that were only punctuation or digits
            cleaned.append(lemma)
    return cleaned

def StemmerTokenizer(doc):
    """Tokenize ``doc`` and return its cleaned Porter stems.

    Each token from ``word_tokenize`` is stemmed, then stripped of
    punctuation (``noPunct``) and numbers (``noNumber``). Tokens that end up
    empty are dropped. Negation marking via ``addNegation`` is not applied.
    """
    stemmer = porter.PorterStemmer()
    stems = (stemmer.stem(word) for word in word_tokenize(doc))
    scrubbed = (noNumber(noPunct(stem)) for stem in stems)
    # Only non-empty results survive the cleaning step.
    return [piece for piece in scrubbed if len(piece) > 0]

# Stop-word list: NLTK's English stop words followed by extra terms
# (presumably corpus-specific words and tokenizer fragments -- confirm).
extra_stopwords = stopwords.words('english') + [
    'company', 'would', 'get', 'school', 'student', 'sally', 'salliemae',
    'im', 'sallie', 'mae', 'loan', 'nt', 'wo', 'wa', 'tg', 'va', 'ca', 'mo',
    'le', 'ha', 'sm', 'itt', 'k', 'smae',
]
# Also cover the 'not_'-prefixed form of each stop word, the shape that
# addNegation gives to negated words.
extra_stopwords += ['not_' + word for word in extra_stopwords]

In [8]:
# Load the training reviews. header=None makes pandas treat the first line as
# data, so the column names are supplied explicitly.
# The dtypes use the builtins int/str: the np.int / np.str aliases they replace
# were deprecated in NumPy 1.20 and removed in 1.24.
train = pd.read_csv(
    "train.csv",
    header=None,
    names=["score", "review"],
    dtype={"score": int, "review": str},
)
train.head()

Unnamed: 0,score,review
0,5,dr. goldberg offers everything i look for in a...
1,2,"Unfortunately, the frustration of being Dr. Go..."
2,4,Been going to Dr. Goldberg for over 10 years. ...
3,4,Got a letter in the mail last week that said D...
4,1,I don't know what Dr. Goldberg was like before...
