# DS-SF-34 | 18 | Natural Language Processing | Codelong | Starter Code

## >>> One-time setup

In [3]:
#''''''
import nltk
nltk.download()
#''''''

pass

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


## <<< One-time setup

## Part A | Tokenization and Stemming

In [2]:
import os

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 10)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 10)

import string
import unicodedata
from nltk import tokenize, corpus, stem

from sklearn import feature_extraction, linear_model, ensemble, model_selection, metrics, decomposition

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

## Tokenization

In [4]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [5]:
def tokenize_text(document):
    document = document.encode('utf-8')

    # Convert text to lowercase
    document = document.lower()

    # Tokenize
    tokens = tokenize.word_tokenize(document)

    # Remove punctuation in tokens and then remove empty tokens
    tokens = [token.translate(None, string.punctuation) for token in tokens]
    tokens = [token for token in tokens if token]

    # Remove stop words
    tokens = [token for token in tokens if not token in corpus.stopwords.words('english')]

    return tokens

In [6]:
tokens = tokenize_text("This is a sentence...  Wait, here's another.  And a third!")

tokens

['sentence', 'wait', 'another', 'third']

## Stemming

In [7]:
class Stemmer:
    stemmer = stem.porter.PorterStemmer()

    @staticmethod
    def stem_tokens(tokens):
        return [Stemmer.stemmer.stem(token) for token in tokens]

In [8]:
tokens = Stemmer.stem_tokens(tokens)

tokens

[u'sentenc', 'wait', u'anoth', 'third']

## Part B | Book reviews

Below, we will be analyzing a partial list of the reviews for J.K. Rowling's The Casual Vacancy.  (https://www.amazon.com/dp/0316228532)  We scrapped this dataset during class 3.

In [9]:
df = pd.read_csv(os.path.join('..', 'datasets', 'dataset-18-reviews.csv'))

In [10]:
df

Unnamed: 0,date,id,author,title,body,star_rating
0,2017-04-21,R3TUANQ2EB3ECB,MichaelMichaels,Skip it. Life is too short.,I've never read any of the Harry Potter books ...,1.0
1,2017-04-20,R2DD03ZZ4218VW,Frans van Wyk,Four Stars,Excellent Read with a lot of real life values ...,4.0
2,2017-04-20,R296NVKLH5QS4W,Sabina Duke,Characters,Hard to keep the characters straight,4.0
3,2017-04-05,R3MP7W8LH6VHU8,Jen Blau,GIVE IT A CHANCE!,I almost put this book down. I'm new to Rowlin...,5.0
4,2017-04-04,RZWP48RKJCXT1,Lilith Eleanor,Frighteningly good,Amazing. Rowling combines fantastic writing wi...,5.0
...,...,...,...,...,...,...
5856,2012-09-27,RT2TE0W92SL67,Tricia K.,Seriously? $17 bucks for a computer file??? ...,Premise sounds dull as dirt. For $17 for a co...,1.0
5857,2012-09-27,R14ZGYPSP9H0Y7,Pretzel,A must read,The depth of character development and storyli...,5.0
5858,2012-09-27,R1913ISIDAGQ1A,Prodigy,I love it,The book was great and I will love to re-read ...,5.0
5859,2012-09-27,R2JY771IW7RI3R,David Katz,Kendle price too expensive,I started to order the kindle edition and than...,5.0


In [11]:
df.drop(['date', 'id', 'author', 'title'],
    axis = 1,
    inplace = True)

In [12]:
df

Unnamed: 0,body,star_rating
0,I've never read any of the Harry Potter books ...,1.0
1,Excellent Read with a lot of real life values ...,4.0
2,Hard to keep the characters straight,4.0
3,I almost put this book down. I'm new to Rowlin...,5.0
4,Amazing. Rowling combines fantastic writing wi...,5.0
...,...,...
5856,Premise sounds dull as dirt. For $17 for a co...,1.0
5857,The depth of character development and storyli...,5.0
5858,The book was great and I will love to re-read ...,5.0
5859,I started to order the kindle edition and than...,5.0


### `NaN`

In [13]:
# TODO
df.isnull().sum()

body           3
star_rating    0
dtype: int64

In [14]:
df.dropna(inplace = True)

### Positive, neutral, and negatives reviews

In [None]:
# TODO
#def polarity(star_rating):
   # if star_rating <= 2.0:
    #    star_rating = -1
    #elif star_rating > 2.0 and < 
 

In [17]:
df['polarity'] = df.star_rating.map({1: -1, 2: -1, 3: 0, 4: 1, 5: 1})

In [18]:
df.polarity.value_counts()

 1    2711
-1    2177
 0     970
Name: polarity, dtype: int64

In [19]:
ns = df.polarity.value_counts()
ns.min()

970

In [20]:
for polarity in [-1, 0, 1]:
    n = ns[polarity] - ns.min()
    index = df[df.polarity == polarity].sample(n = n, random_state = 0).index
    df.drop(index, inplace = True)

In [21]:
df.polarity.value_counts()

 1    970
-1    970
 0    970
Name: polarity, dtype: int64

### Feature matrix and response vector

In [22]:
# TODO
X = df.body
c = df.polarity

### Train/test sets

In [23]:
train_X, test_X, train_c, test_c = model_selection.train_test_split(X, c, stratify = c, train_size = .6, random_state = 0)

### TF-IDF and `TfidfVectorizer`

In [24]:
# TODO
vectorizer = feature_extraction.text.TfidfVectorizer(stop_words = 'english')

In [32]:
class CustomTokenizer(object):
    def __call__(self, document):
        tokens = tokenize_text(document)
        tokens = Stemmer.stem_tokens(tokens)
        return tokens

In [33]:
vectorizer = feature_extraction.text.TfidfVectorizer(tokenizer = CustomTokenizer())

In [34]:
vectorizer.fit(train_X)

TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=<__main__.CustomTokenizer object at 0x1196e1450>,
        use_idf=True, vocabulary=None)

### Bag-of-words

In [35]:
vectorizer.get_feature_names()

['012315',
 '08',
 '1',
 '10',
 '100',
 '1000',
 '1012',
 '105',
 '11',
 '110',
 '12',
 '120',
 '13',
 '130',
 '132',
 '14',
 '142',
 '143',
 '149',
 '1495',
 '1499',
 '15',
 '150',
 '17',
 '170',
 '175',
 '1799',
 '18',
 '1860',
 '18th',
 '1950',
 u'1960',
 '1984',
 '19th',
 '1star',
 '2',
 '20',
 '200',
 '2012',
 '2015',
 '2016',
 '21',
 '21st',
 '23',
 '230am',
 '236',
 '24',
 '25',
 '250',
 '27',
 '28',
 '2nd',
 '3',
 '30',
 '300',
 '3000',
 '31',
 '32',
 '34',
 '35',
 '355',
 '380',
 '3d',
 '3rd',
 '4',
 '40',
 '400',
 '40ish',
 '412star',
 '44',
 '45',
 '450',
 '5',
 '50',
 '500',
 '500th',
 '503',
 '505',
 '50th',
 '56',
 '57',
 '6',
 '60',
 '600',
 '6080',
 '62',
 '6th',
 '7',
 '70',
 '72',
 '75',
 '77',
 '8',
 '80',
 '800',
 '89',
 '90',
 '92',
 '93',
 '98',
 '9997',
 'aand',
 'aback',
 'abandon',
 u'abil',
 'abject',
 u'abl',
 u'abnorm',
 u'abort',
 'abound',
 u'aboutth',
 'abraham',
 'abrupt',
 u'abruptli',
 u'absenc',
 u'absentmindedli',
 u'absolut',
 u'absorb',
 u'absurd',

In [36]:
len(vectorizer.get_feature_names())

4621

### Transformed feature matrix `X`

In [37]:
# TODO
train_X = vectorizer.transform(train_X)
test_X = vectorizer.transform(test_X)

In [39]:
train_X.todense()

matrix([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        ..., 
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

### Machine Learning Modeling

> # TODO...

In [67]:
model = linear_model.LogisticRegression(random_state = 0).fit(train_X, train_c)
model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [49]:
model.score(train_X, train_c)

0.86254295532646053

In [52]:
from sklearn import tree

In [58]:
model = tree.DecisionTreeClassifier(max_depth = 10, min_samples_leaf = 5, random_state = 0,\
                                   ).fit(train_X, train_c)

In [56]:
model.score(train_X, train_c)

0.57731958762886593

In [64]:
from sklearn.ensemble import RandomForestClassifier

In [71]:
model = RandomForestClassifier(n_estimators = 100, max_depth = 10, min_samples_leaf = 5, random_state = 0,\
                                   ).fit(train_X, train_c)
model.score(train_X, train_c)

0.72852233676975942

In [72]:
model.score(test_X, test_c)

0.53608247422680411