In [1]:
import time
import random
from math import *
import operator
import pandas as pd
import numpy as np
pd.set_option("display.max_columns", 10000)

# import plotting libraries
import matplotlib
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from matplotlib import style
%matplotlib inline 

import seaborn as sns
sns.set(style="white", color_codes=True)
sns.set(font_scale=1.5)

# load make_blobs to simulate data
from sklearn.datasets import make_blobs
from sklearn.datasets import make_classification
from sklearn.datasets import make_regression

# import the ML algorithm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from statsmodels.tools.eval_measures import rmse
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.cluster import KMeans

# For text processing
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import regexp_tokenize
from nltk.tokenize import word_tokenize  
from nltk.tokenize import sent_tokenize 
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


# pre-processing
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing.data import QuantileTransformer
from sklearn.preprocessing import Imputer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV

# import libraries for model validation
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import LeaveOneOut 

# import libraries for metrics and reporting
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import adjusted_rand_score

In [2]:
texts = [
    "Penny bought bright blue fishes. !!",
    "Penny bought bright blue and orange fish.",
    "The cat ate a fish at the store.",
    "Penny went to the store. Penny ate a bug. Penny saw a fish.",
    "It meowed once at the bug, it is still meowing at the bug and the fish",
    "The cat is at the fish store. The cat is orange. The cat is meowing at the fish.",
    "Penny is a fish"
]

In [3]:
# default 
# - n-grams = 1
# - stopwords filterations : false
# - lowercase : yes

# instantiate the count vectorizer
vect_tfidf = TfidfVectorizer()

In [4]:
# train (Bow) 
vect_tfidf.fit(texts)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [5]:
# get all the feature/token names
print("Vocabulary size: {}".format(len(vect_tfidf.vocabulary_)))

feature_names = vect_tfidf.get_feature_names()
print(feature_names)

print("Vocabulary content:\n {}".format(vect_tfidf.vocabulary_))

Vocabulary size: 23
['and', 'at', 'ate', 'blue', 'bought', 'bright', 'bug', 'cat', 'fish', 'fishes', 'is', 'it', 'meowed', 'meowing', 'once', 'orange', 'penny', 'saw', 'still', 'store', 'the', 'to', 'went']
Vocabulary content:
 {'penny': 16, 'bought': 4, 'bright': 5, 'blue': 3, 'fishes': 9, 'and': 0, 'orange': 15, 'fish': 8, 'the': 20, 'cat': 7, 'ate': 2, 'at': 1, 'store': 19, 'went': 22, 'to': 21, 'bug': 6, 'saw': 17, 'it': 11, 'meowed': 12, 'once': 14, 'is': 10, 'still': 18, 'meowing': 13}


In [6]:
# prepare dtm
X_train_tfidf_dtm = vect_tfidf.transform(texts)

In [7]:
print(type(X_train_tfidf_dtm))
print(X_train_tfidf_dtm.toarray())

<class 'scipy.sparse.csr.csr_matrix'>
[[0.         0.         0.         0.4471231  0.4471231  0.4471231
  0.         0.         0.         0.53864679 0.         0.
  0.         0.         0.         0.         0.33181688 0.
  0.         0.         0.         0.         0.        ]
 [0.41245597 0.         0.         0.41245597 0.41245597 0.41245597
  0.         0.         0.23602831 0.         0.         0.
  0.         0.         0.         0.41245597 0.30608987 0.
  0.         0.         0.         0.         0.        ]
 [0.         0.34919976 0.40853218 0.         0.         0.
  0.         0.40853218 0.23378292 0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.34919976 0.60635593 0.         0.        ]
 [0.         0.         0.27821481 0.         0.         0.
  0.27821481 0.         0.15920869 0.         0.         0.
  0.         0.         0.         0.         0.61940238 0.33516389
  0.         0.2378088  0.20646746 0.33516389

In [8]:
# create a dataframe
pd.DataFrame(X_train_tfidf_dtm.toarray(), columns=feature_names)

Unnamed: 0,and,at,ate,blue,bought,bright,bug,cat,fish,fishes,is,it,meowed,meowing,once,orange,penny,saw,still,store,the,to,went
0,0.0,0.0,0.0,0.447123,0.447123,0.447123,0.0,0.0,0.0,0.538647,0.0,0.0,0.0,0.0,0.0,0.0,0.331817,0.0,0.0,0.0,0.0,0.0,0.0
1,0.412456,0.0,0.0,0.412456,0.412456,0.412456,0.0,0.0,0.236028,0.0,0.0,0.0,0.0,0.0,0.0,0.412456,0.30609,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.3492,0.408532,0.0,0.0,0.0,0.0,0.408532,0.233783,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.3492,0.606356,0.0,0.0
3,0.0,0.0,0.278215,0.0,0.0,0.0,0.278215,0.0,0.159209,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.619402,0.335164,0.0,0.237809,0.206467,0.335164,0.335164
4,0.199616,0.341251,0.0,0.0,0.0,0.0,0.399232,0.0,0.114231,0.0,0.170625,0.480953,0.240476,0.199616,0.240476,0.0,0.0,0.0,0.240476,0.0,0.444415,0.0,0.0
5,0.0,0.283715,0.0,0.0,0.0,0.0,0.0,0.497881,0.189942,0.0,0.425573,0.0,0.0,0.16596,0.0,0.16596,0.0,0.0,0.0,0.141858,0.615809,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.451161,0.0,0.673895,0.0,0.0,0.0,0.0,0.0,0.585081,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# N-grams (sets of consecutive words) N=2
# instantiate the count vectorizer
vect_tfidf = TfidfVectorizer(ngram_range=(1, 2))

In [15]:
# train (Bow) 
vect_tfidf.fit(texts)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [16]:
# get all the feature/token names
print("Vocabulary size: {}".format(len(vect_tfidf.vocabulary_)))

feature_names = vect_tfidf.get_feature_names()
print(feature_names)

print("Vocabulary content:\n {}".format(vect_tfidf.vocabulary_))

Vocabulary size: 66
['and', 'and orange', 'and the', 'at', 'at the', 'ate', 'ate bug', 'ate fish', 'blue', 'blue and', 'blue fishes', 'bought', 'bought bright', 'bright', 'bright blue', 'bug', 'bug and', 'bug it', 'bug penny', 'cat', 'cat ate', 'cat is', 'fish', 'fish at', 'fish store', 'fishes', 'is', 'is at', 'is fish', 'is meowing', 'is orange', 'is still', 'it', 'it is', 'it meowed', 'meowed', 'meowed once', 'meowing', 'meowing at', 'once', 'once at', 'orange', 'orange fish', 'orange the', 'penny', 'penny ate', 'penny bought', 'penny is', 'penny saw', 'penny went', 'saw', 'saw fish', 'still', 'still meowing', 'store', 'store penny', 'store the', 'the', 'the bug', 'the cat', 'the fish', 'the store', 'to', 'to the', 'went', 'went to']
Vocabulary content:
 {'penny': 44, 'bought': 11, 'bright': 13, 'blue': 8, 'fishes': 25, 'penny bought': 46, 'bought bright': 12, 'bright blue': 14, 'blue fishes': 10, 'and': 0, 'orange': 41, 'fish': 22, 'blue and': 9, 'and orange': 1, 'orange fish': 42,

In [17]:
# prepare dtm
X_train_tfidf_dtm = vect_tfidf.transform(texts)

In [18]:
# create a dataframe
pd.DataFrame(X_train_tfidf_dtm.toarray(), columns=feature_names)

Unnamed: 0,and,and orange,and the,at,at the,ate,ate bug,ate fish,blue,blue and,blue fishes,bought,bought bright,bright,bright blue,bug,bug and,bug it,bug penny,cat,cat ate,cat is,fish,fish at,fish store,fishes,is,is at,is fish,is meowing,is orange,is still,it,it is,it meowed,meowed,meowed once,meowing,meowing at,once,once at,orange,orange fish,orange the,penny,penny ate,penny bought,penny is,penny saw,penny went,saw,saw fish,still,still meowing,store,store penny,store the,the,the bug,the cat,the fish,the store,to,to the,went,went to
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.325243,0.0,0.391819,0.325243,0.325243,0.325243,0.325243,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.391819,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.241368,0.0,0.325243,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.274907,0.331179,0.0,0.0,0.0,0.0,0.0,0.0,0.274907,0.331179,0.0,0.274907,0.274907,0.274907,0.274907,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.157316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.274907,0.331179,0.0,0.204013,0.0,0.274907,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.236378,0.236378,0.276541,0.0,0.333148,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.276541,0.333148,0.0,0.158251,0.333148,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.236378,0.0,0.0,0.410451,0.0,0.276541,0.0,0.276541,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.192518,0.231926,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.192518,0.0,0.0,0.231926,0.0,0.0,0.0,0.110169,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.428612,0.231926,0.0,0.0,0.231926,0.231926,0.231926,0.231926,0.0,0.0,0.164558,0.231926,0.0,0.142871,0.0,0.0,0.0,0.192518,0.231926,0.231926,0.231926,0.231926
4,0.143024,0.0,0.172301,0.244505,0.244505,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.286049,0.172301,0.172301,0.0,0.0,0.0,0.0,0.081846,0.0,0.0,0.0,0.122252,0.0,0.0,0.0,0.0,0.172301,0.344601,0.172301,0.172301,0.172301,0.172301,0.143024,0.143024,0.172301,0.172301,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.172301,0.172301,0.0,0.0,0.0,0.318422,0.344601,0.0,0.143024,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.197401,0.197401,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.346413,0.0,0.417322,0.132157,0.0,0.139107,0.0,0.296102,0.139107,0.0,0.139107,0.139107,0.0,0.0,0.0,0.0,0.0,0.0,0.115471,0.115471,0.0,0.0,0.115471,0.0,0.139107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.098701,0.0,0.139107,0.428464,0.0,0.346413,0.230942,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.26942,0.0,0.0,0.0,0.402431,0.0,0.56718,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.349394,0.0,0.0,0.56718,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# N=3
# instantiate the count vectorizer
vect_tfidf = TfidfVectorizer(ngram_range=(1, 3))

In [20]:
# train (Bow) 
vect_tfidf.fit(texts)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [21]:
# get all the feature/token names
print("Vocabulary size: {}".format(len(vect_tfidf.vocabulary_)))

feature_names = vect_tfidf.get_feature_names()
print(feature_names)

print("Vocabulary content:\n {}".format(vect_tfidf.vocabulary_))

Vocabulary size: 112
['and', 'and orange', 'and orange fish', 'and the', 'and the fish', 'at', 'at the', 'at the bug', 'at the fish', 'at the store', 'ate', 'ate bug', 'ate bug penny', 'ate fish', 'ate fish at', 'blue', 'blue and', 'blue and orange', 'blue fishes', 'bought', 'bought bright', 'bought bright blue', 'bright', 'bright blue', 'bright blue and', 'bright blue fishes', 'bug', 'bug and', 'bug and the', 'bug it', 'bug it is', 'bug penny', 'bug penny saw', 'cat', 'cat ate', 'cat ate fish', 'cat is', 'cat is at', 'cat is meowing', 'cat is orange', 'fish', 'fish at', 'fish at the', 'fish store', 'fish store the', 'fishes', 'is', 'is at', 'is at the', 'is fish', 'is meowing', 'is meowing at', 'is orange', 'is orange the', 'is still', 'is still meowing', 'it', 'it is', 'it is still', 'it meowed', 'it meowed once', 'meowed', 'meowed once', 'meowed once at', 'meowing', 'meowing at', 'meowing at the', 'once', 'once at', 'once at the', 'orange', 'orange fish', 'orange the', 'orange the c

In [22]:
# prepare dtm
X_train_tfidf_dtm = vect_tfidf.transform(texts)

In [20]:
# create a dataframe
pd.DataFrame(X_train_tfidf_dtm.toarray(), columns=feature_names)

Unnamed: 0,and,and orange,and orange fish,and the,and the fish,at,at the,at the bug,at the fish,at the store,ate,ate bug,ate bug penny,ate fish,ate fish at,blue,blue and,blue and orange,blue fishes,bought,bought bright,bought bright blue,bright,bright blue,bright blue and,bright blue fishes,bug,bug and,bug and the,bug it,bug it is,bug penny,bug penny saw,cat,cat ate,cat ate fish,cat is,cat is at,cat is meowing,cat is orange,fish,fish at,fish at the,fish store,fish store the,fishes,is,is at,is at the,is fish,is meowing,is meowing at,is orange,is orange the,is still,is still meowing,it,it is,it is still,it meowed,it meowed once,meowed,meowed once,meowed once at,meowing,meowing at,meowing at the,once,once at,once at the,orange,orange fish,orange the,orange the cat,penny,penny ate,penny ate bug,penny bought,penny bought bright,penny is,penny is fish,penny saw,penny saw fish,penny went,penny went to,saw,saw fish,still,still meowing,still meowing at,store,store penny,store penny ate,store the,store the cat,the,the bug,the bug and,the bug it,the cat,the cat ate,the cat is,the fish,the fish store,the store,the store penny,to,to the,to the store,went,went to,went to the
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.278373,0.0,0.0,0.335355,0.278373,0.278373,0.278373,0.278373,0.278373,0.0,0.335355,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.335355,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.206585,0.0,0.0,0.278373,0.278373,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.225958,0.27221,0.27221,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.225958,0.27221,0.27221,0.0,0.225958,0.225958,0.225958,0.225958,0.225958,0.27221,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.129305,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.225958,0.27221,0.0,0.0,0.167687,0.0,0.0,0.225958,0.225958,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.189562,0.189562,0.0,0.0,0.267166,0.22177,0.0,0.0,0.267166,0.267166,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.22177,0.267166,0.267166,0.0,0.0,0.0,0.0,0.126908,0.267166,0.267166,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.189562,0.0,0.0,0.0,0.0,0.329158,0.0,0.0,0.0,0.22177,0.267166,0.0,0.0,0.0,0.22177,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15803,0.190378,0.190378,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15803,0.0,0.0,0.0,0.0,0.190378,0.190378,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090433,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.351829,0.190378,0.190378,0.0,0.0,0.0,0.0,0.190378,0.190378,0.190378,0.190378,0.190378,0.190378,0.0,0.0,0.0,0.135079,0.190378,0.190378,0.0,0.0,0.117276,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15803,0.190378,0.190378,0.190378,0.190378,0.190378,0.190378,0.190378
4,0.118135,0.0,0.0,0.142316,0.142316,0.201955,0.201955,0.284632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.236269,0.142316,0.142316,0.142316,0.142316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.067603,0.0,0.0,0.0,0.0,0.0,0.100978,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142316,0.142316,0.284632,0.142316,0.142316,0.142316,0.142316,0.142316,0.142316,0.142316,0.118135,0.118135,0.118135,0.142316,0.142316,0.142316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142316,0.142316,0.142316,0.0,0.0,0.0,0.0,0.0,0.263009,0.284632,0.142316,0.142316,0.0,0.0,0.0,0.118135,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.16346,0.16346,0.0,0.230378,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.28685,0.0,0.0,0.345567,0.115189,0.115189,0.115189,0.109434,0.0,0.0,0.115189,0.115189,0.0,0.24519,0.115189,0.115189,0.0,0.115189,0.115189,0.115189,0.115189,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.095617,0.095617,0.095617,0.0,0.0,0.0,0.095617,0.0,0.115189,0.115189,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.08173,0.0,0.0,0.115189,0.115189,0.354793,0.0,0.0,0.0,0.28685,0.0,0.345567,0.191233,0.115189,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.23435,0.0,0.0,0.0,0.0,0.0,0.350047,0.0,0.0,0.49335,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.303913,0.0,0.0,0.0,0.0,0.49335,0.49335,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Min_df

# Min_df ignores terms that have a document frequency (presence in % of documents) strictly lower 
# than the given threshold. 
# For example, Min_df=0.66 requires that a term appear in 66% of the docuemnts for it to be considered 
# part of the vocabulary.

In [22]:
# Sometimes min_df is used to limit the vocabulary size, so it learns only those terms that appear 
# in at least 10%, 20%, etc. of the documents.

In [23]:
# instantiate the tfidf vectorizer
vect_tfidf = TfidfVectorizer(ngram_range=(1, 1), max_df=1.0, min_df=0.2, max_features=None)

In [24]:
# train (Bow) 
vect_tfidf.fit(texts)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=0.2,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [25]:
# get all the feature/token names
print("Vocabulary size: {}".format(len(vect_tfidf.vocabulary_)))

feature_names = vect_tfidf.get_feature_names()
print(feature_names)

print("Vocabulary content:\n {}".format(vect_tfidf.vocabulary_))

Vocabulary size: 15
['and', 'at', 'ate', 'blue', 'bought', 'bright', 'bug', 'cat', 'fish', 'is', 'meowing', 'orange', 'penny', 'store', 'the']
Vocabulary content:
 {'penny': 12, 'bought': 4, 'bright': 5, 'blue': 3, 'and': 0, 'orange': 11, 'fish': 8, 'the': 14, 'cat': 7, 'ate': 2, 'at': 1, 'store': 13, 'bug': 6, 'is': 9, 'meowing': 10}


In [26]:
# Max_df

# When building the vocabulary, it ignores terms that have a document frequency strictly higher 
# than the given threshold. 

# This could be used to exclude terms that are too frequent and are 
# unlikely to help predict the label. 

# For example, by analyzing reviews on the movie Lion King, 
# the term 'Lion' might appear in 90% of the reviews (documents), in which case, we could 
# consider establishing Max_df=0.89

In [28]:
# instantiate the tfidf vectorizer
vect_tfidfcv = TfidfVectorizer(ngram_range=(1, 1), max_df=0.5, min_df=0.2, max_features=None)

In [29]:
# train (Bow) 
vect_tfidfcv.fit(texts)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=None, min_df=0.2,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [30]:
# get all the feature/token names
print("Vocabulary size: {}".format(len(vect_tfidfcv.vocabulary_)))

feature_names = vect_tfidfcv.get_feature_names()
print(feature_names)

print("Vocabulary content:\n {}".format(vect_tfidfcv.vocabulary_))

Vocabulary size: 12
['and', 'at', 'ate', 'blue', 'bought', 'bright', 'bug', 'cat', 'is', 'meowing', 'orange', 'store']
Vocabulary content:
 {'bought': 4, 'bright': 5, 'blue': 3, 'and': 0, 'orange': 10, 'cat': 7, 'ate': 2, 'at': 1, 'store': 11, 'bug': 6, 'is': 8, 'meowing': 9}


In [31]:
# Max_features

# Limit the amount of features (vocabulary) that the vectorizer will learn

In [33]:
# instantiate the tfidf vectorizer
vect_tfidf = TfidfVectorizer(ngram_range=(1, 1), max_df=0.5, min_df=0.2, max_features=6)

In [34]:
# train (Bow) 
vect_tfidf.fit(texts)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=6, min_df=0.2,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [35]:
# get all the feature/token names
print("Vocabulary size: {}".format(len(vect_tfidf.vocabulary_)))

feature_names = vect_tfidf.get_feature_names()
print(feature_names)

print("Vocabulary content:\n {}".format(vect_tfidf.vocabulary_))

Vocabulary size: 6
['and', 'at', 'bug', 'cat', 'is', 'store']
Vocabulary content:
 {'and': 0, 'cat': 3, 'at': 1, 'store': 5, 'bug': 2, 'is': 4}


In [37]:
# stopwords

# instantiate the count vectorizer
vect_tfidf = TfidfVectorizer(ngram_range=(1, 1), stop_words='english', max_features=None)

In [38]:
# train (Bow) 
vect_tfidf.fit(texts)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [39]:
# get all the feature/token names
print("Vocabulary size: {}".format(len(vect_tfidf.vocabulary_)))

feature_names = vect_tfidf.get_feature_names()
print(feature_names)

print("Vocabulary content:\n {}".format(vect_tfidf.vocabulary_))

Vocabulary size: 15
['ate', 'blue', 'bought', 'bright', 'bug', 'cat', 'fish', 'fishes', 'meowed', 'meowing', 'orange', 'penny', 'saw', 'store', 'went']
Vocabulary content:
 {'penny': 11, 'bought': 2, 'bright': 3, 'blue': 1, 'fishes': 7, 'orange': 10, 'fish': 6, 'cat': 5, 'ate': 0, 'store': 13, 'went': 14, 'bug': 4, 'saw': 12, 'meowed': 8, 'meowing': 9}


In [40]:
# notice the lack of stemming .. fish and fishes, meowed	meowing

# CountVectorizer can 
# - lowercase letters, 
# - disregard punctuation and 
# - stopwords

# but it can't LEMMATIZE or STEM

In [41]:
# create the stemmer object
porter_stemmer = PorterStemmer()
# print(porter_stemmer.stem("fish"))
# print(porter_stemmer.stem("fishes"))
# print(porter_stemmer.stem("meowed"))
# print(porter_stemmer.stem("meowing"))

In [42]:
# Use NLTK's PorterStemmer
def stemming_tokenizer(str_input):
    words = re.sub(r"[^A-Za-z0-9\-]", " ", str_input).lower().split()
    words = [porter_stemmer.stem(word) for word in words]
    return words

In [45]:
# instantiate the count vectorizer
vect_tfidf = TfidfVectorizer(ngram_range=(1, 1), stop_words='english', tokenizer=stemming_tokenizer, max_features=None)

In [46]:
# train (Bow) 
vect_tfidf.fit(texts)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function stemming_tokenizer at 0x00000228997039D8>,
        use_idf=True, vocabulary=None)

In [47]:
# get all the feature/token names
print("Vocabulary size: {}".format(len(vect_tfidf.vocabulary_)))

feature_names = vect_tfidf.get_feature_names()
print(feature_names)

print("Vocabulary content:\n {}".format(vect_tfidf.vocabulary_))

Vocabulary size: 14
['ate', 'blue', 'bought', 'bright', 'bug', 'cat', 'fish', 'meow', 'onc', 'orang', 'penni', 'saw', 'store', 'went']
Vocabulary content:
 {'penni': 10, 'bought': 2, 'bright': 3, 'blue': 1, 'fish': 6, 'orang': 9, 'cat': 5, 'ate': 0, 'store': 12, 'went': 13, 'bug': 4, 'saw': 11, 'meow': 7, 'onc': 8}


In [48]:
# prepare dtm
X_train_tfidf_dtm = vect_tfidf.transform(texts)

In [49]:
# create a dataframe
pd.DataFrame(X_train_tfidf_dtm.toarray(), columns=feature_names)

Unnamed: 0,ate,blue,bought,bright,bug,cat,fish,meow,onc,orang,penni,saw,store,went
0,0.0,0.512612,0.512612,0.512612,0.0,0.0,0.258786,0.0,0.0,0.0,0.380417,0.0,0.0,0.0
1,0.0,0.45617,0.45617,0.45617,0.0,0.0,0.230292,0.0,0.0,0.45617,0.33853,0.0,0.0,0.0
2,0.578752,0.0,0.0,0.0,0.0,0.578752,0.292176,0.0,0.0,0.0,0.0,0.0,0.494698,0.0
3,0.303663,0.0,0.0,0.0,0.303663,0.0,0.153301,0.0,0.0,0.0,0.676058,0.365821,0.259561,0.365821
4,0.0,0.0,0.0,0.0,0.641958,0.0,0.162043,0.641958,0.386682,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.840166,0.282766,0.280055,0.0,0.280055,0.0,0.0,0.239382,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.562463,0.0,0.0,0.0,0.826823,0.0,0.0,0.0
