# Tools, technologies, & techniques featured in this notebook
- NLP preprocessing, clustering, 

In [11]:
import numpy as np
import pandas as pd
from numpy.linalg import svd
# import string

import matplotlib.pyplot as plt
%matplotlib inline

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
# from nltk.stem.porter import PorterStemmer
# from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.cluster import KMeans 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

from sklearn.model_selection import train_test_split, KFold
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import confusion_matrix

## Preprocessing functions and methods

In [12]:
wordnet = WordNetLemmatizer()
# porter = PorterStemmer()
# snowball = SnowballStemmer('english')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/salvir1/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
titles_to_remove = ['ciambellone', 'ciambella','puff','croissant', 'croissants', 'crescent', 'brioche']

In [18]:
import re
def remove_punc(string:str) -> str:
    '''Given a string, removes all punctuation and returned punctuation-less string'''
    return re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", string)

In [19]:
def tokenize(str):
    '''
    Tokenize a str and return a tokenized list.
    '''
    return [word for word in word_tokenize(str)]

In [20]:
def lemmatize(doc):
    '''Takes in a doc and lemmatizes tokens in doc
    Parameters
    ----------
    doc: list of tokens
    
    Returns
    -------
    lemmatized tokens
    '''
    return [wordnet.lemmatize(tkn) for tkn in doc]

In [21]:
def rm_stop_words(doc, stops=set(stopwords.words('english'))):
    '''Takes in a doc and removes stop words
    Parameters
    ----------
    doc: list of tokens
    
    Returns
    -------
    Tokens with stop words removed
    '''
    return([w for w in doc if w not in stops])

In [22]:
def rm_title_words(doc, title_words):
    '''Takes in a doc and removes title words to allow algorithm to focus on other keywords
    Parameters
    ----------
    doc: list of tokens
    
    Returns
    -------
    Tokens with title words removed
    '''
    return([w for w in doc if w not in title_words])

In [23]:
def preprocess_corpus(content):
    '''
    Add docstring. Make flexible to allow for doing, or not doing, preprocessing functions. 
    Parameters
    ----------
    content (str): a collection of strings
    Returns
    -------
    A list of lists: each list contains a tokenized version of the original string
    '''
    preprocessed = []
    for i in range(len(content)):
        step_1 = remove_punc(content[i].lower())
        step_2 = tokenize(step_1)
        step_3 = lemmatize(step_2)
        step_4 = rm_stop_words(step_3)
        step_5 = rm_title_words(step_4, titles_to_remove)
        preprocessed.append(step_5)
    return preprocessed

In [24]:
# loading sample data to check functions
df_csnt = pd.read_csv('data/us_croissant.csv')
df_ciambellone = pd.read_csv('data/us_ciambellone.csv')
df_puff = pd.read_csv('data/us_puff.csv')
df_brioche = pd.read_csv('data/us_brioche.csv')
df = pd.concat([df_csnt, df_ciambellone, df_puff, df_brioche], axis=0, ignore_index=True) 
df.columns = ['drop','url','instructions','recipe type']
df.drop('drop', axis = 1)
corpus = df['instructions']
y = df['recipe type']

In [27]:
corpus

0      Watch the video above and use the step-by-step...
1      For the dough: Put the eggs and water in a lar...
2      YieldMakes 24 pastries\nActive Time2 hr\nTotal...
3      It’s all about the layers…\nKlik hier voor Ned...
4      Combine all of the dough ingredients in the bo...
                             ...                        
265    In a medium mixing bowl, whisk together the wa...
266    Combine 1/3 cup of milk and 1 tablespoon of fl...
267    In a small bowl, whisk the yeast with the butt...
268    Warm the milk, transfer to a bowl and crumble ...
269    In a glass measuring cup, combine one cup warm...
Name: instructions, Length: 270, dtype: object

### Preprocessing--data load and function calls

In [28]:
cleaned_tokenized = preprocess_corpus(corpus) # cleaned and tokenized
str_cleaned_tokenized = [" ".join(x) for x in cleaned_tokenized] # string version of cleaned and tokenized 

## Processing

In [29]:
# 'Bag of words function'
vect = CountVectorizer(max_features=500)
word_counts = vect.fit_transform(str_cleaned_tokenized)
word_counts

<270x500 sparse matrix of type '<class 'numpy.int64'>'
	with 29000 stored elements in Compressed Sparse Row format>

In [30]:
vect.get_feature_names()

['10',
 '12',
 '13',
 '14',
 '15',
 '16',
 '18',
 '20',
 '23',
 '24',
 '25',
 '30',
 '34',
 '35',
 '350',
 '40',
 '400',
 '45',
 '60',
 '90',
 'add',
 'adding',
 'air',
 'allow',
 'allpurpose',
 'almond',
 'almost',
 'along',
 'also',
 'always',
 'amount',
 'another',
 'apart',
 'apple',
 'approximately',
 'around',
 'arrange',
 'aside',
 'attachment',
 'away',
 'back',
 'bag',
 'bake',
 'baked',
 'baker',
 'baking',
 'ball',
 'base',
 'batch',
 'batter',
 'beat',
 'become',
 'begin',
 'best',
 'better',
 'big',
 'bit',
 'block',
 'book',
 'bottom',
 'bowl',
 'bread',
 'breakfast',
 'bring',
 'brown',
 'brush',
 'bundt',
 'business',
 'butter',
 'buttery',
 'cake',
 'called',
 'cant',
 'carefully',
 'center',
 'check',
 'cheese',
 'chill',
 'chilled',
 'chocolate',
 'classic',
 'clean',
 'cling',
 'close',
 'cm',
 'coat',
 'coconut',
 'cold',
 'combine',
 'combined',
 'come',
 'completely',
 'container',
 'continue',
 'cook',
 'cooking',
 'cool',
 'cooling',
 'corner',
 'could',
 'coun

In [162]:
tfidfvect = TfidfVectorizer(max_features=500)
tfidf_vectorized = tfidfvect.fit_transform(str_cleaned_tokenized)
tfidf_vectorized.toarray()

array([[0.        , 0.01336513, 0.02400422, ..., 0.01998364, 0.        ,
        0.        ],
       [0.08770108, 0.0568134 , 0.05101939, ..., 0.        , 0.05036798,
        0.        ],
       [0.11471624, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.10225533, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.1151328 , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

## Multinomial Naive Bayes

- One of two common naive Bayes variants used in text classification
- Usually done on word vector counts, but can also be done on tf-idf vectors
- The distribution is parametrized by theta_y vectors for each class
- theta_y is estimated by a smoothed version of maximum likelihood, i.e. relative frequency counting

In [59]:
X_train, X_test, y_train, y_test = train_test_split(word_counts, y, random_state=3)

In [60]:
clf = MultinomialNB()
clf.fit(X_train, y_train)

scoreboard = zip(clf.predict_proba(X_test), clf.predict(X_test), y_test)
spc=''
print(f'\nMean accuracy: {clf.score(X_test, y_test)}\n')

print(f'     Prediction:          Actual                Prob brioche     Prob ciam     Prob crsnt      Prob puff ')
print(f'     ----------           ------                ------------     ---------     ----------      ---------')
for i in scoreboard:
    if i[2] != i[1]:
        print(f'P:  {i[1]:<15} A:  {i[2]:<18} Probs:   {i[0][0]:.3f}{spc:<10}{i[0][1]:.3f}{spc:<10}{i[0][2]:.3f}{spc:<10}{i[0][3]:.3f}')



Mean accuracy: 0.8676470588235294

     Prediction:          Actual                Prob brioche     Prob ciam     Prob crsnt      Prob puff 
     ----------           ------                ------------     ---------     ----------      ---------
P:  brioche         A:  ciambellone        Probs:   0.998          0.000          0.002          0.000
P:  brioche         A:  puff_pastry        Probs:   1.000          0.000          0.000          0.000
P:  brioche         A:  croissant          Probs:   0.988          0.005          0.007          0.000
P:  brioche         A:  croissant          Probs:   1.000          0.000          0.000          0.000
P:  croissant       A:  puff_pastry        Probs:   0.463          0.000          0.537          0.000
P:  brioche         A:  ciambellone        Probs:   1.000          0.000          0.000          0.000
P:  ciambellone     A:  puff_pastry        Probs:   0.000          1.000          0.000          0.000
P:  brioche         A:  puff_pas

In [62]:
print(f'\nConfusion matrix. Predictions in rows. Actuals in columns\n')

print(f'\t\t     Br  Cm  Cs  Pf')
print(f'\t\t     --  --  --  --')
print(f'\t\tBrio {confusion_matrix( clf.predict(X_test), y_test)[0]}')
print(f'\t\tCiam {confusion_matrix( clf.predict(X_test), y_test)[1]}')
print(f'\t\tCsnt {confusion_matrix( clf.predict(X_test), y_test)[2]}')
print(f'\t\tPuff {confusion_matrix( clf.predict(X_test), y_test)[3]}')

print(f'\nMean accuracy: {clf.score(X_test, y_test)}\n')



Confusion matrix. Predictions in rows. Actuals in columns

		     Br  Cm  Cs  Pf
		     --  --  --  --
		Brio [12  2  2  2]
		Ciam [ 0 11  0  1]
		Csnt [ 1  0 20  1]
		Puff [ 0  0  0 16]

Mean accuracy: 0.8676470588235294



In [63]:
kf = KFold(n_splits=5, shuffle=True)  # almost always use shuffle=True
fold_scores = []

for train, test in kf.split(word_counts):
    model = MultinomialNB()
    model.fit(word_counts[train], y[train])
    fold_scores.append(model.score(word_counts[test], y[test]))
    
print(np.mean(fold_scores))

0.8814814814814815


## Word2Vec

In [76]:
from gensim.models import Word2Vec

In [78]:
corpus

0      Watch the video above and use the step-by-step...
1      For the dough: Put the eggs and water in a lar...
2      YieldMakes 24 pastries\nActive Time2 hr\nTotal...
3      It’s all about the layers…\nKlik hier voor Ned...
4      Combine all of the dough ingredients in the bo...
                             ...                        
265    In a medium mixing bowl, whisk together the wa...
266    Combine 1/3 cup of milk and 1 tablespoon of fl...
267    In a small bowl, whisk the yeast with the butt...
268    Warm the milk, transfer to a bowl and crumble ...
269    In a glass measuring cup, combine one cup warm...
Name: instructions, Length: 270, dtype: object

In [79]:
word2vec = Word2Vec(cleaned_tokenized)
#A value of 2 for min_count specifies to include only those words in the Word2Vec model that appear at least twice in the corpus.

In [82]:
vocabulary = word2vec.wv.vocab
#vocabulary = word2vec objects containing words appearing two or more times in the corpus

In [95]:
word2vec.wv.most_similar('bowl')
# method that returns the most similar words to the given word, not association, but similarity supposedly

[('stand', 0.9823739528656006),
 ('large', 0.9782915115356445),
 ('mixer', 0.9734643697738647),
 ('speed', 0.9653637409210205),
 ('hook', 0.9618068933486938),
 ('medium', 0.9615850448608398),
 ('fitted', 0.9613025188446045),
 ('beat', 0.9564694166183472),
 ('mixing', 0.9548104405403137),
 ('whisk', 0.9545235633850098)]

## Clustering with K Means

In [41]:
clusters = 8
kmeans = KMeans(n_clusters=clusters, 
                random_state=0).fit(tfidf_vectorized)

- Investigate the clusters  

> - Investigate the 'centroids' to find out what "topics" Kmeans has discovered by mapping these vectors back into the 'word space'.  Think of each feature/dimension of the centroid vector as representing the "average" article or the average occurrences of words for that cluster.
   
> - Find the features/dimensions with the greatest representation in the centroid.  Print out the top ten words for each centroid.


In [42]:
def Sort(sub_li): 
    return sorted(sub_li, key = lambda x: x[0], reverse=True)

def get_word(centroid):
    return [x[1] for x in centroid]

for k in range(3):
    matched = zip(kmeans.cluster_centers_[k], tfidfvect.get_feature_names())
    match = Sort(list(matched))
    print(' '.join(get_word(match[:12])), '\n')

pastry dough puff butter flour roll fold make recipe water cold rolling 

dough butter flour wrap roll rectangle fold plastic work water third surface 

batter pan cake sugar bundt minute flour oil beat whisk preheat egg 



For heirarchical clustering methods, see 819 am clustering assignment

## Cosine similarity
- Unsupervised learning

- Use the cosine similarity to compare similarity between documents.

- sklearn's [linear_kernel](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.linear_kernel.html) (computes dot product) can be used on tfidf to compute the cosine similarity since rows are normalized.*

- Here's a page on cosine similarity from [sklearn documentation](http://scikit-learn.org/stable/modules/metrics.html#cosine-similarity) and a relevant [stack overflow post](http://stackoverflow.com/questions/12118720/python-tf-idf-cosine-to-find-document-similarity).

- *The stack overflow post is helpful. It provides instruction over how to slice the tfidf and then how to apply cosine similarity between one doc and all of the rest.*

In [43]:
cosine_similarities = linear_kernel(tfidf_vectorized[1:2], tfidf_vectorized[1:500]).flatten() # This is comparing cs for article #2 and the next 500.
cosine_similarities

array([1.        , 0.42532042, 0.51480273, 0.63426294, 0.58131785,
       0.49458012, 0.41182741, 0.67212269, 0.39923599, 0.43034994,
       0.59669337, 0.55242081, 0.39986244, 0.59749017, 0.59781199,
       0.61531608, 0.56630834, 0.48291278, 0.51860656, 0.62124648,
       0.55917093, 0.64123624, 0.62018231, 0.21397562, 0.44491723,
       0.66974737, 0.51849926, 0.60090852, 0.56011136, 0.5395997 ,
       0.33038423, 0.32047009, 0.61687334, 0.22388983, 0.59963174,
       0.46037651, 0.54202385, 0.50374599, 0.54340846, 0.55421876,
       0.26610053, 0.56702984, 0.54851483, 0.29379981, 0.60718011,
       0.52875962, 0.16581074, 0.19842094, 0.64240339, 0.53090086,
       0.33606642, 0.22834341, 0.54317563, 0.61562137, 0.19273947,
       0.20738625, 0.43520125, 0.208638  , 0.22270445, 0.61583263,
       0.18295336, 0.56707674, 0.51721626, 0.54509692, 0.60539148,
       0.6357192 , 0.50188313, 0.15531057, 0.53276468, 0.53276468,
       0.1737135 , 0.32499931, 0.1876329 , 0.15935641, 0.14880

In [44]:
related_docs_indices = cosine_similarities.argsort()[:-5:-1] # This identifies the index of the top 5 most similar.
print(related_docs_indices)

cosine_similarities[related_docs_indices] # and their related cs

[  0   7  25 131]


array([1.        , 0.67212269, 0.66974737, 0.64343882])

In [None]:
articles.iloc[411] # Going step by step pulling up the most similar articles by index

## Decompositions NMF (and SVD)
- Unsupervised learning
- Good for situations when there's some potentially valid grouping to both rows and columns, such as putting Joe and Sam in the same group because they like similar movies (as opposed to traditional supervised models where there are features and targets)
- See 820pm solution to NMF for good soft classification and test of classification


## Naive Bayes
- Supervised learning method to assign class probabilities to a document
- See 818PM NLP-pipeline-programming-net-example for using sklearn Naive Bayes classifier. See also 818PM lecture on text classification. Solutions to assignment contain a number of useful naive Bayes python functions