<a href="https://colab.research.google.com/github/pragmatizt/build-week-med-cabinet/blob/master/nlp_model_comparison.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!python -m spacy download en_core_web_md
# https://stackoverflow.com/questions/56927602/unable-to-load-the-spacy-model-en-core-web-lg-on-google-colab

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


In [0]:
# !pip install -U spacy[lookups]

In [0]:

# Path and File Libraries
import os
import pickle

# Data Transformation Libraries
import pandas as pd
import numpy as np
import spacy

from spacy.tokenizer import Tokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

# joblib
# from sklearn.externals import joblib
# from joblib import dump, load


In [0]:
# from spacy.lemmatizer import Lemmatizer

## load data

In [0]:
url= 'df_merged.csv'
df = pd.read_csv(url)

In [6]:
df.head()

Unnamed: 0,Strain,Type,Rating,Description,flavors,positive,negative,medical
0,Afpak,hybrid,4.2,"Afpak, named for its direct Afghani and Pakist...","['Earthy', 'Chemical', 'Pine', 'Spicy/Herbal']","['Relaxed', 'Hungry', 'Happy', 'Sleepy', 'Crea...",['Dizzy'],"['Depression', 'Insomnia', 'Pain', 'Stress', '..."
1,African,sativa,3.9,African refers to the indigenous varieties of ...,"['Spicy/Herbal', 'Pungent', 'Earthy', 'Pepper']","['Euphoric', 'Happy', 'Creative', 'Energetic',...",['Dry Mouth'],"['Depression', 'Pain', 'Stress', 'Lack of Appe..."
2,Afternoon Delight,hybrid,4.8,"Afternoon Delight, created by Colorado Seed In...","['Pepper', 'Flowery', 'Pine', 'Pungent', 'Citr...","['Relaxed', 'Hungry', 'Euphoric', 'Uplifted', ...","['Dizzy', 'Dry Mouth', 'Paranoid']","['Depression', 'Insomnia', 'Pain', 'Stress', '..."
3,Afwreck,hybrid,4.2,Afwreck is a hybrid cross of Afghani and Train...,"['Pine', 'Earthy', 'Flowery', 'Pungent']","['Relaxed', 'Happy', 'Creative', 'Uplifted', '...","['Dizzy', 'Dry Mouth', 'Paranoid', 'Dry Eyes']","['Pain', 'Stress', 'Headache', 'Fatigue', 'Hea..."
4,Agent Orange,hybrid,4.2,Don’t let the name scare you! The only herbici...,"['Citrus', 'Orange', 'Sweet', 'Earthy']","['Relaxed', 'Euphoric', 'Happy', 'Energetic', ...","['Dizzy', 'Dry Mouth', 'Paranoid', 'Dry Eyes']","['Depression', 'Pain', 'Stress', 'Nausea', 'He..."


In [0]:
# df['combined_text'] = df.Strain + ' ' + df.Type + ' ' + df.flavors + ' ' + df.Description + ' ' + df.positive + ' ' +    df.negative + ' ' + df.medical


In [0]:
df['combined_text'] = df.Strain + ' ' + df.Type + ' ' + df.flavors + ' ' + df.Description + ' ' + df.positive + ' ' +    df.negative + ' ' + df.medical
# Removing punctuations from our string
df["combined_text"] = df['combined_text'].str.replace('[^\w\s]',' ')
        
# Creating an index
df.reset_index(level=0, inplace=True)
        
for desc in df['combined_text']:
    if desc == 'None':
       desc = np.nan
        
df = df.dropna()

In [8]:
df.shape

(1473, 10)

In [9]:
df['combined_text'].head()

0    Afpak hybrid   Earthy    Chemical    Pine    S...
1    African sativa   Spicy Herbal    Pungent    Ea...
2    Afternoon Delight hybrid   Pepper    Flowery  ...
3    Afwreck hybrid   Pine    Earthy    Flowery    ...
4    Agent Orange hybrid   Citrus    Orange    Swee...
Name: combined_text, dtype: object

## split data as features and target

In [0]:
# We set our features as description, and target as strain.  
# Create a mass text.

features = ['combined_text'] # expanding the features medical + flavors
target = 'Strain'

X = df[features]
y = df[[target]]

In [0]:
#Create the nlp object
nlp = spacy.load("en_core_web_md")

# create tokenizer object
tokenizer = Tokenizer(nlp.vocab)

In [0]:
def tokenize(doc):
        """Return the tokens"""
        return [token.text for token in tokenizer(doc)]

In [0]:
def get_lemmas(text):
        """Return the Lemmas"""
        lemmas = []
        doc = nlp(text)
    
        for token in doc: 
            if ((token.is_stop == False) and (token.is_punct == False)) and (token.pos_!= 'PRON'):
                lemmas.append(token.lemma_)
    
        return lemmas
    

## transform and fit

In [0]:
text = df["combined_text"]

# Instantiate vectorizer object
tfidf = TfidfVectorizer(tokenizer=get_lemmas, min_df=0.025, max_df=.98, ngram_range=(1,2))

# Create a vocabulary and get word counts per document
dtm = tfidf.fit_transform(text) # Similiar to fit_predict

# Get feature names to use as dataframe column headers
dtm = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())

In [15]:
# View Feature Matrix as DataFrame
print(dtm.shape)
dtm.head()

(1473, 662)


Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,depression,depression.1,dizzy,dry,insomnia,pain,anxious,arouse,berry,blueberry,citrus,cramp,creative,diesel,dry.1,earthy,energetic,euphoric,eye,fatigue,flowery,focus,giggly,grape,happy,headache,hungry,inflammation,insomnia.1,lack,lemon,lime,muscle,nausea,orange,pain.1,paranoid,...,tingly,trainwreck,treat,trichome,trichome.1,tropical,tropical.1,tropical.2,true,typically,undertone,unique,uplift,uplift.1,uplift.2,uplift.3,uplifted,uplifted.1,uplifting,uplifting.1,use,user,vanilla,variety,way,week,week.1,white,white widow,widow,win,woody,woody.1,woody.2,x,yield,yield.1,Unnamed: 79,euphoric.1,relaxed
0,0.087507,0.0,0.0,0.0,0.051307,0.067999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.063124,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.074108,0.0,0.0,0.040189,0.0,0.083002,0.0,0.076247,0.074316,0.0,0.0,0.0,0.0,0.0,0.049179,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.151828,0.0,0.0,0.0,0.059346,0.0,0.084848,0.0,0.0,0.0,0.135955,0.0,0.0,0.0,0.0,0.073015,0.0,0.0,0.083382,0.082855,0.052612,0.0,0.0,0.0,0.0,0.0,0.0,0.046486,0.070092,0.0,0.0,0.0,0.085961,0.0,0.0,0.0,0.108244,0.0,0.056886,0.0,...,0.161027,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.154848,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.09103,0.0,0.0,0.0,0.053373,0.070737,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.097718,0.136761,0.065665,0.0,0.057231,0.0,0.0,0.047316,0.0,0.0,0.109708,0.0,0.0,0.0,0.0,0.063037,0.086344,0.0,0.079317,0.0,0.0,0.0,0.0,0.0,0.0,0.05116,0.075956,...,0.0,0.0,0.0,0.0,0.0,0.115572,0.15857,0.0,0.0,0.0,0.0,0.0,0.055111,0.068088,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.10122,0.0,0.112527
3,0.113814,0.0,0.0,0.0,0.0,0.088442,0.0,0.0,0.193866,0.0,0.0,0.0,0.0,0.0,0.0,0.082101,0.0,0.14311,0.093759,0.0,0.059159,0.0,0.096658,0.137168,0.0,0.0,0.0,0.052271,0.15763,0.0,0.0,0.0,0.0,0.0,0.0,0.129715,0.0,0.0,0.0,0.094967,...,0.0,0.215663,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.068905,0.08513,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.076108,0.0,0.0,0.0,0.044624,0.059142,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.095698,0.062697,0.062301,0.03956,0.0,0.0,0.0,0.0,0.0,0.0,0.034954,0.105408,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.081391,0.139418,0.042773,0.063505,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.046077,0.0,0.093177,0.0,0.0,0.0,0.187858,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Fit on TF-IDF Vectors
size = 5
nn_ball_tree  = NearestNeighbors(n_neighbors=size, algorithm='ball_tree') # We use the ball_tree algorithm here. 
nn_ball_tree.fit(dtm)

NearestNeighbors(algorithm='ball_tree', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [0]:
user_input = ["""A sweet, citrusy flavor that is euphoric"""]

vec_user_input = tfidf.transform(user_input)
dist, strain_index = nn_ball_tree.kneighbors(vec_user_input.todense())

In [18]:
print(strain_index)

[[1288 1298  840 1300   78]]


In [19]:
print(df[['Strain','Type','flavors','medical']].iloc[1288])
print(df[['Description']].iloc[1288])
print(df[['Strain','Type','flavors','medical']].iloc[1298])
print(df[['Description']].iloc[1298])
print(df[['Strain','Type','flavors','medical']].iloc[840])
print(df[['Description']].iloc[840])
print(df[['Strain','Type','flavors','medical']].iloc[1300])
print(df[['Description']].iloc[1300])
print(df[['Strain','Type','flavors','medical']].iloc[78])
print(df[['Description']].iloc[78])

Strain                                     Sweet Berry
Type                                            hybrid
flavors        ['Berry', 'Sweet', 'Blueberry', 'Sage']
medical    ['Pain', 'Stress', 'Fatigue', 'Spasticity']
Name: 1308, dtype: object
Description    Sweet Berry is a hybrid cross between two flav...
Name: 1308, dtype: object
Strain                                       Sweet Lafayette
Type                                                  indica
flavors             ['Chestnut', 'Citrus', 'Earthy', 'Pine']
medical    ['Depression', 'Insomnia', 'Pain', 'Stress', '...
Name: 1318, dtype: object
Description    Sweet Lafayette is an 80% indica strain with m...
Name: 1318, dtype: object
Strain                                            Lemonberry
Type                                                  hybrid
flavors            ['Lemon', 'Blueberry', 'Citrus', 'Sweet']
medical    ['Depression', 'Pain', 'Stress', 'Headache', '...
Name: 849, dtype: object
Description    Dabney Blue and Le

## Experimenting and comparison 
Comparing it with KD Tree
Everything beyond this point is not reflected on the API.  It is merely used as a comparison within this notebook.

In [21]:
# Fit on TF-IDF Vectors
size = 5
nn_kd_tree  = NearestNeighbors(n_neighbors=size, algorithm='kd_tree') # Changed the algorithm to kd_tree here.
nn_kd_tree.fit(dtm)

NearestNeighbors(algorithm='kd_tree', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [0]:
user_input_kd = ["""A sweet, citrusy flavor that is euphoric"""]

vec_user_input_kd = tfidf.transform(user_input_kd)
dist, strain_index_kd = nn_kd_tree.kneighbors(vec_user_input_kd.todense())

In [23]:
print(strain_index_kd)

[[1288 1298  840 1300   78]]


In [24]:
print(df[['Strain','Type','flavors','medical']].iloc[1288])
print(df[['Description']].iloc[1288])
print(df[['Strain','Type','flavors','medical']].iloc[1298])
print(df[['Description']].iloc[1298])
print(df[['Strain','Type','flavors','medical']].iloc[840])
print(df[['Description']].iloc[840])
print(df[['Strain','Type','flavors','medical']].iloc[1300])
print(df[['Description']].iloc[1300])
print(df[['Strain','Type','flavors','medical']].iloc[78])
print(df[['Description']].iloc[78])

Strain                                     Sweet Berry
Type                                            hybrid
flavors        ['Berry', 'Sweet', 'Blueberry', 'Sage']
medical    ['Pain', 'Stress', 'Fatigue', 'Spasticity']
Name: 1308, dtype: object
Description    Sweet Berry is a hybrid cross between two flav...
Name: 1308, dtype: object
Strain                                       Sweet Lafayette
Type                                                  indica
flavors             ['Chestnut', 'Citrus', 'Earthy', 'Pine']
medical    ['Depression', 'Insomnia', 'Pain', 'Stress', '...
Name: 1318, dtype: object
Description    Sweet Lafayette is an 80% indica strain with m...
Name: 1318, dtype: object
Strain                                            Lemonberry
Type                                                  hybrid
flavors            ['Lemon', 'Blueberry', 'Citrus', 'Sweet']
medical    ['Depression', 'Pain', 'Stress', 'Headache', '...
Name: 849, dtype: object
Description    Dabney Blue and Le

**Reference Links:** Performance Evaluation: Ball-Tree and KD-Tree in the context of MST, https://arxiv.org/ftp/arxiv/papers/1210/1210.6122.pdf

After reading the article, I learned that the biggest differentiator between Ball Tree and KD Tree algorithms was speed. Is it possible that the dataset we used was simply too small to make a difference in the results?  

"*In this paper, authors compared kd-tree and ball-tree based dual tree Boruvka
algorithm for finding Euclidean Minimum Spanning Tree (EMST). For finding
efficient EMST, authors adopted dual tree algorithm and experimented on a variety of
real time and synthetic datasets of various dimensions. From the experimental
observation, authors conclude that the kd-tree performs faster than the ball-tree for
not only constructing the tree and also for solving the EMST problem. Moreover, the
kd-tree based dual tree Boruvka is giving good results than the ball-tree based dualtree Boruvka.* "
