# Unit 4 Build Week - Med Cabinet

- Create an NLP model that returns the top three strain recommendations based on what the user is looking for
  - The returned output should be in JSON format
- Host the database in PostgreSQL or SQLite
- Deploy a Heroku app for the front-end web developers to connect to


### Load and clean data

# New Section

In [None]:
# Load file into Google Colab
# Colab-specific helper: this cell assumes the notebook runs in a Google Colab runtime.

from google.colab import files
# Per this cell's output, the upload saves cabinet_strain.csv into the working
# directory, which is the relative path the later pd.read_csv call reads from.
uploaded = files.upload()

Saving cabinet_strain.csv to cabinet_strain.csv


In [None]:
# Imports

import pandas as pd
import numpy as np

import spacy
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English

from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.neighbors import NearestNeighbors

In [None]:
# Load the uploaded strain dataset from the working directory

csv_path = "cabinet_strain.csv"
data = pd.read_csv(csv_path)

# Report (rows, columns), then preview the first five records
print(data.shape)
data.head()

(1644, 8)


Unnamed: 0,strain_id,strain_name,strain_type,strain_rating,effects_profile,flavor_profile,strain_description,model_id
0,3535,1024,Sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...,1.0
1,3534,100 OG,Hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...,0.0
2,3536,13 Dawgs,Hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...,
3,3537,24K Gold,Hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%...",2.0
4,3539,303 OG,Indica,4.2,"Relaxed,Happy,Euphoric,Uplifted,Giggly","Citrus,Pungent,Earthy",The indica-dominant 303 OG is a Colorado strai...,4.0


In [None]:
# Replace NaN values column by column: 0 for the numeric rating and a
# human-readable placeholder sentence for each text column.
# Disabled step, kept for reference: data = data.drop('model_id', axis=1)

nan_defaults = {
    'strain_rating': 0,
    'effects_profile': 'Effects currently unknown',
    'flavor_profile': 'Flavor profile currently unavailable',
    'strain_description': 'Strain description currently unavailable',
    'strain_type': 'Strain type currently unknown',
}

for column, default in nan_defaults.items():
    data[column] = data[column].replace(np.nan, default)

In [None]:
# Normalise the casing of the strain type labels, then list the distinct values

strain_types = data['strain_type'].str.capitalize()
data['strain_type'] = strain_types

strain_types.unique()

array(['Sativa', 'Hybrid', 'Indica', 'Strain type currently unknown'],
      dtype=object)

In [None]:
# Distinct rating values in the data (0 is the placeholder the cleaning cell
# writes for a missing rating)

ratings = data['strain_rating']
ratings.unique()

array([4.4, 4. , 4.2, 4.6, 4.5, 4.3, 4.7, 5. , 3.8, 4.8, 4.1, 0. , 3.4,
       3.7, 3.9, 4.9, 3.6, 2.8, 3.3, 3.5, 2. , 3. , 3.2])

In [None]:
# Count the missing values still present in each column

data.isna().sum()

strain_id               0
strain_name             0
strain_type             0
strain_rating           0
effects_profile         0
flavor_profile          0
strain_description      0
model_id              771
dtype: int64

In [None]:
# Discard every row that still contains a NaN and renumber the index from 0.
# NOTE(review): per the null count shown earlier only model_id still has NaNs
# (771 rows), so this step keeps just the 873 rows that carry a model_id.

data = data.dropna().reset_index(drop=True)
data.shape

(873, 8)

In [None]:
# Preview the first five rows of the frame after the NaN rows were dropped
data.head(n=5)

Unnamed: 0,strain_id,strain_name,strain_type,strain_rating,effects_profile,flavor_profile,strain_description,model_id
0,3535,1024,Sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...,1.0
1,3534,100 OG,Hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...,0.0
2,3537,24K Gold,Hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%...",2.0
3,3539,303 OG,Indica,4.2,"Relaxed,Happy,Euphoric,Uplifted,Giggly","Citrus,Pungent,Earthy",The indica-dominant 303 OG is a Colorado strai...,4.0
4,3538,3 Kings,Hybrid,4.4,"Relaxed,Euphoric,Happy,Uplifted,Hungry","Earthy,Sweet,Pungent","The 3 Kings marijuana strain, a holy trinity o...",3.0


In [None]:
# Build one comma-separated text field per strain: type, then effects, then flavors

data['strain_profile'] = data['strain_type'].str.cat(
    [data['effects_profile'], data['flavor_profile']],
    sep=',',
)

In [None]:
# Save cleaned df to use in database

# data.to_csv('med_cabinet_cleaned.csv')

In [None]:
# Vectorizer object

# NOTE(review): the spaCy pipeline and tokenizer are created here but never
# handed to the TfidfVectorizer (no tokenizer= argument is given below).
nlp = English()
tokenizer = Tokenizer(nlp.vocab)

# Extra stop words: filler terms that appear in the NaN placeholder sentences
# written by the cleaning cell ("... currently unknown", "... profile currently
# unavailable"), so they do not become features.
my_words = ['unavailable', 'unknown', 'profile', 'currently']
my_stop_words = text.ENGLISH_STOP_WORDS.union(my_words)

# The union above is a frozenset. Newer scikit-learn releases validate
# `stop_words` and accept only 'english', a list or None, so a list is passed.
# Sorting keeps the argument deterministic between runs.
tf = TfidfVectorizer(stop_words=sorted(my_stop_words))

In [None]:
# Create a document-term matrix

# Fit the TF-IDF vocabulary on the strain profiles; astype('U') coerces every
# value to a unicode string before vectorizing.
tfidf_sparse = tf.fit_transform(data['strain_profile'].values.astype('U'))

# get_feature_names() was removed from newer scikit-learn releases in favour of
# get_feature_names_out(); fall back to the old name so older installs still work.
if hasattr(tf, 'get_feature_names_out'):
    feature_names = tf.get_feature_names_out()
else:
    feature_names = tf.get_feature_names()

# toarray() gives a plain ndarray, whereas todense() gives the legacy np.matrix type.
dtm = pd.DataFrame(tfidf_sparse.toarray(), columns=feature_names)
print(dtm.shape)
dtm.head()

(873, 66)


Unnamed: 0,ammonia,apple,apricot,aroused,berry,blue,blueberry,butter,cheese,chemical,chestnut,citrus,coffee,creative,diesel,earthy,energetic,euphoric,flowery,focused,fruit,giggly,grape,grapefruit,happy,herbal,honey,hungry,hybrid,indica,lavender,lemon,lime,mango,menthol,mint,minty,nutty,orange,peach,pear,pepper,pine,pineapple,plum,pungent,relaxed,rose,sage,sativa,skunk,sleepy,spicy,strawberry,sweet,talkative,tar,tea,tingly,tobacco,tree,tropical,uplifted,vanilla,violet,woody
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.239243,0.0,0.0,0.245913,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.112725,0.370503,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.122189,0.0,0.628755,0.266651,0.0,0.0,0.370503,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.144485,0.0,0.0,0.304625
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.400891,0.0,0.37091,0.0,0.24917,0.38125,0.199618,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.295318,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.189436,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.264071,0.0,0.0,0.0,0.511464,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.358774,0.0,0.0,0.0,0.222993,0.0,0.178647,0.0,0.0,0.0,0.0,0.0,0.0,0.156403,0.0,0.0,0.0,0.264292,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.642586,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.169534,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.462041,0.0,0.0,0.0,0.0,0.0,0.0,0.200469,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.41491,0.0,0.0,0.0,0.257884,0.0,0.206599,0.0,0.0,0.0,0.53603,0.0,0.0,0.180874,0.0,0.0,0.0,0.0,0.349031,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.429662,0.196061,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.231836,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.295194,0.0,0.236489,0.0,0.0,0.0,0.0,0.0,0.0,0.207043,0.0,0.0,0.480731,0.349865,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.491825,0.224427,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.312848,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.265378,0.0,0.0,0.0


In [None]:
# Build the nearest-neighbour index over the TF-IDF rows of the document-term matrix

n_neighbors = 5
nn = NearestNeighbors(algorithm='ball_tree', n_neighbors=n_neighbors)
nn.fit(dtm)

NearestNeighbors(algorithm='ball_tree', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [None]:
# Test model

user01_input = ['strawberry']

# Project the query text onto the vocabulary learned by the fitted vectorizer.
user01_dense = tf.transform(user01_input)

# toarray() produces a plain ndarray. todense() produces np.matrix, which newer
# scikit-learn releases refuse as estimator input. The distances get a real
# name instead of `_`, which IPython uses for the previous cell output.
user01_distances, user01_output = nn.kneighbors(user01_dense.toarray())

user01_output

array([[731, 365, 723, 729, 722]])

In [None]:
# Show the full record of every recommended strain

# kneighbors returns one row of indices per query; flatten them into a single list
list_strains = list(user01_output.ravel())

for row_label in list_strains:
    record = data.loc[row_label, :]
    print(f"{record}\n")

strain_id                                                          5043
strain_name                                             Strawberry Kush
strain_type                                                      Hybrid
strain_rating                                                       4.2
effects_profile                  Relaxed,Happy,Sleepy,Uplifted,Euphoric
flavor_profile                                 Strawberry,Sweet,Pungent
strain_description    Strong and sweet, Strawberry Kush is one well-...
model_id                                                            885
strain_profile        Hybrid,Relaxed,Happy,Sleepy,Uplifted,Euphoric,...
Name: 731, dtype: object

strain_id                                                          4253
strain_name                                                 Gummy Bears
strain_type                                                      Hybrid
strain_rating                                                       4.2
effects_profile                  Relax

In [None]:
# # Create pickles of the model and the transformer for web deployment

# # Imports
# import pickle
# import joblib  # sklearn.externals.joblib was removed in scikit-learn 0.23; use the standalone joblib package

# # Save the model as a pickle file 
# joblib.dump(nn, 'nn02_model.pkl') 

# # Save the transformer as a pickle file
# joblib.dump(tf, 'tf_01.pkl') 

['nn02_model.pkl']