In [1]:
import pandas as pd
pd.set_option('display.max_columns', 500)

import fuzzymatcher
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from fuzzymatcher import link_table, fuzzy_left_join

### Clean CSV Data

In [2]:
# descriptors of cannabis strains
can = pd.read_csv('/Users/stephaniekendall/Desktop/Flatiron/projects/Cannabis-Predictions/CSV Files/cannabis.csv', error_bad_lines=False)

# kushy csv
strains = pd.read_csv('/Users/stephaniekendall/Desktop/Flatiron/projects/Cannabis-Predictions/CSV Files/strains-kushy_api.2017-11-14.csv')

b'Skipping line 1333: expected 6 fields, saw 10\nSkipping line 1655: expected 6 fields, saw 10\nSkipping line 1656: expected 6 fields, saw 10\nSkipping line 1657: expected 6 fields, saw 10\nSkipping line 2102: expected 6 fields, saw 10\n'


In [3]:
can = can.dropna()
can = can.drop(columns=['description'])

# drop columns with only zeros or NaN
strains = strains.drop(columns=['slug','image','thca','thcv','cbda','cbdv'])

# drop columns with over 80% NaNs
strains = strains.drop(columns=['description','crosses','ailment','flavor','location','terpenes','breeder'])

# drop last 7 columns of empty cbd values
strains = strains.iloc[:, 0:8]

# replace - with whitespace in name 
strains['name'] = strains['name'].str.replace('-',' ')

strains = strains.dropna()

# create new dataframe for effects of strains
effects = strains[['id','name','type','effects']]
strains = strains.drop(columns=['effects'])


can['strain'] = can['strain'].str.replace('-',' ')

can['strain'] = can['strain'].str.title()

In [4]:
def match_name(name, list_names, min_score=0):
    # -1 score incase we don't get any matches
    max_score = -1
    # Returning empty name for no match as well
    max_name = ""
    # Iternating over all names in the other
    for name2 in list_names:
        #Finding fuzzy match score
        score = fuzz.ratio(name, name2)
        # Checking if we are above our threshold and have a better score
        if (score > min_score) & (score > max_score):
            max_name = name2
            max_score = score
    return (max_name, max_score)

In [5]:
# List for dicts for easy dataframe creation
dict_list = []
# iterating over df with more strains
for name in can.strain:
    # Use our method to find best match, we can set a threshold here
    match = match_name(name, effects.name, 90)
    
    # New dict for storing data
    dict_ = {}
    dict_.update({"strain" : name})
    dict_.update({"name" : match[0]})
    dict_list.append(dict_)
    
merge_table = pd.DataFrame(dict_list)
# Display results
merge_table

Unnamed: 0,name,strain
0,,100 Og
1,,98 White Widow
2,1024,1024
3,,13 Dawgs
4,,24K Gold
5,,3 Bears Og
6,,3 Kings
7,,303 Og
8,,3D Cbd
9,3X Crazy,3X Crazy


In [6]:
df = pd.merge(merge_table, can, left_on='strain', right_on='strain')

In [7]:
df = df.drop(columns=['name'])
df1 = df['effects'].str.get_dummies(sep=',')
df1 = pd.merge(df, df1, left_index=True, right_index=True)
df1.head()

Unnamed: 0,strain,type,rating,effects,taste,Aroused,Creative,Dry,Energetic,Euphoric,Focused,Giggly,Happy,Hungry,Mouth,None,Relaxed,Sleepy,Talkative,Tingly,Uplifted
0,100 Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",0,1,0,1,1,0,0,0,0,0,0,1,0,0,1,0
1,98 White Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",1,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0
2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,1
3,13 Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",0,1,0,0,0,0,0,0,1,0,0,1,0,0,1,1
4,24K Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange",0,0,0,0,1,0,0,1,0,0,0,1,0,1,0,1


In [8]:
tastes = df['taste'].str.get_dummies(sep=',')
df2 = pd.merge(df, tastes, left_index=True, right_index=True)
df1 = df1.drop(columns=['effects','rating','taste'])
df2 = df2.drop(columns=['effects','rating','taste'])
df2.head()

Unnamed: 0,strain,type,Ammonia,Apple,Apricot,Berry,Blue,Blueberry,Butter,Cheese,Chemical,Chestnut,Citrus,Coffee,Diesel,Earthy,Flowery,Fruit,Grape,Grapefruit,Honey,Lavender,Lemon,Lime,Mango,Menthol,Mint,Minty,None,Nutty,Orange,Peach,Pear,Pepper,Pine,Pineapple,Plum,Pungent,Rose,Sage,Skunk,Spicy/Herbal,Strawberry,Sweet,Tar,Tea,Tobacco,Tree,Tropical,Vanilla,Violet,Woody
0,100 Og,hybrid,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1,98 White Widow,hybrid,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,1024,sativa,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1
3,13 Dawgs,hybrid,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,24K Gold,hybrid,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [10]:
import matplotlib.pyplot as plt
from matplotlib import venn2

ImportError: cannot import name 'venn2' from 'matplotlib' (/Users/stephaniekendall/anaconda3/lib/python3.7/site-packages/matplotlib/__init__.py)

### Clean API Data

In [10]:
import requests

In [11]:
response = requests.get('http://strainapi.evanbusse.com/oJPpcAq/strains/search/all')
response.status_code

200

In [12]:
data = response.json()
df3 = pd.DataFrame.from_dict(data)
df3 = df3.transpose()
df3 = pd.concat([df3.drop(['effects'], axis=1), df3['effects'].apply(pd.Series)], axis=1)
df3.reset_index(inplace=True)
df3.drop(columns=['id'], inplace=True)
df3 = df3.rename(columns={'index':'name'})
df3.head()

Unnamed: 0,index,flavors,race,positive,negative,medical
0,Afpak,"[Earthy, Chemical, Pine]",hybrid,"[Relaxed, Hungry, Happy, Sleepy]",[Dizzy],"[Depression, Insomnia, Pain, Stress, Lack of A..."
1,African,"[Spicy/Herbal, Pungent, Earthy]",sativa,"[Euphoric, Happy, Creative, Energetic, Talkative]",[Dry Mouth],"[Depression, Pain, Stress, Lack of Appetite, N..."
2,Afternoon Delight,"[Pepper, Flowery, Pine]",hybrid,"[Relaxed, Hungry, Euphoric, Uplifted, Tingly]","[Dizzy, Dry Mouth, Paranoid]","[Depression, Insomnia, Pain, Stress, Cramps, H..."
3,Afwreck,"[Pine, Earthy, Flowery]",hybrid,"[Relaxed, Happy, Creative, Uplifted, Sleepy]","[Dizzy, Dry Mouth, Paranoid, Dry Eyes]","[Pain, Stress, Headache, Fatigue, Headaches, M..."
4,Agent Orange,"[Citrus, Orange, Sweet]",hybrid,"[Relaxed, Euphoric, Happy, Energetic, Uplifted]","[Dizzy, Dry Mouth, Paranoid, Dry Eyes]","[Depression, Pain, Stress, Nausea, Headache, H..."


In [23]:
len(df3)

1970

In [25]:
# List for dicts for easy dataframe creation
dict_list1 = []
# iterating over df with more strains
for name in df1.strain:
    # Use our method to find best match, we can set a threshold here
    match = match_name(name, df3.name, 90)
    
    # New dict for storing data
    dict_ = {}
    dict_.update({"strain" : name})
    dict_.update({"name" : match[0]})
    dict_list1.append(dict_)
    
merge_table1 = pd.DataFrame(dict_list)
# Display results
merge_table1

Unnamed: 0,name,strain
0,,100 Og
1,,98 White Widow
2,1024,1024
3,,13 Dawgs
4,,24K Gold
5,,3 Bears Og
6,,3 Kings
7,,303 Og
8,,3D Cbd
9,3X Crazy,3X Crazy


In [52]:
df3 = df3.rename(columns={'name':'names'})
df3.head()

Unnamed: 0,names,flavors,race,positive,negative,medical
0,Afpak,"[Earthy, Chemical, Pine]",hybrid,"[Relaxed, Hungry, Happy, Sleepy]",[Dizzy],"[Depression, Insomnia, Pain, Stress, Lack of A..."
1,African,"[Spicy/Herbal, Pungent, Earthy]",sativa,"[Euphoric, Happy, Creative, Energetic, Talkative]",[Dry Mouth],"[Depression, Pain, Stress, Lack of Appetite, N..."
2,Afternoon Delight,"[Pepper, Flowery, Pine]",hybrid,"[Relaxed, Hungry, Euphoric, Uplifted, Tingly]","[Dizzy, Dry Mouth, Paranoid]","[Depression, Insomnia, Pain, Stress, Cramps, H..."
3,Afwreck,"[Pine, Earthy, Flowery]",hybrid,"[Relaxed, Happy, Creative, Uplifted, Sleepy]","[Dizzy, Dry Mouth, Paranoid, Dry Eyes]","[Pain, Stress, Headache, Fatigue, Headaches, M..."
4,Agent Orange,"[Citrus, Orange, Sweet]",hybrid,"[Relaxed, Euphoric, Happy, Energetic, Uplifted]","[Dizzy, Dry Mouth, Paranoid, Dry Eyes]","[Depression, Pain, Stress, Nausea, Headache, H..."


In [30]:
len(merge_table1)

2273

In [48]:
df4 = df.strain.map(merge_table1.)
df4.head()

Unnamed: 0,strain_x,type,rating,effects,taste,name,strain_y
0,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024,1024
1,3X Crazy,indica,4.4,"Relaxed,Tingly,Happy,Euphoric,Uplifted","Earthy,Grape,Sweet",3X Crazy,3X Crazy
2,8 Ball Kush,indica,4.3,"Relaxed,Uplifted,Happy,Creative,Euphoric","Citrus,Earthy,Pine",8 Ball Kush,8 Ball Kush
3,9 Pound Hammer,indica,4.5,"Relaxed,Sleepy,Euphoric,Happy,Hungry","Earthy,Sweet,Berry",9 Pound Hammer,9 Pound Hammer
4,91 Krypt,indica,4.7,"Relaxed,Euphoric,Happy,Hungry,Uplifted","Earthy,Pungent,Berry",91 Krypt,91 Krypt


In [49]:
len(df4)

804

In [19]:
df4 = df4.drop(columns=['name_x','name_y'])
df4.sample(5)

Unnamed: 0,strain,flavors,race,positive,negative,medical
854,Locomotion,"[Earthy, Pungent, Sweet]",indica,"[Relaxed, Euphoric, Happy, Uplifted]",[Dry Mouth],"[Depression, Insomnia, Pain, Stress, Nausea]"
234,Boss Hogg,"[Earthy, Woody, Lemon]",hybrid,"[Relaxed, Hungry, Happy, Creative, Uplifted]","[Dry Mouth, Dry Eyes]","[Insomnia, Pain, Stress, Lack of Appetite, Mus..."
169,Blue Galaxy,"[Sweet, Berry, Blueberry]",hybrid,"[Relaxed, Euphoric, Happy, Uplifted]",[],"[Depression, Pain, Stress, Headaches]"
277,Cannalope Kush,"[Sweet, Tropical, Pine]",hybrid,"[Euphoric, Happy, Energetic, Uplifted, Focused]","[Dizzy, Dry Mouth, Paranoid, Dry Eyes]","[Depression, Pain, Stress, Fatigue, Headaches]"
1153,Ripped Bubba,"[Sweet, Citrus, Lemon]",hybrid,"[Relaxed, Hungry, Euphoric, Happy]","[Dizzy, Dry Mouth, Dry Eyes, Anxious]","[Insomnia, Pain, Stress, Lack of Appetite, Hea..."


### Evaluations