In [40]:
import pandas as pd
pd.set_option('display.max_columns', 500)

import fuzzymatcher
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from fuzzymatcher import link_table, fuzzy_left_join

In [64]:
pip install python-levenshtein

Note: you may need to restart the kernel to use updated packages.


### Clean CSV Data

In [205]:
# descriptors of cannabis strains
effects = pd.read_csv('/Users/stephaniekendall/Desktop/Flatiron/projects/Cannabis-Predictions/CSV Files/cannabis.csv', error_bad_lines=False)

# kushy csv
contents = pd.read_csv('/Users/stephaniekendall/Desktop/Flatiron/projects/Cannabis-Predictions/CSV Files/strains-kushy_api.2017-11-14.csv')

b'Skipping line 1333: expected 6 fields, saw 10\nSkipping line 1655: expected 6 fields, saw 10\nSkipping line 1656: expected 6 fields, saw 10\nSkipping line 1657: expected 6 fields, saw 10\nSkipping line 2102: expected 6 fields, saw 10\n'


In [167]:
contents = contents.rename(columns={'effects':'cont_effects'})

In [203]:
len(effects)

2348

In [140]:
effects.head()

Unnamed: 0,strain,type,rating,effects,taste,description
0,100-og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...
1,98-white-widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...
2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...
3,13-dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...
4,24k-gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%..."


In [141]:
print('effects length:', len(effects))
print('contents length:', len(contents))

effects length: 2348
contents length: 9524


In [129]:
# contents = contents.dropna()
effects = effects.dropna()

In [152]:
justnames = contents[['name','type']].copy()
justnames = justnames.dropna()
justnames.head()

Unnamed: 0,name,type
0,100 OG,Hybrid
1,707 Headband,Hybrid
2,A 10,Indica
3,Acapulco Gold,Sativa
4,Afghani Bullrider,Hybrid


In [216]:
# Keep description of effects for TFIDF for natural language processing


# drop columns with only zeros or NaN
contents = contents.drop(columns=['slug','image','thca','thcv','cbda','cbdv'])

# drop columns with over 80% NaNs
contents = contents.drop(columns=['description','crosses','location','terpenes','breeder'])

# drop last 7 columns of empty cbd values
contents = contents.iloc[:, 0:8]

# replace - with whitespace in name 
contents['name'] = contents['name'].str.replace('-',' ')

# remove columns the remaining columns that we don't need
# i.e. id, status, sort, slug, breeder, image, rating
contents = contents[['name','type','effects','ailment','flavor']]
effects = effects.drop(columns=['rating'])

# format strings for matching
effects['strain'] = effects['strain'].str.replace('-',' ')

effects['strain'] = effects['strain'].str.title()

In [144]:
print('effects length:', len(effects))
print('contents length:', len(contents))
print('justnames length:', len(justnames))

effects length: 2348
contents length: 9524
justnames length: 9499


In [148]:
contents.isna().sum()

name          1
type         24
effects    8509
ailment    8553
flavor     8553
dtype: int64

In [153]:
from fuzzywuzzy import fuzz

def match_name(name, list_names, min_score=0):
    # -1 score incase we don't get any matches
    max_score = -1
    # Returning empty name for no match as well
    max_name = ""
    # Iternating over all names in the other
    for name2 in list_names:
        #Finding fuzzy match score
        score = fuzz.ratio(name, name2)
        # Checking if we are above our threshold and have a better score
        if (score > min_score) & (score > max_score):
            max_name = name2
            max_score = score
    return (max_name, max_score)

# Will not run if null values are present in DataFrame

In [154]:
# List for dicts for easy dataframe creation
dict_list = []
# iterating over our players without salaries found above
for name in effects.strain:
    # Use our method to find best match, we can set a threshold here
    match = match_name(name, justnames.name, 95)
    
    # New dict for storing data
    dict_ = {}
    dict_.update({"effects" : name})
    dict_.update({"justnames" : match[0]})
    dict_.update({"score" : match[1]})
    dict_list.append(dict_)
    
merge_table = pd.DataFrame(dict_list)
# Display results
merge_table

TypeError: object of type 'float' has no len()

In [7]:
df = pd.merge(merge_table, contents, left_on='strain', right_on='strain')

In [162]:
df = df.drop(columns=['name'])
df.head()

Unnamed: 0,strain,type,rating,effects,taste
0,100 Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus"
1,98 White Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel"
2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody"
3,13 Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit"
4,24K Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange"


In [None]:
df99 = pd.merge(df, )

In [163]:
df.to_csv(r'/Users/stephaniekendall/Desktop/Flatiron/projects/Cannabis-Predictions/CSV Files/matched_strains.csv')

In [175]:
df.head()

Unnamed: 0,strain,type,rating,effects,taste
0,100 Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus"
1,98 White Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel"
2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody"
3,13 Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit"
4,24K Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange"


In [183]:
content = contents.dropna()

In [194]:
left = df.sort_values(by='strain')
right = content.sort_values(by='name')

In [174]:
df2.head()

Unnamed: 0,strain,type,rating,effects,taste,name,cont_effects,ailment,flavor


In [7]:
df = df.drop(columns=['name'])
df1 = df['effects'].str.get_dummies(sep=',')
df1 = pd.merge(df, df1, left_index=True, right_index=True)
df1.head()

Unnamed: 0,strain,type,rating,effects,taste,Aroused,Creative,Dry,Energetic,Euphoric,Focused,Giggly,Happy,Hungry,Mouth,None,Relaxed,Sleepy,Talkative,Tingly,Uplifted
0,100 Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",0,1,0,1,1,0,0,0,0,0,0,1,0,0,1,0
1,98 White Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",1,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0
2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,1
3,13 Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",0,1,0,0,0,0,0,0,1,0,0,1,0,0,1,1
4,24K Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange",0,0,0,0,1,0,0,1,0,0,0,1,0,1,0,1


In [8]:
tastes = df['taste'].str.get_dummies(sep=',')
df2 = pd.merge(df, tastes, left_index=True, right_index=True)
df1 = df1.drop(columns=['effects','rating','taste'])
df2 = df2.drop(columns=['effects','rating','taste'])
df2.head()

Unnamed: 0,strain,type,Ammonia,Apple,Apricot,Berry,Blue,Blueberry,Butter,Cheese,Chemical,Chestnut,Citrus,Coffee,Diesel,Earthy,Flowery,Fruit,Grape,Grapefruit,Honey,Lavender,Lemon,Lime,Mango,Menthol,Mint,Minty,None,Nutty,Orange,Peach,Pear,Pepper,Pine,Pineapple,Plum,Pungent,Rose,Sage,Skunk,Spicy/Herbal,Strawberry,Sweet,Tar,Tea,Tobacco,Tree,Tropical,Vanilla,Violet,Woody
0,100 Og,hybrid,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1,98 White Widow,hybrid,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,1024,sativa,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1
3,13 Dawgs,hybrid,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,24K Gold,hybrid,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [6]:
print(len(can), len(strains))

2273 1015


### Clean API Data

In [207]:
import requests

In [208]:
response = requests.get('http://strainapi.evanbusse.com/oJPpcAq/strains/search/all')
response.status_code

200

In [209]:
data = response.json()
df3 = pd.DataFrame.from_dict(data)
df3 = df3.transpose()
df3 = pd.concat([df3.drop(['effects'], axis=1), df3['effects'].apply(pd.Series)], axis=1)
df3.reset_index(inplace=True)
df3.drop(columns=['id'], inplace=True)
df3 = df3.rename(columns={'index':'name'})
df3.head()

Unnamed: 0,name,flavors,race,positive,negative,medical
0,Afpak,"[Earthy, Chemical, Pine]",hybrid,"[Relaxed, Hungry, Happy, Sleepy]",[Dizzy],"[Depression, Insomnia, Pain, Stress, Lack of A..."
1,African,"[Spicy/Herbal, Pungent, Earthy]",sativa,"[Euphoric, Happy, Creative, Energetic, Talkative]",[Dry Mouth],"[Depression, Pain, Stress, Lack of Appetite, N..."
2,Afternoon Delight,"[Pepper, Flowery, Pine]",hybrid,"[Relaxed, Hungry, Euphoric, Uplifted, Tingly]","[Dizzy, Dry Mouth, Paranoid]","[Depression, Insomnia, Pain, Stress, Cramps, H..."
3,Afwreck,"[Pine, Earthy, Flowery]",hybrid,"[Relaxed, Happy, Creative, Uplifted, Sleepy]","[Dizzy, Dry Mouth, Paranoid, Dry Eyes]","[Pain, Stress, Headache, Fatigue, Headaches, M..."
4,Agent Orange,"[Citrus, Orange, Sweet]",hybrid,"[Relaxed, Euphoric, Happy, Energetic, Uplifted]","[Dizzy, Dry Mouth, Paranoid, Dry Eyes]","[Depression, Pain, Stress, Nausea, Headache, H..."


In [210]:
len(df3)

1970

In [212]:
# List for dicts for easy dataframe creation
dict_list1 = []
# iterating over df with more strains
for name in df.strain:
    # Use our method to find best match, we can set a threshold here
    match = match_name(name, df3.name, 90)
    
    # New dict for storing data
    dict_ = {}
    dict_.update({"strain" : name})
    dict_.update({"name" : match[0]})
    dict_list1.append(dict_)
    
merge_table1 = pd.DataFrame(dict_list)
# Display results
merge_table1

Unnamed: 0,effects,justnames,score
0,100 Og,,-1
1,98 White Widow,,-1
2,1024,1024,100
3,13 Dawgs,13 Dawgs,100
4,24K Gold,24K Gold,100
5,3 Bears Og,,-1
6,3 Kings,3 Kings,100
7,303 Og,,-1
8,3D Cbd,,-1
9,3X Crazy,3X Crazy,100


In [233]:
merge_table1.head()

Unnamed: 0,effects,justnames,score
0,100 Og,,-1
1,98 White Widow,,-1
2,1024,1024,100
3,13 Dawgs,13 Dawgs,100
4,24K Gold,24K Gold,100


In [239]:
merge_table1[merge_table1['justnames'] != '']

Unnamed: 0,effects,justnames,score
2,1024,1024,100
3,13 Dawgs,13 Dawgs,100
4,24K Gold,24K Gold,100
6,3 Kings,3 Kings,100
9,3X Crazy,3X Crazy,100
13,707 Headband,707 Headband,100
14,8 Ball Kush,8 Ball Kush,100
17,9 Pound Hammer,9 Pound Hammer,100
18,91 Krypt,91 Krypt,100
19,A 10,A 10,100


In [52]:
df3 = df3.rename(columns={'name':'names'})
df3.head()

Unnamed: 0,names,flavors,race,positive,negative,medical
0,Afpak,"[Earthy, Chemical, Pine]",hybrid,"[Relaxed, Hungry, Happy, Sleepy]",[Dizzy],"[Depression, Insomnia, Pain, Stress, Lack of A..."
1,African,"[Spicy/Herbal, Pungent, Earthy]",sativa,"[Euphoric, Happy, Creative, Energetic, Talkative]",[Dry Mouth],"[Depression, Pain, Stress, Lack of Appetite, N..."
2,Afternoon Delight,"[Pepper, Flowery, Pine]",hybrid,"[Relaxed, Hungry, Euphoric, Uplifted, Tingly]","[Dizzy, Dry Mouth, Paranoid]","[Depression, Insomnia, Pain, Stress, Cramps, H..."
3,Afwreck,"[Pine, Earthy, Flowery]",hybrid,"[Relaxed, Happy, Creative, Uplifted, Sleepy]","[Dizzy, Dry Mouth, Paranoid, Dry Eyes]","[Pain, Stress, Headache, Fatigue, Headaches, M..."
4,Agent Orange,"[Citrus, Orange, Sweet]",hybrid,"[Relaxed, Euphoric, Happy, Energetic, Uplifted]","[Dizzy, Dry Mouth, Paranoid, Dry Eyes]","[Depression, Pain, Stress, Nausea, Headache, H..."


In [227]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1970 entries, 0 to 1969
Data columns (total 6 columns):
name        1970 non-null object
flavors     1970 non-null object
race        1970 non-null object
positive    1970 non-null object
negative    1970 non-null object
medical     1970 non-null object
dtypes: object(6)
memory usage: 92.4+ KB


In [242]:
#Creating Effects & Types DataFrame
names=list(data.keys())
positive,negative,medical,posi,neg,med,eff,tye=[],[],[],[],[],[],[],[]
for n in names:
    tye.append(data[n]['race'])
    eff.append(data[n]['effects']['positive']+data[n]['effects']['negative']+data[n]['effects']['medical'])
    positive.append(data[n]['effects']['positive'])
    posi.append(len(data[n]['effects']['positive']))
    negative.append(data[n]['effects']['negative'])
    neg.append(len(data[n]['effects']['negative']))
    medical.append(data[n]['effects']['medical'])
    med.append(len(data[n]['effects']['medical']))
dic={'name':names,'type':tye,'effects':eff}
df99=pd.DataFrame(dic)

In [269]:
df99.head()

Unnamed: 0,name,type,effects
0,Afpak,hybrid,"[Relaxed, Hungry, Happy, Sleepy, Dizzy, Depres..."
1,African,sativa,"[Euphoric, Happy, Creative, Energetic, Talkati..."
2,Afternoon Delight,hybrid,"[Relaxed, Hungry, Euphoric, Uplifted, Tingly, ..."
3,Afwreck,hybrid,"[Relaxed, Happy, Creative, Uplifted, Sleepy, D..."
4,Agent Orange,hybrid,"[Relaxed, Euphoric, Happy, Energetic, Uplifted..."


In [271]:
len(df99.effects)

1970

In [257]:
#Cleaning up df, creating dummy columns for all effects
effects=[]
pos=list(eff)
for p in pos:
    for i in p:
        if i not in effects:
            effects.append(i)

# #Get dummies for type (indica=0,sativa=1,hybrid=2)
# #Engineer features for positive effect score, negative effect score, and medical effect score
# df.drop(columns='effects',inplace=True)
# df.type=df.type.map({'indica':0,'sativa':1,'hybrid':2})
# lowers=[]
# for n in df['name']:
#     lowers.append(n.lower())
# df['name']=lowers
# df['positive']=posi
# df['negative']=neg
# df['medical']=med

In [270]:
for i in effects:
    title=str(i)
    title=[]
    for x in df99.effects:
        if i in x:
            title.append(1)
        else:
            title.append(0)
    df[i]=title

ValueError: Length of values does not match length of index

### Evaluations