## Necessary Imports

In [21]:
from bs4 import BeautifulSoup
from collections import Counter
from imblearn.over_sampling import SMOTE 
from matplotlib_venn import venn2
from scipy import stats
from sklearn import tree, svm
from sklearn.decomposition import PCA
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.externals.six import StringIO 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, accuracy_score 
from sklearn.metrics import auc, confusion_matrix, f1_score, roc_curve
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm_notebook as tqdm

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import requests
import seaborn as sns
import warnings

# WikiLeaf Scraping

In [69]:
# Collect strain names and races from the WikiLeaf listing pages using
# BeautifulSoup (2249 strains).

# accumulators for the scraped text, filled in page order
names, races = [], []

# the listing spans 71 pages, numbered 1 through 71
for page_num in tqdm(range(1, 72)):
    page = requests.get('https://www.wikileaf.com/strains/?page={}'.format(page_num))
    parsed = BeautifulSoup(page.content, 'html.parser')

    # names are looked up as <h5> tags with class 'name disp-title',
    # races as <p> tags with class 'tag'
    name_tags = parsed.find_all('h5', {'class': 'name disp-title'})
    race_tags = parsed.find_all('p', {'class': 'tag'})

    # tags are paired by position; zip stops at the shorter of the two lists
    for name_tag, race_tag in zip(name_tags, race_tags):
        names.append(name_tag.text)
        races.append(race_tag.text)

HBox(children=(IntProgress(value=0, max=71), HTML(value='')))




In [146]:
# gather the href of every anchor found on the WikiLeaf listing pages

links = []
for page_num in tqdm(range(1, 72)):
    page = requests.get('https://www.wikileaf.com/strains/?page={}'.format(page_num))
    parsed = BeautifulSoup(page.content, 'html.parser')

    for anchor in parsed.find_all('a', href=True):
        # the href comes back wrapped in a list, so `links` is a list of lists
        href_values = anchor.get_attribute_list('href')
        # skip anything already recorded so every entry appears only once
        if href_values in links:
            continue
        links.append(href_values)

HBox(children=(IntProgress(value=0, max=71), HTML(value='')))




In [143]:
# keep only the hrefs that point at a single strain page and turn them
# into absolute WikiLeaf urls

strain_links = [
    'https://www.wikileaf.com' + href
    for href_values in links
    for href in href_values
    if '/strain/' in href
]

In [151]:
# visit every individual strain page and print its effect / usage labels
# (the labels are only printed here, not stored)

for strain_url in tqdm(strain_links):
    page = requests.get(strain_url)
    parsed = BeautifulSoup(page.content, 'html.parser')

    # labels are looked up as <div> tags with class 'effect-label'
    effect_divs = parsed.find_all('div', {'class': 'effect-label'})
    for effect_div in effect_divs:
        print(effect_div.contents)

HBox(children=(IntProgress(value=0, max=2249), HTML(value='')))

['Relaxed']
['Sleepy']
['Cotton Mouth']
['Depressed']
['A Loss of Appetite']
['Anxiety']
['Focused']
['Paranoia']
['Creative']
['Depressed']
['A Loss of Appetite']
['Anxiety']
['Paranoia']
['Cotton Mouth']
['Creative']
['Depressed']
['A Loss of Appetite']
['Anxiety']
['Paranoia']
['Cotton Mouth']
['Creative']
['Depressed']
['A Loss of Appetite']
['Anxiety']
['Paranoia']
['Cotton Mouth']
['Creative']
['Depressed']
['A Loss of Appetite']
['Anxiety']
['Focused']
['Cotton Mouth']
['Creative']
['Depressed']
['A Loss of Appetite']
['Pain']
['Relaxed']
['Cotton Mouth']
['Creative']
['Depressed']
['A Loss of Appetite']
['Anxiety']


KeyboardInterrupt: 

In [175]:
# collect the THC content figures shown on each individual strain page

contents = []
for strain_url in tqdm(strain_links):
    page = requests.get(strain_url)
    parsed = BeautifulSoup(page.content, 'html.parser')

    # keep the text of every <small> tag that contains a percent sign
    contents.extend(
        small_tag.text
        for small_tag in parsed.find_all('small')
        if '%' in small_tag.text
    )

HBox(children=(IntProgress(value=0, max=2249), HTML(value='')))

In [176]:
# Group the flat `contents` list into consecutive sublists of 5 values.
# Each strain has 5 THC values associated with it, in this order:
# highest test, strain avg., race avg., wikileaf avg., wikileaf highest

strain_contents = []
for start in tqdm(range(0, len(contents), 5)):
    # slicing past the end is safe: a trailing partial group is simply shorter
    strain_contents.append(contents[start:start + 5])

HBox(children=(IntProgress(value=0, max=2249), HTML(value='')))

In [190]:
# build a dataframe from the grouped THC values, one row per group of 5

thc_columns = [
    'thc_content',
    'strain_avg',
    'race_avg',
    'wikileaf_avg',
    'wikileaf_high',
]
df = pd.DataFrame(data=strain_contents, columns=thc_columns)
df.head()

Unnamed: 0,thc_content,strain_avg,race_avg,wikileaf_avg,wikileaf_high
0,20%,17%,12.5%,18%,35%
1,22%,18.5%,13.5%,18%,35%
2,24%,20.5%,13.5%,18%,35%
3,17%,15.5%,13.5%,18%,35%
4,17%,15%,13.5%,18%,35%


# Leafly Scraping

In [196]:
# listing url: https://www.leafly.com/strains?sort=name&page=1

# collect strain names from the 114 Leafly listing pages using BeautifulSoup

# accumulator for the scraped names
names1 = []

# the Leafly listing pages are numbered 1 through 114
for page_num in tqdm(range(1, 115)):
    page = requests.get('https://www.leafly.com/strains?sort=name&page={}'.format(page_num))
    parsed = BeautifulSoup(page.content, 'html.parser')

    # names are looked up as <div> tags with class 'strain-tile__name'
    name_tiles = parsed.find_all('div', {'class': 'strain-tile__name'})
    names1.extend(tile.text for tile in name_tiles)

HBox(children=(IntProgress(value=0, max=114), HTML(value='')))

In [199]:
# scrape the href of every anchor found on the 114 Leafly listing pages

links1 = []
for i in tqdm(range(1,115)):
    response=requests.get('https://www.leafly.com/strains?sort=name&page={}'.format(i))
    soup=BeautifulSoup(response.content,'html.parser')
    
    for b in soup.find_all('a',href=True):
        # the href comes back wrapped in a list, so links1 is a list of lists
        href_values = b.get_attribute_list('href')
        # eliminate duplicate entries; the membership test has to run against
        # links1 (the list being built here), not the WikiLeaf `links` list,
        # otherwise repeated Leafly hrefs are never filtered out
        if href_values not in links1:
            links1.append(href_values)

HBox(children=(IntProgress(value=0, max=114), HTML(value='')))

In [234]:
# filter links1 down to the urls of singular strains and make them absolute
# Leafly urls (originally noted as 3411 urls -- TODO confirm after a re-run)

strain_links2 = [
    'https://www.leafly.com' + href
    for href_values in links1   # each entry of links1 is a list of href strings
    for href in href_values
    if '/strains/' in href
]

In [None]:
# collect the THC content figures shown on each individual Leafly strain page
# note: this rebinds `contents`, replacing the list built from the WikiLeaf pages

contents = []
for strain_url in tqdm(strain_links2):
    page = requests.get(strain_url)
    parsed = BeautifulSoup(page.content, 'html.parser')

    # keep the text of every <small> tag that contains a percent sign
    contents.extend(
        small_tag.text
        for small_tag in parsed.find_all('small')
        if '%' in small_tag.text
    )

# CannaConnection Scraping

In [243]:
import string

# the lowercase ASCII letters as a list, one element per letter
abc0 = [letter for letter in string.ascii_lowercase]

In [256]:
# listing url: https://www.cannaconnection.com/strains?show_char=a

# collect strain names from the CannaConnection listing pages using BeautifulSoup

# accumulator: one list of text lines per matched <ul>
names2 = []

# the listing is requested once per letter in abc0 via the show_char parameter
for letter in tqdm(abc0):
    page = requests.get('https://www.cannaconnection.com/strains?show_char={}'.format(letter))
    parsed = BeautifulSoup(page.content, 'html.parser')

    # names are looked up inside <ul> tags with class 'strains-list'
    strain_lists = parsed.find_all('ul', {'class': 'strains-list'})

    # split each list's text on newlines and store the resulting pieces together
    names2.extend(strain_list.text.split('\n') for strain_list in strain_lists)



HBox(children=(IntProgress(value=0, max=26), HTML(value='')))

In [257]:
# number of sublists collected in names2 (one is appended per matched <ul>)
len(names2)

26

In [259]:
# Flatten the sublists in names2 into a single list of strain names.
# Pieces of two characters or fewer are dropped (presumably blank fragments
# left over from splitting the list text on newlines -- TODO confirm).
# A comprehension is used so that no loop variable is left behind: iterating
# with a variable called `names` would overwrite the notebook-level `names`
# list that holds the WikiLeaf strain names.
all_names = [
    entry
    for list_entries in names2
    for entry in list_entries
    if len(entry) > 2
]

In [268]:
# gather the href of every anchor found on the CannaConnection listing pages

links3 = []
for letter in tqdm(abc0):
    page = requests.get('https://www.cannaconnection.com/strains?show_char={}'.format(letter))
    parsed = BeautifulSoup(page.content, 'html.parser')

    for anchor in parsed.find_all('a', href=True):
        # the href comes back wrapped in a list, so `links3` is a list of lists
        href_values = anchor.get_attribute_list('href')
        # skip anything already recorded so every entry appears only once
        if href_values in links3:
            continue
        links3.append(href_values)

HBox(children=(IntProgress(value=0, max=26), HTML(value='')))

In [267]:
# display the collected href lists (each entry is a one-element list)
links3

[['https://www.cannaconnection.com/'],
 ['/strains'],
 ['https://www.cannaconnection.com/strains'],
 ['https://www.cannaconnection.com/strains?show_char=a'],
 ['https://www.cannaconnection.com/strains?show_char=b'],
 ['https://www.cannaconnection.com/strains?show_char=c'],
 ['https://www.cannaconnection.com/strains?show_char=d'],
 ['https://www.cannaconnection.com/strains?show_char=e'],
 ['https://www.cannaconnection.com/strains?show_char=f'],
 ['https://www.cannaconnection.com/strains?show_char=g'],
 ['https://www.cannaconnection.com/strains?show_char=h'],
 ['https://www.cannaconnection.com/strains?show_char=i'],
 ['https://www.cannaconnection.com/strains?show_char=j'],
 ['https://www.cannaconnection.com/strains?show_char=k'],
 ['https://www.cannaconnection.com/strains?show_char=l'],
 ['https://www.cannaconnection.com/strains?show_char=m'],
 ['https://www.cannaconnection.com/strains?show_char=n'],
 ['https://www.cannaconnection.com/strains?show_char=o'],
 ['https://www.cannaconnection