## Necessary Imports

In [21]:
from bs4 import BeautifulSoup
from collections import Counter
from imblearn.over_sampling import SMOTE 
from matplotlib_venn import venn2
from scipy import stats
from sklearn import tree, svm
from sklearn.decomposition import PCA
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.externals.six import StringIO 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, accuracy_score 
from sklearn.metrics import auc, confusion_matrix, f1_score, roc_curve
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm_notebook as tqdm

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import requests
import seaborn as sns
import warnings

# WikiLeaf Scraping

In [69]:
# webscrape 2249 strains from WikiLeaf using BeautifulSoup

# create empty lists for storing the data
names,races=[],[]

# scrape data from all 71 pages of WikiLeaf
for i in tqdm(range(1,72)):
    response=requests.get('https://www.wikileaf.com/strains/?page={}'.format(i))
    soup=BeautifulSoup(response.content,'html.parser')
    
    # use corresponding tags for names and races
    name=soup.findAll('h5',{'class':'name disp-title'})
    race=soup.findAll('p',{'class':'tag'})
   
    
    for n,r in zip(name,race):
        names.append(n.text)
        races.append(r.text)

HBox(children=(IntProgress(value=0, max=71), HTML(value='')))




In [146]:
# scrape the urls for each strain

links = []
for i in tqdm(range(1,72)):
    response=requests.get('https://www.wikileaf.com/strains/?page={}'.format(i))
    soup=BeautifulSoup(response.content,'html.parser')
    
    for b in soup.find_all('a',href=True):
        # eliminate duplicate entries 
        if b.get_attribute_list('href') not in links:
            links.append(b.get_attribute_list('href'))

HBox(children=(IntProgress(value=0, max=71), HTML(value='')))




In [143]:
# append links for singular strain to list

strain_links = []

for lin in links:
    for l in lin:
        if '/strain/' in l:
            strain_links.append('https://www.wikileaf.com'+l)

In [151]:
# scrape effects and usage data from each individual strain

for i in tqdm(strain_links):
    response=requests.get(i)
    soup=BeautifulSoup(response.content,'html.parser')
    
    for b in soup.find_all('div',{'class':'effect-label'}):
        print(b.contents)

HBox(children=(IntProgress(value=0, max=2249), HTML(value='')))

['Relaxed']
['Sleepy']
['Cotton Mouth']
['Depressed']
['A Loss of Appetite']
['Anxiety']
['Focused']
['Paranoia']
['Creative']
['Depressed']
['A Loss of Appetite']
['Anxiety']
['Paranoia']
['Cotton Mouth']
['Creative']
['Depressed']
['A Loss of Appetite']
['Anxiety']
['Paranoia']
['Cotton Mouth']
['Creative']
['Depressed']
['A Loss of Appetite']
['Anxiety']
['Paranoia']
['Cotton Mouth']
['Creative']
['Depressed']
['A Loss of Appetite']
['Anxiety']
['Focused']
['Cotton Mouth']
['Creative']
['Depressed']
['A Loss of Appetite']
['Pain']
['Relaxed']
['Cotton Mouth']
['Creative']
['Depressed']
['A Loss of Appetite']
['Anxiety']


KeyboardInterrupt: 

In [163]:
# scrape THC content data from each individual strain

for i in tqdm(strain_links):
    response=requests.get(i)
    soup=BeautifulSoup(response.content,'html.parser')
    
    for b in soup.find_all('small'):
        print(b.text)

HBox(children=(IntProgress(value=0, max=2249), HTML(value='')))

6/10
4/10
4/10
10/10
10/10
6/10
8/10
4/10
10/10
6/10
20%
17%
12.5%
18%
35%
10/10
6/10
8/10
2/10
6/10
2/10
2/10
8/10
10/10
8/10
22%
18.5%
13.5%
18%
35%
10/10
2/10
6/10
2/10
8/10
2/10
4/10
6/10
10/10
8/10
24%
20.5%
13.5%
18%
35%
10/10
4/10
8/10
2/10
4/10
2/10
6/10
10/10
8/10
8/10
17%
15.5%
13.5%
18%
35%
6/10
4/10
8/10
6/10
10/10
4/10
2/10
6/10
4/10
10/10
17%
15%
13.5%
18%
35%
8/10
2/10
2/10
6/10
4/10
6/10
8/10
6/10
10/10
25%
21%
13.5%
18%
35%
8/10
6/10
10/10
8/10
6/10
6/10
2/10
8/10
6/10
10/10
27%
24%
13%
18%
35%
10/10
2/10
8/10
8/10
4/10
6/10
6/10
10/10
6/10
4/10
22%
18.5%
13.5%
18%
35%
6/10
2/10
8/10
10/10
2/10
4/10
2/10
6/10
4/10
10/10
20%
16%
13.5%
18%
35%
6/10
8/10
4/10
10/10
8/10
2/10
2/10
8/10
4/10
10/10
22%
18.5%
13.5%
18%
35%
2/10
8/10
4/10
4/10
10/10
2/10
18%
16%
13.5%
18%
35%
8/10
6/10
10/10
8/10
4/10
2/10
4/10
6/10
8/10
10/10
24%
20.5%
13.5%
18%
35%
4/10
6/10
4/10
4/10
4/10
10/10
22%
18.5%
13%
18%
35%
8/10
4/10
6/10
10/10
4/10
4/10
6/10
2/10
6/10
10/10
22%
19%
12.5%
18%
35%
4

KeyboardInterrupt: 