## Imports

In [44]:
# import libraries and packages for scraping, classification, and visualization

from bs4 import BeautifulSoup
from collections import Counter
from imblearn.over_sampling import SMOTE 
from matplotlib_venn import venn2
from scipy import stats
from sklearn import tree, svm
from sklearn.decomposition import PCA
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier
# from sklearn.externals.six import StringIO 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, accuracy_score 
from sklearn.metrics import auc, confusion_matrix, f1_score, roc_curve
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm_notebook as tqdm

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import requests
import seaborn as sns
import warnings

pd.set_option('display.max_columns', 500)

# Leafly Scraping

In [48]:
# webscrape 115 pages of strains from Leafly using BeautifulSoup

# create empty lists for storing the data
names,props,links=[],[],[]

# scrape data from all 115 pages of Leafly
for i in tqdm(range(1,116)):
    response=requests.get('https://www.leafly.com/strains?page={}'.format(i))
    soup=BeautifulSoup(response.content,'html.parser')
    
    # use corresponding tags for names and properties
    name=soup.findAll('div',{'class':'strain-tile__name'})
    prop=soup.findAll('span',{'class':'tag mb-md'})
    link = soup.findAll('a',{'class':'strain-tile justify-start relative'})
    
    for n,p,l in zip(name,prop,link):
        names.append(n.text)
        props.append(n.text+': ' +p.text)
        links.append('https://www.leafly.com'+l['href'])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdm(range(1,116)):


HBox(children=(FloatProgress(value=0.0, max=115.0), HTML(value='')))




In [49]:
# ensure each list is of same length
len(names), len(props), len(links)

(1540, 1540, 1540)

In [134]:
# scrape data from individual strain pages

percents = []
for i in links[:10]:
    response=requests.get(i)
    soup=BeautifulSoup(response.content,'html.parser')
    
    # thc percentage
    for b in soup.find_all('div',{'class':'font-body'}):
        percents.append(b.contents[0])

In [135]:
percents

['20',
 'This info is sourced from our readers and is not a substitute for professional medical advice. Seek the advice of a health professional before using cannabis for a medical condition.',
 '18.5',
 'This info is sourced from our readers and is not a substitute for professional medical advice. Seek the advice of a health professional before using cannabis for a medical condition.',
 '21.5',
 'This info is sourced from our readers and is not a substitute for professional medical advice. Seek the advice of a health professional before using cannabis for a medical condition.',
 '17',
 'This info is sourced from our readers and is not a substitute for professional medical advice. Seek the advice of a health professional before using cannabis for a medical condition.',
 '19',
 'This info is sourced from our readers and is not a substitute for professional medical advice. Seek the advice of a health professional before using cannabis for a medical condition.',
 '18.5',
 'This info is so

In [123]:
links[:10]

['https://www.leafly.com/strains/original-glue',
 'https://www.leafly.com/strains/blue-dream',
 'https://www.leafly.com/strains/wedding-cake',
 'https://www.leafly.com/strains/gelato',
 'https://www.leafly.com/strains/gsc',
 'https://www.leafly.com/strains/sour-diesel',
 'https://www.leafly.com/strains/purple-punch',
 'https://www.leafly.com/strains/og-kush',
 'https://www.leafly.com/strains/jack-herer',
 'https://www.leafly.com/strains/sunset-sherbert']

In [119]:
# retrieve every other element in list to filter out non-numerical entities
thc_p = []
for i in percents:
    if len(i) < 4:
        thc_p.append(i)

In [120]:
len(percents), len(thc_p)

(1681, 419)

In [122]:
thc_p

['20',
 '17',
 '19',
 '18',
 '20',
 '19',
 '17',
 '20',
 '17',
 '23',
 '1',
 '19',
 '16',
 '18',
 '1',
 '19',
 '1',
 '24',
 '19',
 '19',
 '17',
 '17',
 '1',
 '17',
 '1.5',
 '21',
 '18',
 '17',
 '19',
 '1.5',
 '19',
 '18',
 '17',
 '17',
 '19',
 '19',
 '16',
 '1',
 '17',
 '18',
 '1',
 '19',
 '17',
 '19',
 '15',
 '18',
 '1',
 '18',
 '1',
 '16',
 '20',
 '19',
 '18',
 '1',
 '16',
 '19',
 '17',
 '18',
 '21',
 '16',
 '5',
 '9',
 '17',
 '1',
 '17',
 '20',
 '17',
 '20',
 '15',
 '16',
 '15',
 '22',
 '18',
 '17',
 '17',
 '14',
 '16',
 '17',
 '18',
 '18',
 '15',
 '16',
 '20',
 '1',
 '18',
 '20',
 '19',
 '1',
 '19',
 '18',
 '17',
 '1',
 '24',
 '1',
 '17',
 '16',
 '18',
 '1',
 '19',
 '16',
 '18',
 '16',
 '20',
 '22',
 '14',
 '14',
 '20',
 '18',
 '1',
 '18',
 '16',
 '16',
 '16',
 '21',
 '20',
 '1',
 '20',
 '21',
 '22',
 '16',
 '1',
 '17',
 '18',
 '18',
 '18',
 '17',
 '18',
 '17',
 '19',
 '18',
 '24',
 '1',
 '13',
 '15',
 '18',
 '19',
 '17',
 '19',
 '21',
 '1.5',
 '16',
 '19',
 '18',
 '1',
 '22',
 '18