**Importing Required Libraries**

In [None]:
%%capture
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()

import config
import mysql.connector
from mysql.connector import errorcode

import requests
import pydotplus
import itertools
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from xgboost import plot_tree
from xgboost.sklearn import XGBClassifier

from scipy.sparse import csr_matrix
from imblearn.over_sampling import SMOTE

from time import time
import xgboost as xgb
from sklearn.svm import SVC  
from sklearn import tree, metrics, svm, datasets
from IPython.display import Image
from sklearn.externals.six import StringIO
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder, label_binarize
from sklearn.tree import export_graphviz, DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, mean_squared_error, average_precision_score, precision_score, f1_score
from sklearn.decomposition import PCA

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Collecting Relevant Data

## Webscraping

In [None]:
# Scrape whatbird.com: follow every link in the attribute browse table (except
# the first) and, for each bird listed behind it, record the bird's name, the
# bullet points on its "Identification" tab, and its family.
driver = webdriver.Chrome()
try:
    driver.get('https://www.whatbird.com/browse/attributes.aspx')
    location_table = driver.find_element(By.CLASS_NAME, 'BrowseTable')
    links = location_table.find_elements(By.CSS_SELECTOR, "a")

    bird_families = []
    bird_chars = []
    bird_link_xpath = "//*[contains(@class,'ObjectLink')]"
    for link_idx in tqdm(range(1, len(links))):
        # Elements are looked up again on every pass instead of reusing `links`;
        # presumably the earlier handles go stale after navigation — confirm.
        locale = driver.find_element(By.CLASS_NAME, 'BrowseTable').find_elements(By.CSS_SELECTOR, "a")[link_idx]
        locale.click()
        for bird_idx in tqdm(range(len(driver.find_elements(By.XPATH, bird_link_xpath)))):
            location_page = driver.current_url
            driver.find_elements(By.XPATH, bird_link_xpath)[bird_idx].click()
            driver.find_element(By.LINK_TEXT, 'Identification').click()
            bird_chars_table = driver.find_element(By.CLASS_NAME, 'contentbg')
            char_list = bird_chars_table.find_elements(By.CSS_SELECTOR, 'li')
            # Record layout: [bird name, trait text, trait text, ...]
            bird_text = [driver.find_element(By.ID, 'BirdName').text]
            bird_text.extend(item.text for item in char_list)

            bird_chars.append(bird_text)
            bird_families.append(driver.find_element(By.ID, "Family").text)
            # Go back to the listing before opening the next bird.
            driver.get(location_page)

        driver.get('https://www.whatbird.com/browse/attributes.aspx')
finally:
    # Always release the browser, even when the scrape fails part-way.
    driver.quit()

## Combining Scraped Data and Removing Duplicates

In [None]:
# Order-preserving de-duplication of the scraped records (lists are not
# hashable, so membership is checked against the result list itself).
birds = []
for record in bird_chars:
    if record in birds:
        continue
    birds.append(record)

## Connecting to MySQL/Creating a Database

In [None]:
# Connect to the MySQL server without selecting a schema; the credentials are
# read from the local `config` module.
db_name = 'Bird_Classifier_Project'
cnx = mysql.connector.connect(host=config.host, user=config.user, passwd=config.pw)
cur = cnx.cursor()

In [None]:
def create_database(cursor, database):
    """Create `database` with utf8 as its default character set.

    Args:
        cursor: Open cursor used to execute the statement.
        database: Name of the database to create. It is interpolated into the
            statement verbatim, so it must be a trusted identifier.

    On a MySQL error the failure is printed and the process exits with
    status 1.
    """
    try:
        # Execute on the cursor that was passed in, not on the notebook-global `cur`.
        cursor.execute(
            "CREATE DATABASE {} DEFAULT CHARACTER SET 'utf8'".format(database))
    except mysql.connector.Error as err:
        print("Failed creating database: {}".format(err))
        exit(1)

# Select the project database; create it first when the server reports that it
# is missing. Any other error is printed and aborts the run.
try:
    cur.execute("USE {}".format(db_name))
except mysql.connector.Error as err:
    if err.errno == errorcode.ER_BAD_DB_ERROR:
        # Only this error code means the database is absent, so the message is
        # printed here rather than for every failure.
        print("Database {} does not exist.".format(db_name))
        create_database(cur, db_name)
        print("Database {} created successfully.".format(db_name))
        cnx.database = db_name
    else:
        print(err)
        exit(1)

In [None]:
# Create the `birds` table: Family, Name and 25 trait columns, all stored as
# VARCHAR(255) text.
# NOTE(review): the statement has no IF NOT EXISTS clause, so re-running this
# cell against an existing table presumably raises — confirm.
cur.execute("""
CREATE TABLE birds (
Family VARCHAR(255),
Name VARCHAR(255),
Bill_Shape VARCHAR(255),
Eye_Color VARCHAR(255),
Head_Pattern VARCHAR(255),
Crown_Color VARCHAR(255),
Forehead_Color VARCHAR(255),
Nape_Color VARCHAR(255),
Throat_Color VARCHAR(255),
Cere_Color VARCHAR(255),
Length_Range VARCHAR(255),
Weight VARCHAR(255),
Size VARCHAR(255),
Primary_Color VARCHAR(255),
Underparts VARCHAR(255),
Upperparts VARCHAR(255),
Back_Pattern VARCHAR(255),
Belly_Pattern VARCHAR(255),
Breast_Pattern VARCHAR(255),
Flight_Pattern VARCHAR(255),
Wingspan VARCHAR(255),
Wing_Shape VARCHAR(255),
Tail_Shape VARCHAR(255),
Tail_Pattern VARCHAR(255),
Upper_Tail VARCHAR(255),
Under_Tail VARCHAR(255),
Leg_Color VARCHAR(255)
)
""")

In [None]:
# Pair every scraped record with a one-character marker taken from its OWN
# name (the loop previously read the marker from bird_chars[1] for every row).
# The slice [:1] tolerates an empty name. The marker is removed again further
# down in the notebook, so it only takes part in the duplicate check.
cleaner_list = [(record[0][:1], record[1:]) for record in bird_chars]

In [None]:
# Flatten each (marker, traits) pair into a single list: [marker, trait, ...].
# Fixes the misspelled `clieaner_list`, which raised NameError.
nested_lists = [[marker] + traits for marker, traits in cleaner_list]

In [None]:
# Collapse runs of identical consecutive records (groupby only merges
# neighbours, it does not de-duplicate globally).
unique_birds = [record for record, _run in itertools.groupby(nested_lists)]

In [None]:
# Remove the leading element of every record in place.
for record in unique_birds:
    record.pop(0)

In [None]:
# Build the feature table from the de-duplicated records and attach the
# scraped family labels as a new column.
# NOTE(review): `bird_families` gets one entry per scraped bird, whereas
# `unique_birds` has had consecutive duplicates removed, so the two lengths may
# differ and the assignment would then fail — confirm.
df = pd.DataFrame(unique_birds)
df['Family'] = bird_families

In [None]:
df.to_csv('bird_dataset.csv')

In [None]:
# Keep the first row for each value of column "0" (the column later inserted
# as `Name`). The frame is built with integer column labels, so they are cast
# to strings first; without that the string label "0" is not found.
df.columns = df.columns.astype(str)
df.drop_duplicates(subset=["0"], keep='first', inplace=True)

In [None]:
# Arrange the columns in the order of the INSERT statement (Family first, then
# "0".."25") and turn every row into a plain tuple.
df.columns = df.columns.astype(str)
ordered_columns = ['Family'] + [str(position) for position in range(26)]
subset = df[ordered_columns]
tuples = list(map(tuple, subset.values))

In [None]:
# Reconnect, this time with the project database selected as the default schema.
cnx = mysql.connector.connect(host=config.host, user=config.user,
                              passwd=config.pw, database=db_name)
cur = cnx.cursor()

In [None]:
# Parameterised INSERT, built once and reused for every row.
stmt = """INSERT INTO birds (Family, Name, Bill_Shape, Eye_Color, Head_Pattern, Crown_Color, Forehead_Color,
        Nape_Color, Throat_Color, Cere_Color, Length_Range, Weight, Size, Primary_Color, Underparts, Upperparts, 
        Back_Pattern, Belly_Pattern, Breast_Pattern, Flight_Pattern, Wingspan, Wing_Shape, Tail_Shape, Tail_Pattern,
        Upper_Tail, Under_Tail, Leg_Color) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
        
        """
# Rows are inserted and committed one at a time; a row that MySQL rejects is
# reported together with its values and the loop moves on.
for row in tqdm(tuples):
    try:
        cur.execute(stmt, row)
        cnx.commit()
    except mysql.connector.Error as err:
        print(err, row)

In [None]:
# Load the whole `birds` table into a DataFrame, naming the columns after the
# cursor's result description. Passing the names to the constructor also works
# when the table is empty, where assigning `.columns` afterwards would fail.
cur.execute("""SELECT * FROM birds""")
rows = cur.fetchall()
column_names = [column[0] for column in cur.description]
df1 = pd.DataFrame(rows, columns=column_names)

## Cleaning Up Columns

In [None]:
# Replace each Family value with the text found after its first "(" and before
# the next ")". A value without "(" raises IndexError.
df1['Family'] = [
    family.split("(")[1].split(")")[0]
    for family in df1.Family
]

In [None]:
# Drop the first 11 characters of every Eye_Color value before the row filter
# that follows.
# NOTE(review): the same 11-character slice is applied to Eye_Color a second
# time further down in the notebook — confirm that is intended.
df1.Eye_Color = df1.Eye_Color.str[11:]

In [None]:
# Discard rows whose Bill_Shape or Eye_Color text contains "Eye Color".
# NOTE(review): presumably these are records whose scraped fields are shifted
# by one column — confirm.
df1 = df1[~df1.Bill_Shape.str.contains("Eye Color")]
df1 = df1[~df1.Eye_Color.str.contains("Eye Color")]

In [None]:
### Removing redundant column label in each row ###

# Number of leading characters cut from every value, per column.
label_widths = {
    'Bill_Shape': 12,
    'Eye_Color': 11,
    'Head_Pattern': 14,
    'Crown_Color': 13,
    'Forehead_Color': 16,
    'Nape_Color': 12,
    'Throat_Color': 14,
    'Cere_Color': 12,
    'Length_Range': 14,
    'Weight': 8,
    'Size': 5,
    'Primary_Color': 15,
    'Underparts': 12,
    'Upperparts': 12,
    'Back_Pattern': 14,
    'Belly_Pattern': 15,
    'Breast_Pattern': 16,
    'Flight_Pattern': 16,
    'Wingspan': 10,
    'Wing_Shape': 11,
    'Tail_Shape': 12,
    'Tail_Pattern': 14,
    'Upper_Tail': 12,
    'Under_Tail': 12,
    'Leg_Color': 11,
}
for column, width in label_widths.items():
    df1[column] = df1[column].str[width:]

In [None]:
df1 = df1.drop(columns=['Eye_Color'])

# Training a Vanilla Classifier (Decision Tree)

In [None]:
# One-hot encode every column except the identifiers and size/measurement
# columns; the target is the bird family.
non_predictors = ['Family', 'Name', 'Size', 'Length_Range', 'Weight', 'Wingspan']
X = pd.get_dummies(df1.drop(columns=non_predictors))
y = df1['Family']

# 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=10)

In [None]:
dtclass = DecisionTreeClassifier(criterion='entropy',random_state=10)
dtclass.fit(X_train,y_train)

In [None]:
y_preds = dtclass.predict(X_test)


In [None]:
# Calculate Accuracy 
acc = accuracy_score(y_test,y_preds)*100
print("Accuracy score is {:.4}%".format(acc))

**Initial Testing Score of 59.18%. Nice!**

In [None]:
def performance(y_true, y_predict):
    """Return the R2 score and the mean squared error of the predictions.

    Args:
        y_true: Ground-truth values.
        y_predict: Predicted values, aligned with `y_true`.

    Returns:
        Tuple `(r2, mse)` as computed by sklearn.metrics.
    """
    return (
        metrics.r2_score(y_true, y_predict),
        metrics.mean_squared_error(y_true, y_predict),
    )


# NOTE(review): both metrics are regression scores, while the values passed
# here come from the `Family` classification split — confirm this call is
# meaningful.
performance(y_test,y_preds)

# Bagged Tree

In [None]:
bagged_tree = BaggingClassifier(DecisionTreeClassifier(criterion='gini', max_depth=5,random_state=10),n_estimators=20,random_state=10)
bagged_tree.fit(X_train,y_train)

In [None]:
print(bagged_tree.score(X_train,y_train))
print(bagged_tree.score(X_test,y_test))

# Random Forest

## Vanilla Model

In [None]:
forest = RandomForestClassifier(n_estimators=100,max_depth=5,random_state=10)
forest.fit(X_train,y_train)

In [None]:
print(forest.score(X_train,y_train))
print(forest.score(X_test,y_test))

In [None]:
dt_cv_score = cross_val_score(dtclass,X,y,cv=3)
mean_dt_cv_score = np.mean(dt_cv_score)
print("Mean Cross Validation Score: {:.4}%".format(mean_dt_cv_score*100))

## Gridsearching Decision Tree and Random Forest

In [None]:
dt_param_grid = {
    "criterion": ["gini", "entropy"],
    "max_depth": [None, 2, 3, 4, 5, 6, 8, 10, 12, 14, 16 ,18, 20, 22, 24, 26, 28, 30, 32, 34],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf" : [1, 2, 3, 4, 5, 6]
}
dt_grid_search = GridSearchCV(dtclass,dt_param_grid,cv=3,return_train_score=True)

dt_grid_search.fit(X, y)

In [None]:
# Mean training score of every parameter combination tried by the search.
dt_gs_training_score = dt_grid_search.cv_results_['mean_train_score']
# best_score_ is the mean cross-validated score of the best combination,
# measured on held-out folds. Scoring the refit model on the same (X, y) it
# was fitted on would report training performance, not testing performance.
dt_gs_testing_score = dt_grid_search.best_score_

print("Mean Training Score: {:.4}%".format(np.mean(dt_gs_training_score) * 100))
print("Mean Testing Score: {:.4}%".format(np.mean(dt_gs_testing_score) * 100))
print("Best Parameter Combination Found During Grid Search:")
dt_grid_search.best_params_

In [None]:
forest_class = RandomForestClassifier()
dt_cv_score = cross_val_score(forest_class,X,y,cv=3)
mean_rf_cv_score = np.mean(dt_cv_score)
print("Mean Cross Validation Score for Random Forest Classifier: {:.4}%".format(mean_rf_cv_score * 100))

In [None]:
rf_param_grid = {
    "criterion": ["gini", "entropy"],
    "max_depth": [None, 2, 3, 4, 5, 6, 8, 10, 12, 14, 16 ,18, 20, 22, 24, 26, 28, 30, 32, 34],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf" : [1, 2, 3, 4, 5, 6],
    "n_estimators" : [10, 30, 100]
}

In [None]:
# `time` is imported as the function (`from time import time`), so it is
# called directly; `time.time()` raised AttributeError.
start = time()
rf_grid_search = GridSearchCV(forest_class,rf_param_grid,cv=3)
rf_grid_search.fit(X, y)

print("Testing Accuracy: {:.4}%".format(rf_grid_search.best_score_ * 100))
print("Total Runtime for Grid Search on Random Forest Classifier: {:.4} seconds".format(time() - start))
print("")
print("Optimal Parameters: {}".format(rf_grid_search.best_params_))

 **Up to 66.38%. Better!**

## Changed Target to Size and Combined 5 categories into 3

In [None]:
# Fold the two extreme size classes into their neighbours. Both assignments
# target the 'Size' column only; without the column selector the second one
# overwrote every column of the matching rows with the label text.
df1.loc[df1['Size'] == ' Very Small (3 - 5 in)', 'Size'] = ' Small (5 - 9 in)'
df1.loc[df1['Size'] == ' Very Large (32 - 72 in)', 'Size'] = ' Large (16 - 32 in)'

In [None]:
df1.Size.value_counts()

In [None]:
X = pd.get_dummies(df1.drop(columns=['Family','Name','Size','Length_Range','Weight','Wingspan']))
y = (df1['Size'])

X_train,X_test, y_train,y_test = train_test_split(X,y,test_size = 0.4,random_state=10)

In [None]:
forest = RandomForestClassifier(random_state=10)
forest.fit(X_train,y_train)

In [None]:
forest.score(X_test,y_test)

In [None]:
dt_cv_score = cross_val_score(forest,X,y,cv=3)
mean_dt_cv_score = np.mean(dt_cv_score)
print("Mean Cross Validation Score: {:.4}%".format(mean_dt_cv_score*100))

In [None]:
# Exhaustive 3-fold grid search over the random forest's tree-shape parameters.
forest_param_grid = {
    "criterion": ["gini", "entropy"],
    "max_depth": [None, 2, 3, 4, 5, 6, 8, 10, 12, 14, 16 ,18, 20, 22, 24, 26, 28, 30, 32, 34],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf" : [1, 2, 3, 4, 5, 6]
}

dt_grid_search = GridSearchCV(forest,forest_param_grid,cv=3,return_train_score=True)

dt_grid_search.fit(X, y)

dt_gs_training_score = dt_grid_search.cv_results_['mean_train_score']
# Report the cross-validated score of the best combination. Scoring the refit
# model on the same (X, y) it was fitted on would measure training accuracy.
dt_gs_testing_score = dt_grid_search.best_score_

print("Mean Training Score: {:.4}%".format(np.mean(dt_gs_training_score) * 100))
print("Mean Testing Score: {:.4}%".format(np.mean(dt_gs_testing_score) * 100))
print("Best Parameter Combination Found During Grid Search:")
dt_grid_search.best_params_

In [None]:
forest = RandomForestClassifier(random_state=10, max_depth=34, min_samples_leaf=1, min_samples_split=5, criterion='gini')
forest.fit(X_train,y_train)

In [None]:
dt_cv_score = cross_val_score(forest,X,y,cv=3)
mean_dt_cv_score = np.mean(dt_cv_score)
print("Mean Cross Validation Score: {:.4}%".format(mean_dt_cv_score*100))

In [None]:
y_pred = forest.predict(X_test)

In [None]:
# Take the class order from the fitted model and pass it to confusion_matrix
# so that rows, columns and tick labels agree. A hand-written
# ['Small', 'Medium', 'Large'] list does not match the sorted label order the
# matrix uses by default.
# NOTE: plot_confusion_matrix is defined in a later cell of this notebook and
# must have been executed before this one.
classes = list(forest.classes_)
cm = confusion_matrix(y_test, y_pred, labels=classes)
plot_confusion_matrix(cm, classes,
                      normalize=False,
                      title='Confusion matrix',
                      cmap=plt.cm.Blues)
print(precision_score(y_test, y_pred,average=None))

## Support Vector Classifier

In [None]:
tic = time()
svclassifier = SVC(kernel='rbf', C=1)  
svclassifier.fit(X_train, y_train) 
y_pred = svclassifier.predict(X_test)
toc = time()
print("run time is {} seconds".format(toc-tic))

In [None]:
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred)) 
print("The accuracy score is" + " "+ str(accuracy_score(y_test, y_pred)))

In [None]:
def svc_param_selection(X, y, nfolds):
    """Grid-search C and gamma for an RBF-kernel SVC.

    Args:
        X: Feature matrix.
        y: Target labels.
        nfolds: Number of cross-validation folds.

    Returns:
        Dict with the best `C` and `gamma` found by the search.
    """
    Cs = [0.001, 0.01, 0.1, 1, 10]
    gammas = [0.001, 0.01, 0.1, 1]
    param_grid = {'C': Cs, 'gamma' : gammas}
    grid_search = GridSearchCV(svm.SVC(kernel='rbf'), param_grid, cv=nfolds)
    grid_search.fit(X, y)
    return grid_search.best_params_

In [None]:
svc_param_selection(X, y, 5)

## Re-running SVC with Optimal Parameters

In [None]:

tic = time()
svclassifier = SVC(kernel='rbf', C=10, gamma=0.01)  
svclassifier.fit(X_train, y_train) 
y_pred = svclassifier.predict(X_test)
toc = time()
print("run time is {} seconds".format(toc-tic))

In [None]:
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred)) 
print("The accuracy score is" + " "+ str(accuracy_score(y_test, y_pred)))

## Using Synthetic Minority Over-sampling Technique (SMOTE) on Imbalanced Dataset

In [None]:
# Oversample the minority classes of the TRAINING split only.
# `fit_resample` replaces `fit_sample`, which imbalanced-learn has removed.
smt = SMOTE()
X_train, y_train = smt.fit_resample(X_train, y_train)

In [None]:
smote = SVC(kernel='rbf', C=10, gamma=0.01).fit(X_train,y_train)
smote_pred = smote.predict(X_test)

# Checking accuracy
accuracy_score(y_test, smote_pred)

In [None]:
print(confusion_matrix(y_test, smote_pred))
print(classification_report(y_test,smote_pred))

In [None]:
df1.to_csv('real_bird_dataset.csv')

## Gridsearched Random Forest Using Bag of Words

In [None]:
vectorizer = CountVectorizer()

X = vectorizer.fit_transform(df1.Flight_Pattern)

In [None]:
x_words = X.toarray()

In [None]:
x_words = pd.DataFrame(x_words)

In [None]:
x_words.reset_index(drop=True, inplace=True)
df1.reset_index(drop=True, inplace=True)

In [None]:
bow_X = pd.concat([df1,x_words], axis=1, ignore_index=True)

In [None]:
# Drop columns by position (the concat above used ignore_index=True, so the
# column labels are integers).
# NOTE(review): presumably positions 0, 1, 9, 10, 11 and 19 are Family, Name,
# Length_Range, Weight, Size and Wingspan — verify against df1's column order.
bow_X = bow_X.drop(columns=[0,1,9,10,11,19])

In [None]:
bow_with_features = pd.get_dummies(bow_X)
y = df1['Size']

In [None]:
X_train,X_test, y_train,y_test = train_test_split(bow_with_features,y,test_size=0.3)

In [None]:
forest = RandomForestClassifier(random_state=45)
forest.fit(X_train,y_train)

In [None]:
# Cross-validate on the same feature matrix the forest was trained on. At this
# point `X` holds only the Flight_Pattern bag-of-words counts, not the combined
# features.
dt_cv_score = cross_val_score(forest, bow_with_features, y, cv=3)
mean_dt_cv_score = np.mean(dt_cv_score)
print("Mean Cross Validation Score: {:.4}%".format(mean_dt_cv_score*100))

In [None]:
rf_param_grid = {
    "criterion": ["gini", "entropy"],
    "max_depth": [None, 2, 3, 4, 5, 6, 8, 10, 12, 14, 16 ,18, 20, 22, 24, 26, 28, 30, 32, 34],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf" : [1, 2, 3, 4, 5, 6],
    "n_estimators" : [10, 30, 100]
}

In [None]:
# Search on the combined bag-of-words + one-hot features. `X` at this point is
# only the Flight_Pattern word-count matrix.
dt_grid_search = GridSearchCV(forest,rf_param_grid,cv=3,return_train_score=True)

dt_grid_search.fit(bow_with_features, y)

In [None]:
print("Testing Accuracy: {:.4}%".format(dt_grid_search.best_score_ * 100))
print("")
print("Optimal Parameters: {}".format(dt_grid_search.best_params_))

In [None]:
### SMOTE ###
# Oversample the training split only. `fit_resample` replaces `fit_sample`,
# which imbalanced-learn has removed.
smt = SMOTE()
X_train, y_train = smt.fit_resample(X_train, y_train)

## Random-searched SVC Using Bag of Words

In [None]:
svc_param_selection(bow_with_features, y, 5)

In [None]:
smote = SVC(kernel='rbf', C=10, gamma=0.01,random_state=10).fit(X_train,y_train)
smote_pred = smote.predict(X_test)

# Checking accuracy
accuracy_score(y_test, smote_pred)

In [None]:
print(confusion_matrix(y_test, smote_pred))
print(classification_report(y_test,smote_pred))
print("The accuracy score is" + " "+ str(accuracy_score(y_test, smote_pred)))

In [None]:
def svc_randomizedsearchCV(X, y, nfolds, random_state=None):
    """Randomised search over kernel, C, gamma and degree for an SVC.

    Args:
        X: Feature matrix.
        y: Target labels.
        nfolds: Number of cross-validation folds.
        random_state: Optional seed for the parameter sampling. The default
            (None) keeps the search unseeded, as before.

    Returns:
        Dict with the best parameter combination that was sampled.
    """
    degrees = [2,3,4,5]
    kernels = ['rbf','poly']
    Cs = [0.001, 0.01, 0.1, 1, 10]
    gammas = [0.001, 0.01, 0.1, 1]
    param_grid = {'C': Cs, 'gamma' : gammas, 'kernel': kernels, 'degree': degrees}
    grid_search = RandomizedSearchCV(svm.SVC(), param_grid, cv=nfolds, n_jobs=-1,
                                     random_state=random_state)
    grid_search.fit(X, y)
    return grid_search.best_params_

In [None]:
svc_randomizedsearchCV(bow_with_features,y,5)

In [None]:
smote = SVC(kernel='rbf', C=1, gamma=0.01, degree=5,random_state=10).fit(X_train,y_train)
smote_pred = smote.predict(X_test)

In [None]:
precision_score(y_test, smote_pred,average=None)

In [None]:
print(confusion_matrix(y_test, smote_pred))
print(classification_report(y_test,smote_pred)) 
print("The accuracy score is" + " "+ str(accuracy_score(y_test, smote_pred)))

**BEST MODEL  ^^^^^**

# Attempting Better Score Using Principal Component Analysis


In [None]:
X = bow_with_features
y = (df1['Size'])

X_train,X_test, y_train,y_test = train_test_split(X,y,test_size = 0.3,random_state=10)

In [None]:
# Project the features onto their principal components (all components kept).
pca = PCA()
transformed = pca.fit_transform(X)

# Columns 0 and 1 of the projection, kept as a pair of 1-D arrays.
first_two_comp = (transformed[:, 0], transformed[:, 1])

In [None]:
plt.scatter(transformed[:,0], transformed[:,1]);

In [None]:
# NOTE(review): np.cov treats each ROW as a variable by default, and
# `transformed` comes from pca.fit_transform(X), so its rows are presumably
# samples — confirm whether rowvar=False (or a transpose) was intended.
cov = np.cov(transformed)

In [None]:
eigen_value, eigen_vector = np.linalg.eig(cov)

In [None]:
e_indices = np.argsort(eigen_value)[::-1] #Get the index values of the sorted eigenvalues
eigenvectors_sorted = eigen_vector[:,e_indices]

In [None]:
transformed = eigenvectors_sorted.dot(transformed).T

In [None]:
pcs=pd.DataFrame(first_two_comp).T

In [None]:
print('Variance of each component:', pca.explained_variance_ratio_)
print('\n Total Variance Explained:', round(sum(list(pca.explained_variance_ratio_))*100, 2))

In [None]:
# Train and time an RBF SVC on the first two principal components.
X = pcs
y = (df1['Size'])
# `time` is imported as the function, so it is called directly.
start = time()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=9)
# Fit once, using the lowercase split variables defined on the line above
# (`Y_train` / `Y_test` did not exist).
model = SVC(kernel='rbf', C=10, gamma=0.001, degree=4)
model.fit(X_train, y_train)
Yhat = model.predict(X_test)
acc = metrics.accuracy_score(y_test, Yhat)
end = time()
print("Accuracy:",acc)
print ("Time Taken:", end - start)

**After playing around with the different eigenvalues, accuracy using PCA isn't as good as our SVC model**

# Some Classification Evaluation! // BEST MODEL: SMOTE SVC

In [None]:
# Number of birds per size class
print('Target Variable')
print(df1.groupby(['Size']).Size.count())

# Target Variable Countplot
sns.set_style('darkgrid')
plt.figure(figsize = (10,5))
# Pass the series as `x=` explicitly; recent seaborn releases no longer accept
# the data vector as a bare positional argument.
sns.countplot(x=df1['Size'], alpha =.80, palette= ['yellowgreen','lightgreen','darkgreen'])
plt.title('Bird Sizes')
plt.ylabel('# of Birds')
plt.show()

In [None]:
# Take the class order from the fitted SVC and pass it to confusion_matrix so
# that matrix rows/columns and tick labels agree. The hand-written
# ['Small', 'Medium', 'Large'] list did not match the sorted label order.
# NOTE(review): `smote_pred` was produced from an earlier train/test split than
# the most recent `y_test` assignment in this notebook — confirm both refer to
# the same test rows before trusting this matrix.
classes = list(smote.classes_)
cm = confusion_matrix(y_test, smote_pred, labels=classes)
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map.

    cm        -- confusion matrix array (rows are indexed first).
    classes   -- tick labels, used for both axes.
    normalize -- when True, every row is divided by its row sum first.
    title     -- plot title.
    cmap      -- matplotlib colormap for the heat map.
    """
    if not normalize:
        print('Confusion Matrix, without normalization')
    else:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    cell_format = '.2f' if normalize else 'd'
    # Cells above half of the maximum get white text so they stay readable.
    midpoint = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            value = cm[row, col]
            text_color = "white" if value > midpoint else "black"
            plt.text(col, row, format(value, cell_format),
                     horizontalalignment="center",
                     color=text_color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues)
print(precision_score(y_test, smote_pred,average=None))

In [None]:
# print the first 25 true and predicted responses
print('True:', y_test.values[0:25])
print('Pred:', smote_pred[0:25])

In [None]:
print(metrics.precision_score(y_test, smote_pred, average=None))
print(metrics.recall_score(y_test, smote_pred, average=None))

In [None]:
# Per-class F1 computed by hand: harmonic mean of precision and recall.
per_class_precision = metrics.precision_score(y_test, smote_pred, average=None)
per_class_recall = metrics.recall_score(y_test, smote_pred, average=None)
2*(per_class_precision*per_class_recall)/(per_class_precision+per_class_recall)
   

In [None]:
# Fit a logistic regression on the current training split.
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
THRESHOLD = 0.5
# 1 where the predicted probability in column 1 exceeds THRESHOLD, else 0.
# NOTE(review): `preds` is not used by the table built below — confirm it is
# still needed.
preds = np.where(logreg.predict_proba(X_test)[:,1] > THRESHOLD, 1, 0)

# Class-averaged recall, precision and F1 of `smote_pred` (not of the logistic
# regression fitted above), shown as a one-column table.
pd.DataFrame(data=[np.mean(metrics.recall_score(y_test, smote_pred,average=None)),
                   np.mean(metrics.precision_score(y_test, smote_pred,average=None)), np.mean(metrics.f1_score(y_test, smote_pred,average=None))], 
             index=["recall", "precision", "F1"])

In [None]:
# Histogram of the predicted probability of one class. predict_proba columns
# follow `logreg.classes_`, so the axis label is built from the class that
# column actually holds instead of a hard-coded class name.
class_idx = 2
y_pred_prob = logreg.predict_proba(X_test)[:, class_idx]
plt.hist(y_pred_prob, bins=8)
plt.xlim(0, 1)
plt.title('Histogram of Predicted Probabilities')
plt.xlabel('Predicted Probability of Bird Being {}'.format(str(logreg.classes_[class_idx]).strip()))
plt.ylabel('Frequency')

## AdaBoost

In [None]:
adaboost_clf = AdaBoostClassifier()
adaboost_mean_cv_score = np.mean(cross_val_score(adaboost_clf, X, y, cv=3))
print("Mean Cross Validation Score for AdaBoost: {:.4}%".format(adaboost_mean_cv_score * 100))

In [None]:
adaboost_param_grid = {
    'n_estimators': [50,100,250, 500],
    'learning_rate': [1.0, 0.5, 0.1]
}

In [None]:
# Start a fresh timer for this search. `start` was previously left over from
# an earlier cell, and `time` is imported as the function, so it is called
# directly.
start = time()
adaboost_grid_search = GridSearchCV(adaboost_clf, adaboost_param_grid, cv=3)
adaboost_grid_search.fit(X, y)
print("Testing Accuracy: {:.4}%".format(adaboost_grid_search.best_score_ * 100))
print("Total Runtime for Grid Search on AdaBoost: {:.4} seconds".format(time() - start))
print("")
print("Optimal Parameters: {}".format(adaboost_grid_search.best_params_))

In [None]:
gbt_clf = GradientBoostingClassifier()
gbt_clf.fit(X_train, y_train)


In [None]:
gbt_clf_train_preds = gbt_clf.predict(X_train)
gbt_clf_test_preds = gbt_clf.predict(X_test)

In [None]:
def display_acc_and_f1_score(true, preds, model_name):
    """Print the model name, its accuracy and its weighted F1 score.

    Args:
        true: Ground-truth labels.
        preds: Predicted labels, aligned with `true`.
        model_name: Label printed on the first line.
    """
    report_lines = [
        "Model: {}".format(model_name),
        "Accuracy: {}".format(accuracy_score(true, preds)),
        "F1-Score: {}".format(f1_score(true, preds, average='weighted')),
    ]
    print("\n".join(report_lines))
    

display_acc_and_f1_score(y_train, gbt_clf_train_preds, model_name='Gradient Boosted Trees')
print("")
display_acc_and_f1_score(y_test, gbt_clf_test_preds, model_name='Gradient Boosted Trees')

In [None]:
gbt_confusion_matrix = confusion_matrix(y_test, gbt_clf_test_preds)
print(gbt_confusion_matrix)
gbt_classification_report = classification_report(y_test, gbt_clf_test_preds)
print(gbt_classification_report)



In [None]:
print('Mean GBT Cross-Val Score (k=5):')
print(cross_val_score(gbt_clf, X, y, cv=5).mean())

## XG Boost

In [None]:
X = bow_with_features
y = (df1['Size'])
X.columns = X.columns.astype(str)
X.columns = X.columns.str.replace(' ', '_')
X_train,X_test, y_train,y_test = train_test_split(X,y,test_size = 0.3,random_state=10)

In [None]:
xgb_param_grid = {
    "learning_rate": [0.1],
    'max_depth': [6],
    'min_child_weight': [10],
    'subsample': [ 0.7],
    'n_estimators': [5, 30, 100, 250],
}

In [None]:
clfX=XGBClassifier()
clfX.fit(X_train, y_train)

In [None]:
# Tune on the training split only. Fitting the search on the full data set and
# then "validating" on X_test (a subset of it) leaks the test rows into
# training.
grid_clf = GridSearchCV(clfX, xgb_param_grid, scoring='accuracy', cv=5, n_jobs=1)
grid_clf.fit(X_train, y_train)
best_parameters = grid_clf.best_params_
print("Grid Search found the following optimal parameters: ")
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))
training_preds = grid_clf.predict(X_train)
val_preds = grid_clf.predict(X_test)
training_accuracy = accuracy_score(y_train, training_preds)
val_accuracy = accuracy_score(y_test, val_preds)
print("")
print("Training Accuracy: {:.4}%".format(training_accuracy * 100))
print("Validation accuracy: {:.4}%".format(val_accuracy * 100))
# best_score_ is the mean cross-validated accuracy of the best combination.
print("Testing Accuracy(Best Score): {:.4}%".format(grid_clf.best_score_ * 100))


In [None]:
def plot_feature_importances(model, feature_names=None):
    """Draw a horizontal bar chart of a fitted model's feature importances.

    Args:
        model: Fitted estimator exposing `feature_importances_`.
        feature_names: Optional sequence of labels, one per feature. When
            omitted, the columns of the notebook-global `X_train` are used.
    """
    if feature_names is None:
        feature_names = X_train.columns.values
    n_features = len(feature_names)
    plt.figure(figsize=(13,10))
    plt.barh(range(n_features), model.feature_importances_, align='center')
    plt.yticks(np.arange(n_features), feature_names)
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")

In [None]:
from xgboost import plot_importance
plot_importance(clfX,max_num_features=20);

In [None]:
plot_tree(clfX, num_trees=1)
fig = plt.gcf()
fig.set_size_inches(100, 60)