In [None]:
#!pip install xgboost
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import RandomizedSearchCV,train_test_split,GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from xgboost.sklearn import XGBClassifier

In [None]:
# data prep from previous module
# The CSV location can be overridden through the CENSUS_INCOME_CSV environment
# variable so the notebook runs on other machines; when the variable is unset
# the original author's machine-specific path is used as the fallback.
file=os.environ.get(
    'CENSUS_INCOME_CSV',
    '/home/tcs/Srinivas/Mylearning/Edvancer_Class/DataScienceWithR/Data/Data/census_income.csv',
)

# Load the census income data into the working frame used by the rest of the notebook.
ci=pd.read_csv(file)

In [None]:
# Preview the first five rows. Left as a bare expression so the notebook renders
# the frame as a table (print() falls back to wrapped plain text).
ci.head(5)

In [None]:
# there is perfect correspondence between education and education.num, we'll drop education
# errors='ignore' keeps this cell safe to re-run once the column is already gone.
ci=ci.drop(columns=['education'],errors='ignore')

# convert target Y to 1,0
# Only convert while Y still holds the raw labels: re-running the comparison on
# the already-encoded 0/1 column would silently set every target to 0.
if not pd.api.types.is_numeric_dtype(ci['Y']):
    ci['Y']=(ci['Y']==' >50K').astype(int)

In [None]:
# Labels of every object-dtype column in ci; these are the columns the
# dummy-encoding loop iterates over.
cat_cols = ci.select_dtypes(include=['object']).columns

In [None]:
# Display the object-dtype column labels collected in cat_cols.
cat_cols

In [None]:
# Replace each categorical column by 0/1 indicator columns.
# A category gets an indicator only if it occurs more than 100 times; of those,
# the last one in value_counts() order is left out. The source column is
# removed once its indicators have been added.
for col in cat_cols:
    counts = ci[col].value_counts()
    frequent_levels = counts[counts > 100].index[:-1]

    print(col)
    for level in frequent_levels:
        ci[col + '_' + level] = ci[col].eq(level).astype(int)
    del ci[col]
    

In [None]:
# Show the dimensions of ci after the categorical columns were replaced by indicators.
ci.shape

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable.
ci_train, ci_test = train_test_split(
    ci,
    random_state=2,
    test_size=0.2,
)

In [None]:
# Renumber both partitions from 0, discarding the row labels inherited from ci.
ci_train = ci_train.reset_index(drop=True)
ci_test = ci_test.reset_index(drop=True)

In [None]:
# Separate the predictors from the target column 'Y' in each partition.
x_train, y_train = ci_train.drop(columns=['Y']), ci_train['Y']
x_test, y_test = ci_test.drop(columns=['Y']), ci_test['Y']

In [None]:
#https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
# Learner handed to AdaBoost: a decision tree capped at 4 leaf nodes with
# class_weight='balanced'.
base_clf = DecisionTreeClassifier(
    class_weight='balanced',
    max_leaf_nodes=4,
)

In [None]:
# Print the configured base tree to check its settings.
print(base_clf)

In [None]:
# Grid explored for the boosted ensemble: number of boosting rounds x learning rate.
adb_params={'n_estimators':[50,100,200,500,700],
           'learning_rate': [0.01,.1,1]
           }
#https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
# The base learner is passed positionally on purpose: its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, so either keyword fails on some releases while the first
# positional argument works on all of them.
adb=AdaBoostClassifier(base_clf)
# you will need to tune parameters for base classifier [ which is being boosted ] separately 

In [None]:
# Print the AdaBoost wrapper to check its settings.
print(adb)
#print(adb_params)

In [None]:
#https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
# Exhaustive search over adb_params, scored by ROC AUC with 10-fold
# cross-validation and run on all available cores.
complete_search = GridSearchCV(
    estimator=adb,
    param_grid=adb_params,
    scoring='roc_auc',
    cv=10,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# Run the grid search on the training split.
complete_search.fit(x_train,y_train)
# This might take up to 30-45 minutes to finish (don't be impatient :)). Also don't be alarmed if it finishes early :)

In [None]:
def report(results, n_top=3):
    """Print a summary of the best-ranked candidates of a parameter search.

    Parameters
    ----------
    results : mapping
        Search results indexed by 'rank_test_score', 'mean_test_score',
        'std_test_score' and 'params' (the notebook passes a fitted search
        object's ``cv_results_``).
    n_top : int, default 3
        Highest rank to report; ranks 1 through ``n_top`` are printed and
        every candidate sharing a rank is listed.

    Returns
    -------
    None
        The summary is written to standard output.
    """
    ranks = results['rank_test_score']
    for rank in range(1, n_top + 1):
        # Positions of all candidates that hold this rank (ties give several).
        for idx in np.flatnonzero(ranks == rank):
            print("Model with rank: {0}".format(rank))
            mean_score = results['mean_test_score'][idx]
            std_score = results['std_test_score'][idx]
            print("Mean validation score: {0:.3f} (std: {1:.5f})".format(mean_score, std_score))
            print("Parameters: {0}".format(results['params'][idx]))
            print("")

In [None]:
# Print the grid-search candidates ranked 1 through 5.
report(complete_search.cv_results_,5)

In [None]:
# Final AdaBoost model with hand-picked settings (learning_rate=0.1, 500 rounds).
# The base learner is passed positionally on purpose: its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, while the first positional argument works on all releases.
adb_best=AdaBoostClassifier(base_clf,learning_rate=0.1,n_estimators=500)

In [None]:
# Fit the chosen AdaBoost configuration on the training split.
adb_best.fit(x_train,y_train)

In [None]:
# Keep column 1 of the predicted probabilities for the test rows, i.e. the
# score for the label stored at adb_best.classes_[1].
p=adb_best.predict_proba(x_test)[:,1]

In [None]:
# Show the class labels of the fitted model; their order is the column order of predict_proba.
adb_best.classes_

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# ROC AUC of the AdaBoost scores against the held-out test labels.
roc_auc_score(y_test,p)

In [None]:
# Candidate values for each XGBoost hyper-parameter; the randomized search
# draws its combinations from these lists.
param_dist = {
    "max_depth": list(range(2, 7)),
    "learning_rate": [0.01, 0.05, 0.1, 0.3, 0.5],
    "min_child_weight": [4, 5, 6],
    "subsample": [0.6, 0.7, 0.8, 0.9],
    "colsample_bytree": [0.6, 0.7, 0.8, 0.9],
    "reg_alpha": [1e-5, 1e-2, 0.1, 1, 100],
    "gamma": [0.0, 0.1, 0.2, 0.3, 0.4],
    "n_estimators": [100, 500, 700, 1000],
    "scale_pos_weight": list(range(2, 10)),
}


In [None]:
# XGBoost classifier with the binary logistic objective; the randomized search
# supplies the remaining hyper-parameters from param_dist.
clf=XGBClassifier(objective='binary:logistic')

In [None]:
# Number of parameter combinations sampled from param_dist.
n_iter=10

# Randomized search scored by ROC AUC with 10-fold cross-validation.
# random_state pins which combinations are drawn, so a re-run evaluates the
# same n_iter candidates (the seed matches the one used for the train/test split).
random_search=RandomizedSearchCV(clf,n_jobs=-1,verbose=2,cv=10,n_iter=n_iter,scoring='roc_auc',
                                 param_distributions=param_dist,random_state=2)

In [None]:
# Run the randomized search on the training split.
random_search.fit(x_train,y_train)

In [None]:
# Print the randomized-search candidates ranked 1 through 5.
report(random_search.cv_results_,5)

In [None]:
# Final XGBoost model with hand-entered hyper-parameter values.
xgb_best = XGBClassifier(
    colsample_bytree=0.8,
    gamma=0.3,
    learning_rate=0.05,
    max_depth=4,
    min_child_weight=4,
    n_estimators=700,
    reg_alpha=1e-05,
    scale_pos_weight=3,
    subsample=0.8,
)

In [None]:
# Fit the chosen XGBoost configuration on the training split.
xgb_best.fit(x_train,y_train)

In [None]:
# Keep column 1 of the predicted probabilities for the test rows; this
# overwrites the AdaBoost scores previously stored in p.
p=xgb_best.predict_proba(x_test)[:,1]

In [None]:
# ROC AUC of the XGBoost scores against the held-out test labels.
roc_auc_score(y_test,p)