<a href="https://colab.research.google.com/github/sauravsingla/General/blob/master/LightGBM_XGBoost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install lightgbm 
!pip install xgboost 



In [None]:
#importing standard libraries 
import numpy as np 
import pandas as pd 
from pandas import Series, DataFrame 

#import lightgbm and xgboost 
import lightgbm as lgb 
import xgboost as xgb 

# Load the training dataset 'adult.data' into 'data'.
# The file is comma-separated with a space after each comma. Splitting on
# whitespace left a trailing ',' glued to every value (e.g. '39,'), which kept
# the numeric columns as strings. Splitting on commas and skipping the leading
# space gives clean values and numeric dtypes.
data = pd.read_csv('adult.data', header=None, skipinitialspace=True)

# Assign names to the columns
data.columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital_Status', 'occupation', 'relationship', 'race', 'sex',
    'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
    'Income',
]

# Glimpse of the dataset
data.head()

# Label-encode the target variable 'Income' into integer class codes
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
l = LabelEncoder()
data.Income = Series(l.fit_transform(data.Income))

# l.classes_ holds the original labels, indexed by their integer code
l.classes_
data.Income.value_counts()

0    24720
1     7841
Name: Income, dtype: int64

In [None]:
# One-hot encode the categorical features.
# Each group of dummy columns is prefixed with the feature it came from, so
# levels that occur in more than one feature (such as the '?' placeholder) get
# distinct column names and are not thrown away when duplicate column names
# are removed after the merge.
one_hot_workclass = pd.get_dummies(data.workclass, prefix='workclass')
one_hot_education = pd.get_dummies(data.education, prefix='education')
one_hot_marital_Status = pd.get_dummies(data.marital_Status, prefix='marital_Status')
one_hot_occupation = pd.get_dummies(data.occupation, prefix='occupation')
one_hot_relationship = pd.get_dummies(data.relationship, prefix='relationship')
one_hot_race = pd.get_dummies(data.race, prefix='race')
one_hot_sex = pd.get_dummies(data.sex, prefix='sex')
one_hot_native_country = pd.get_dummies(data.native_country, prefix='native_country')

# Remove the original categorical features now that they are encoded
data = data.drop(columns=[
    'workclass', 'education', 'marital_Status', 'occupation',
    'relationship', 'race', 'sex', 'native_country',
])


In [None]:
# Merge the one-hot encoded features with our dataset 'data'
data = pd.concat(
    [
        data,
        one_hot_workclass,
        one_hot_education,
        one_hot_marital_Status,
        one_hot_occupation,
        one_hot_relationship,
        one_hot_race,
        one_hot_sex,
        one_hot_native_country,
    ],
    axis=1,
)

# Remove duplicate columns: np.unique returns the position of the first
# occurrence of every distinct column name (in sorted name order).
_, first_positions = np.unique(data.columns, return_index=True)
data = data.iloc[:, first_positions]

# The target variable is 'Income' with values 1 or 0.
# Separate the data into the feature set x and the target y.
x = data.drop('Income', axis=1)
y = data.Income

# Impute missing target values with the most frequent class
y = y.fillna(y.mode()[0])

# Split into train and test sets. A fixed seed makes the split (and every
# score computed from it) reproducible; stratifying keeps the class ratio of
# the imbalanced target the same in both parts.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=.3, stratify=y, random_state=42,
)

In [None]:
# Summary statistics of the numeric columns of the merged dataset
data.describe()

Unnamed: 0,"10th,","11th,","12th,","1st-4th,","5th-6th,","7th-8th,","9th,","?,","Adm-clerical,","Amer-Indian-Eskimo,",...,"Transport-moving,","Trinadad&Tobago,","United-States,","Unmarried,","Vietnam,","White,","Widowed,","Wife,","Without-pay,","Yugoslavia,"
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,...,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,0.028654,0.036086,0.013298,0.00516,0.010227,0.01984,0.015786,0.056386,0.115783,0.009551,...,0.049046,0.000584,0.895857,0.105832,0.002058,0.854274,0.030497,0.048156,0.00043,0.000491
std,0.166834,0.186507,0.11455,0.071646,0.100612,0.139451,0.124648,0.23067,0.319969,0.097264,...,0.215968,0.024149,0.305451,0.307627,0.045316,0.352837,0.171952,0.214099,0.020731,0.022162
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
# xgboost consumes its own DMatrix container.
# The training matrix carries the outcome as its label; the test matrix is
# built from features only.
dtrain = xgb.DMatrix(data=x_train, label=y_train)
dtest = xgb.DMatrix(data=x_test)

In [None]:
# Parameters for xgboost.
# 'eta' is an alias of 'learning_rate'; passing both with different values
# (1 and .05) is contradictory, so only learning_rate=.05 is kept, matching the
# LightGBM configuration. The obsolete 'silent' flag is replaced by
# 'verbosity' (0 = silent).
parameters = {
    'max_depth': 7,
    'learning_rate': .05,
    'verbosity': 0,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
}

In [None]:
# Train the xgboost model, recording wall-clock timestamps around the call
from datetime import datetime
num_round = 50

start = datetime.now()
xg = xgb.train(params=parameters, dtrain=dtrain, num_boost_round=num_round)
stop = datetime.now()

In [None]:
# Execution time of the xgboost training call: the difference of the two
# datetime stamps taken around it, i.e. a datetime.timedelta
execution_time_xgb = stop-start 
execution_time_xgb

In [None]:
# Note on the execution time shown above:
# datetime.timedelta( , , ) representation => (days, seconds, microseconds)
# Now predict on the test set with the trained xgboost model
ypred=xg.predict(dtest) 
ypred

In [None]:
# Convert the predicted probabilities into hard 1/0 labels (threshold .5).
# Vectorised so it covers every prediction whatever the size of the test set,
# instead of looping over a hard-coded number of rows. The original dtype is
# kept, as the element-wise assignment did.
ypred = (ypred >= .5).astype(ypred.dtype)

In [None]:
# Accuracy of the xgboost model on the test set
from sklearn.metrics import accuracy_score
accuracy_xgb = accuracy_score(y_true=y_test, y_pred=ypred)
accuracy_xgb


In [None]:
# Wrap the training features and labels in LightGBM's Dataset container
train_data = lgb.Dataset(data=x_train, label=y_train)

In [None]:
# Parameters for lightgbm, including the evaluation metrics to report
param = {
    'num_leaves': 150,
    'objective': 'binary',
    'max_depth': 7,
    'learning_rate': .05,
    'max_bin': 200,
    'metric': ['auc', 'binary_logloss'],
}

In [None]:
# max_depth is 7 for both xgboost and LightGBM so the comparison is fair.
# Train the LightGBM model, recording wall-clock timestamps around the call.
num_round = 50

start = datetime.now()
lgbm = lgb.train(params=param, train_set=train_data, num_boost_round=num_round)
stop = datetime.now()

In [None]:
# Execution time of the LightGBM training call: the difference of the two
# datetime stamps taken around it, i.e. a datetime.timedelta
execution_time_lgbm = stop-start
execution_time_lgbm

In [None]:
# Score the test set with the trained LightGBM model
ypred2 = lgbm.predict(x_test)

# Peek at the first five predictions
ypred2[:5]

In [None]:
# Convert the predicted probabilities into hard 0/1 labels (threshold .5).
# Vectorised so it covers every prediction whatever the size of the test set,
# instead of looping over a hard-coded number of rows. The original dtype is
# kept, as the element-wise assignment did.
ypred2 = (ypred2 >= .5).astype(ypred2.dtype)

In [None]:
# Accuracy of the LightGBM model on the test set.
# Arguments are passed as (true labels, predictions), like the xgboost cell.
accuracy_lgbm = accuracy_score(y_test, ypred2)

# Only the last expression of a cell is displayed, so print the accuracy
# explicitly rather than leaving it as a bare expression mid-cell.
print(accuracy_lgbm)

# Class balance of the test labels, for context when reading the accuracy
y_test.value_counts()

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# ROC AUC for xgboost.
# AUC measures how well the scores rank the two classes, so it must be fed the
# predicted probabilities. 'ypred' has already been thresholded to 0/1 at this
# point, so the probabilities are obtained from the model again.
auc_xgb = roc_auc_score(y_test, xg.predict(dtest))
auc_xgb

In [None]:
# ROC AUC for LightGBM.
# AUC measures how well the scores rank the two classes, so it must be fed the
# predicted probabilities. 'ypred2' has already been thresholded to 0/1 at
# this point, so the probabilities are obtained from the model again.
auc_lgbm = roc_auc_score(y_test, lgbm.predict(x_test))

# Collect the metrics of both models; each tuple is ordered (LightGBM, xgboost).
# This is its own statement: fused onto the 'auc_lgbm' line it was a syntax error.
comparison_dict = {
    'accuracy score': (accuracy_lgbm, accuracy_xgb),
    'auc score': (auc_lgbm, auc_xgb),
    'execution time': (execution_time_lgbm, execution_time_xgb),
}

auc_lgbm

In [None]:
# Build 'comparison_df', one row per model, to compare LightGBM and xgboost
model_names = ['LightGBM', 'xgboost']
comparison_df = DataFrame(comparison_dict, index=model_names)
comparison_df

In [None]:
!pip install sklearn-pandas

In [None]:
from IPython.display import display
from numpy.random import RandomState
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import accuracy_score, confusion_matrix, make_scorer, precision_recall_fscore_support, roc_auc_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn_pandas import DataFrameMapper

import numpy as np
import pandas as pd

In [None]:
# Seeded NumPy random state, passed as random_state to the split, the
# classifier and the cross-validation splitter below
rs = RandomState(130917)

In [None]:
# Load the raw data. The file is comma-separated with a space after each
# comma; splitting on whitespace left a trailing ',' on every value (e.g.
# '39,'), which cannot be converted to float later on. Split on commas and
# skip the leading space instead.
df = pd.read_csv("adult.data", header=None, skipinitialspace=True)

In [None]:
# First rows of the raw frame (bare expression, so the notebook renders it as a table)
df.head()

    0                  1        2           3    4                    5   \
0  39,         State-gov,   77516,  Bachelors,  13,       Never-married,   
1  50,  Self-emp-not-inc,   83311,  Bachelors,  13,  Married-civ-spouse,   
2  38,           Private,  215646,    HS-grad,   9,            Divorced,   
3  53,           Private,  234721,       11th,   7,  Married-civ-spouse,   
4  28,           Private,  338409,  Bachelors,  13,  Married-civ-spouse,   

                   6               7       8        9      10  11   12  \
0       Adm-clerical,  Not-in-family,  White,    Male,  2174,  0,  40,   
1    Exec-managerial,        Husband,  White,    Male,     0,  0,  13,   
2  Handlers-cleaners,  Not-in-family,  White,    Male,     0,  0,  40,   
3  Handlers-cleaners,        Husband,  Black,    Male,     0,  0,  40,   
4     Prof-specialty,           Wife,  Black,  Female,     0,  0,  40,   

               13     14  
0  United-States,  <=50K  
1  United-States,  <=50K  
2  United-States,

In [None]:
# Human-readable names for the 15 positional columns of the raw file
column_names = [
    "Age",
    "WorkClass",
    "fnlwgt",
    "Education",
    "EducationNum",
    "MaritalStatus",
    "Occupation",
    "Relationship",
    "Race",
    "Gender",
    "CapitalGain",
    "CapitalLoss",
    "HoursPerWeek",
    "NativeCountry",
    "Income",
]
df.columns = column_names

In [None]:
# True if any cell in the frame is null/NaN
df.isnull().values.any()

False

In [None]:
# First rows after naming the columns (bare expression, so the notebook renders it as a table)
df.head()

   Age          WorkClass   fnlwgt   Education EducationNum  \
0  39,         State-gov,   77516,  Bachelors,          13,   
1  50,  Self-emp-not-inc,   83311,  Bachelors,          13,   
2  38,           Private,  215646,    HS-grad,           9,   
3  53,           Private,  234721,       11th,           7,   
4  28,           Private,  338409,  Bachelors,          13,   

         MaritalStatus          Occupation    Relationship    Race   Gender  \
0       Never-married,       Adm-clerical,  Not-in-family,  White,    Male,   
1  Married-civ-spouse,    Exec-managerial,        Husband,  White,    Male,   
2            Divorced,  Handlers-cleaners,  Not-in-family,  White,    Male,   
3  Married-civ-spouse,  Handlers-cleaners,        Husband,  Black,    Male,   
4  Married-civ-spouse,     Prof-specialty,           Wife,  Black,  Female,   

  CapitalGain CapitalLoss HoursPerWeek   NativeCountry Income  
0       2174,          0,          40,  United-States,  <=50K  
1          0,     

In [None]:
# Distinct labels present in the target column
df.Income.unique()

array(['<=50K', '>50K'], dtype=object)

In [None]:
# Encode the target as -1 (<=50K) / +1 (>50K)
income_codes = {"<=50K": -1, ">50K": 1}
df["Income"] = df["Income"].map(income_codes)

In [None]:
# Pull the target out of the frame: pop() removes the column in place and
# returns it, leaving only feature columns in df
y_all = df.pop("Income").values

In [None]:
# First rows of the feature frame (bare expression, so the notebook renders it as a table)
df.head()

   Age          WorkClass   fnlwgt   Education EducationNum  \
0  39,         State-gov,   77516,  Bachelors,          13,   
1  50,  Self-emp-not-inc,   83311,  Bachelors,          13,   
2  38,           Private,  215646,    HS-grad,           9,   
3  53,           Private,  234721,       11th,           7,   
4  28,           Private,  338409,  Bachelors,          13,   

         MaritalStatus          Occupation    Relationship    Race   Gender  \
0       Never-married,       Adm-clerical,  Not-in-family,  White,    Male,   
1  Married-civ-spouse,    Exec-managerial,        Husband,  White,    Male,   
2            Divorced,  Handlers-cleaners,  Not-in-family,  White,    Male,   
3  Married-civ-spouse,  Handlers-cleaners,        Husband,  Black,    Male,   
4  Married-civ-spouse,     Prof-specialty,           Wife,  Black,  Female,   

  CapitalGain CapitalLoss HoursPerWeek   NativeCountry  
0       2174,          0,          40,  United-States,  
1          0,          0,       

In [None]:
# Summary statistics of the feature columns
df.describe()

Unnamed: 0,Age,WorkClass,fnlwgt,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry
count,32561,32561,32561,32561,32561,32561,32561,32561,32561,32561,32561,32561,32561,32561
unique,73,9,21648,16,16,7,15,6,5,2,119,92,94,42
top,36,"Private,",203488,"HS-grad,",9,"Married-civ-spouse,","Prof-specialty,","Husband,","White,","Male,",0,0,40,"United-States,"
freq,898,22696,13,10501,10501,14976,4140,13193,27816,21790,29849,31042,15217,29170


In [None]:
# Frequency of each distinct CapitalGain value
df.CapitalGain.value_counts()

0,        29849
15024,      347
7688,       284
7298,       246
99999,      159
5178,        97
3103,        97
4386,        70
5013,        69
8614,        55
3325,        53
2174,        48
10520,       43
4064,        42
14084,       41
4650,        41
3137,        37
20051,       37
27828,       34
594,         34
3908,        32
2829,        31
6849,        27
13550,       27
14344,       26
1055,        25
2885,        24
3411,        24
4787,        23
3464,        23
          ...  
25124,        4
9562,         4
1086,         4
7896,         3
5721,         3
1173,         3
1424,         3
2961,         3
4687,         3
6360,         3
2009,         3
2936,         3
2993,         2
11678,        2
18481,        2
2062,         2
401,          2
3456,         2
41310,        2
6723,         2
6097,         1
2387,         1
1639,         1
2538,         1
5060,         1
7978,         1
4931,         1
1111,         1
1455,         1
22040,        1
Name: CapitalGain, Lengt

In [None]:
# Frequency of each distinct CapitalLoss value
df.CapitalLoss.value_counts()

0,       31042
1902,      202
1977,      168
1887,      159
1485,       51
1848,       51
2415,       49
1602,       47
1740,       42
1590,       40
1876,       39
1672,       34
2258,       25
1564,       25
1669,       24
2001,       24
1741,       24
1980,       23
1719,       22
1408,       21
2051,       21
2002,       21
2377,       20
1579,       20
1974,       18
1504,       18
1721,       18
2339,       17
1628,       15
2179,       15
         ...  
2457,        3
4356,        3
2231,        3
3004,        2
1816,        2
1735,        2
2238,        2
1138,        2
2352,        2
1648,        2
3900,        2
810,         2
1755,        2
3683,        2
3770,        2
2754,        2
974,         2
2149,        2
2282,        1
1844,        1
2472,        1
2163,        1
2489,        1
2201,        1
155,         1
1539,        1
2467,        1
1411,        1
1944,        1
2080,        1
Name: CapitalLoss, Length: 92, dtype: int64

In [None]:
# Drop both capital columns from the feature frame in a single call
df.drop(columns=["CapitalGain", "CapitalLoss"], inplace=True)

In [None]:
# Cast the numeric feature columns to float, one column at a time
for numeric_col in ("Age", "fnlwgt", "EducationNum", "HoursPerWeek"):
    df[numeric_col] = df[numeric_col].astype(float)

In [None]:
# Distinct values of the WorkClass feature
df.WorkClass.unique()

array(['State-gov,', 'Self-emp-not-inc,', 'Private,', 'Federal-gov,',
       'Local-gov,', '?,', 'Self-emp-inc,', 'Without-pay,',
       'Never-worked,'], dtype=object)

In [None]:
# Distinct values of the Education feature
df.Education.unique()

array(['Bachelors,', 'HS-grad,', '11th,', 'Masters,', '9th,',
       'Some-college,', 'Assoc-acdm,', 'Assoc-voc,', '7th-8th,',
       'Doctorate,', 'Prof-school,', '5th-6th,', '10th,', '1st-4th,',
       'Preschool,', '12th,'], dtype=object)

In [None]:
# Distinct values of the MaritalStatus feature
df.MaritalStatus.unique()

array(['Never-married,', 'Married-civ-spouse,', 'Divorced,',
       'Married-spouse-absent,', 'Separated,', 'Married-AF-spouse,',
       'Widowed,'], dtype=object)

In [None]:
# Distinct values of the Occupation feature
df.Occupation.unique()

array(['Adm-clerical,', 'Exec-managerial,', 'Handlers-cleaners,',
       'Prof-specialty,', 'Other-service,', 'Sales,', 'Craft-repair,',
       'Transport-moving,', 'Farming-fishing,', 'Machine-op-inspct,',
       'Tech-support,', '?,', 'Protective-serv,', 'Armed-Forces,',
       'Priv-house-serv,'], dtype=object)

In [None]:
# Distinct values of the Relationship feature
df.Relationship.unique()

array(['Not-in-family,', 'Husband,', 'Wife,', 'Own-child,', 'Unmarried,',
       'Other-relative,'], dtype=object)

In [None]:
# Number of distinct values of the NativeCountry feature
len(df.NativeCountry.unique())

42

In [None]:
# One-hot encode every categorical feature; get_dummies prefixes each new
# column with the name of the feature it came from
categorical_cols = [
    "WorkClass",
    "Education",
    "MaritalStatus",
    "Occupation",
    "Relationship",
    "Race",
    "Gender",
    "NativeCountry",
]
df = pd.get_dummies(df, columns=categorical_cols)

In [None]:
# (rows, columns) of the encoded feature frame
df.shape

(32561, 106)

In [None]:
# Class balance of the target. Uses the Series method because the top-level
# pd.value_counts function is deprecated.
pd.Series(y_all).value_counts()

-1    24720
 1     7841
dtype: int64

In [None]:
# Stratified 75/25 train/test split, seeded with rs.
# Note: this rebinds y_train and y_test, which the boosting section above also
# used for its own split.
X_train, X_test, y_train, y_test = train_test_split(
    df, y_all, test_size=0.25, stratify=y_all, random_state=rs,
)

In [None]:
# Columns that are standardised; every other column is passed through unchanged.
standard_scaler_cols = ["Age", "fnlwgt", "EducationNum", "HoursPerWeek"]

# Keep the remaining columns in their DataFrame order. A set difference would
# yield an arbitrary order that can change between kernel sessions (string
# hashing is randomised), making the feature layout non-reproducible.
other_cols = [col for col in df.columns if col not in standard_scaler_cols]

# One (columns, transformer) entry per column: a StandardScaler for the numeric
# columns (selected as a one-element list so the scaler receives 2-D input),
# and None (no transformation) for the rest.
mapper = DataFrameMapper(
    [([col], StandardScaler()) for col in standard_scaler_cols] +
    [(col, None) for col in other_cols]
)

In [None]:
# Two-stage pipeline: column-wise scaling via the mapper, then a seeded
# logistic regression
clf = LogisticRegression(random_state=rs)
pipeline_steps = [
    ("scale", mapper),
    ("logit", clf),
]
pipeline = Pipeline(pipeline_steps)

In [None]:
# 10-fold stratified cross-validation. random_state only has an effect when
# shuffling is enabled (recent scikit-learn versions reject random_state with
# shuffle=False), so shuffle explicitly.
strat_kfold = StratifiedKFold(10, shuffle=True, random_state=rs)

# Grid search over the regularisation strength C (10^-4 ... 10^4) and the class
# weighting of the logistic regression step. The built-in "roc_auc" scorer
# ranks by the model's continuous scores; wrapping roc_auc_score in
# make_scorer would instead score the hard class predictions.
estimator = GridSearchCV(
    pipeline,
    param_grid={
        "logit__C": np.power(10, np.arange(-4.0, 5.0)),
        "logit__class_weight": ["balanced", None],
    },
    scoring="roc_auc",
    cv=strat_kfold,
)

In [None]:
import warnings

# Run the grid search. Warnings are silenced only for the duration of the fit;
# the context manager restores the previous filters afterwards, so warnings
# raised by any later cell are still shown.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    estimator.fit(X_train, y_train)

# Display the fitted search object
estimator