
# Import Python libraries.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE

In [None]:
from xgboost import XGBClassifier

# Import Datasets

In [None]:
# Load the training frame (first CSV column becomes the index) and the test
# frame from their local CSV files.
df = pd.read_csv(
    filepath_or_buffer=r"C:\Users\harsh\Training Data.csv",
    index_col=0,
)
test_data = pd.read_csv(filepath_or_buffer=r"C:\Users\harsh\Test Data.csv")

# Check what the training dataset looks like.

In [None]:
# Preview the first rows of the training frame.
df.head()

In [None]:
# (rows, columns) of the training frame.
df.shape

In [None]:
# Report what percentage of the training rows falls in each risk_flag class.
class_counts = df['risk_flag'].value_counts()
for label in (0, 1):
    share_pct = round(class_counts[label] / len(df) * 100, 2)
    print(str(label), share_pct, '% of the dataset')

In [None]:
# Absolute number of rows per risk_flag class.
df["risk_flag"].value_counts()

# Convert datatype of selected fields.

In [None]:
# Encode every categorical column as integer codes.  The code table is learned
# from the training frame only and then reused for the test frame, so a given
# category maps to the same integer in both frames.  Factorizing each frame on
# its own numbers categories by order of first appearance, which is not
# guaranteed to match between the two files.
categorical_columns = [
    "profession",
    "city",
    "state",
    "married",
    "house_ownership",
    "car_ownership",
]
for column in categorical_columns:
    train_codes, train_categories = pd.factorize(df[column])
    df[column] = train_codes
    # get_indexer yields -1 for values that never occurred in training,
    # mirroring the -1 sentinel pd.factorize uses for missing values.
    test_data[column] = pd.Index(train_categories).get_indexer(test_data[column])


In [None]:
# Keep the test-row identifiers as an array before the id column is dropped.
y_test_id = test_data["id"].values

In [None]:

# Remove the identifier column in place so only model inputs remain.
test_data.drop(columns=["id"], inplace=True)

In [None]:
# Preview the first rows of the test frame.
test_data.head()

In [None]:
# (rows, columns) of the test frame.
test_data.shape

# Drop the dependent variable from the dataset.

In [None]:
# List the column labels of the training frame.
df.columns

In [None]:
# Target vector is risk_flag; the feature matrix is every other column.
Y = df['risk_flag']
X = df.drop(columns=['risk_flag'])

## Split the data into train and test sets for the model

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=121,
    test_size=0.2,
)

In [None]:
# Preview the first rows of the training features.
X_train.head()

## Synthetic Minority Oversampling Technique (SMOTE)

In [None]:
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import NearMiss

# Resample the training split only (the hold-out split stays untouched) with
# a seeded SMOTETomek sampler.
smk = SMOTETomek(random_state=42)
X_res, y_res = smk.fit_resample(X=X_train, y=Y_train)

In [None]:
# Class counts after resampling.
y_res.value_counts()

In [None]:
import joblib

# Persist the resampled, raw-train and hold-out splits together in one file.
joblib.dump(
    dict(
        X_res=X_res,
        y_res=y_res,
        X_test=X_test,
        Y_test=Y_test,
        X_train=X_train,
        Y_train=Y_train,
    ),
    "data_processed_mainmodel.pkl",
)

In [None]:
# Presumably a random-state seed.
# NOTE(review): not referenced by any of the visible cells — confirm it is used.
rs = 122

# Train your model

### CAT BOOST CLASSIFIER

In [None]:
# Positions of the feature columns whose dtype is not float64; these positions
# are handed to CatBoost as categorical features when the model is fitted.
# `np.float` was removed in NumPy 1.24 (it was only an alias of the builtin
# float, i.e. float64), so the comparison uses np.float64 explicitly.
# NOTE(review): this selects every non-float column, not only the encoded
# categoricals — confirm that integer numeric fields should be categorical.
categorical_features_indices = np.where(X.dtypes != np.float64)[0]

## HyperParameter Tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
# CatBoostClassifier has to be imported before it is instantiated here;
# otherwise this cell raises NameError when the notebook runs top to bottom.
from catboost import CatBoostClassifier

# Untuned estimator that the randomized hyper-parameter search clones and fits.
mod = CatBoostClassifier()

# Candidate hyper-parameter values for the randomized search.
par = dict(
    max_depth=[3, 5, 10, None],
    n_estimators=[100, 200, 300, 400],
    learning_rate=[0.1, 0.01, 0.001],
)
def hyperparameter_tuning(mod, param_d, p, q, *, n_iter=10, cv=9,
                          scoring='roc_auc', random_state=None):
    """Run a randomized hyper-parameter search and return its best result.

    Args:
        mod: Estimator to tune.
        param_d: Mapping of parameter names to the candidate values (or
            distributions) to sample from.
        p: Training features passed to the search's ``fit``.
        q: Training targets passed to the search's ``fit``.
        n_iter: Number of parameter settings sampled.  Defaults to 10, the
            scikit-learn default the original hard-coded call relied on.
        cv: Number of cross-validation folds.
        scoring: Metric used to rank the sampled settings.
        random_state: Seed for the sampling; pass an int to make the search
            reproducible.  ``None`` keeps the previous unseeded behaviour.

    Returns:
        Tuple ``(best_params, best_score)`` taken from the fitted search's
        ``best_params_`` and ``best_score_`` attributes.
    """
    search = RandomizedSearchCV(
        mod,
        param_distributions=param_d,
        n_iter=n_iter,
        n_jobs=-1,  # use every available core
        cv=cv,
        scoring=scoring,
        random_state=random_state,
    )
    search.fit(p, q)
    return search.best_params_, search.best_score_


# Tune the CatBoost estimator on the resampled training data.
# NOTE(review): the `rf_` prefix is a naming leftover; the estimator is `mod`.
rf_parameters, rf_ht_score = hyperparameter_tuning(mod, par,  X_res, y_res)


In [None]:
# Show the best parameter set and its cross-validated score.
print(rf_parameters, rf_ht_score)

### Running the model with tuned parameters

In [None]:
from catboost import CatBoostClassifier

# Fit CatBoost with fixed settings on the resampled training data.
# NOTE(review): the hold-out split is also supplied as eval_set; if CatBoost
# uses it to pick the best iteration, scores later reported on that same split
# are optimistic — confirm.
model = CatBoostClassifier(
    n_estimators=400,
    max_depth=10,
    learning_rate=0.1,
    random_state=121,
)
model.fit(
    X_res,
    y_res,
    eval_set=(X_test, Y_test),
    cat_features=categorical_features_indices,
)

In [None]:
# Class predictions of the CatBoost model on the resampled training data.
y_pred=model.predict(X_res)

In [None]:
# Class predictions of the CatBoost model on the hold-out split.
y_pred_ht=model.predict(X_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Per-class report for the CatBoost predictions on the hold-out split.
print(classification_report(y_true=Y_test, y_pred=y_pred_ht))

In [None]:
from sklearn.metrics import roc_auc_score

# ROC AUC of the CatBoost model on the resampled training data.  The metric
# ranks examples by score, so it is computed from the probability of class 1
# rather than from hard 0/1 predictions, which reduce the ROC curve to a
# single operating point.
auc = roc_auc_score(y_res, model.predict_proba(X_res)[:, 1])
print('ROC AUC: %f' % auc)

In [None]:
from sklearn.metrics import roc_auc_score

# ROC AUC of the CatBoost model on the hold-out split, computed from the
# probability of class 1 (hard 0/1 predictions would reduce the ROC curve to
# a single operating point).
auc = roc_auc_score(Y_test, model.predict_proba(X_test)[:, 1])
print('ROC AUC: %f' % auc)

# XGBOOST

In [None]:
# Baseline XGBoost classifier trained on the resampled data.
# The metric keyword is `eval_metric` and XGBoost's name for log loss is
# "logloss"; the former `eval_metrix='Log_Loss'` was not a recognised setting.
xgb = XGBClassifier(
    verbosity=0,
    objective="binary:logistic",
    eval_metric="logloss",
)
xgb.fit(X_res, y_res)

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# Hard class predictions on the resampled training data (kept for later use).
y_pred_train = xgb.predict(X_res)
# ROC AUC needs a ranking score, so use the probability of class 1 instead of
# the thresholded predictions.
auc = roc_auc_score(y_res, xgb.predict_proba(X_res)[:, 1])
print('ROC AUC: %f' % auc)

In [None]:
# Hard class predictions on the hold-out split (kept for later use).
y_pred_xgb = xgb.predict(X_test)
# ROC AUC needs a ranking score, so use the probability of class 1 instead of
# the thresholded predictions.
auc = roc_auc_score(Y_test, xgb.predict_proba(X_test)[:, 1])
print('ROC AUC: %f' % auc)

## Hyperparameter tuning Xgboost

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

In [None]:
# Candidate hyper-parameter values for the XGBoost randomized search.
params = dict(
    learning_rate=[0.05, 0.10, 0.01],
    max_depth=[4, 5, 6, 8],
    min_child_weight=[1, 3, 5, 7],
    gamma=[0.0, 0.1, 0.2, 0.3, 0.4],
    colsample_bytree=[0.3, 0.4, 0.5, 0.7],
)
# Default-configured XGBoost estimator used as the base of the search.
classifier=XGBClassifier()

def hyperparameter_tunin(mod, param_d, p, q):
    """Sample five parameter settings at random and return the best one.

    Args:
        mod: Estimator to tune.
        param_d: Mapping of parameter names to candidate values.
        p: Training features passed to the search's ``fit``.
        q: Training targets passed to the search's ``fit``.

    Returns:
        Tuple ``(best_params, best_score)`` of the fitted search, ranked by
        ROC AUC over 9-fold cross-validation.
    """
    search = RandomizedSearchCV(
        estimator=mod,
        param_distributions=param_d,
        scoring='roc_auc',
        cv=9,
        n_iter=5,
        n_jobs=-1,
    )
    fitted = search.fit(p, q)
    return fitted.best_params_, fitted.best_score_


# Tune the XGBoost estimator on the resampled training data; this overwrites
# the rf_parameters / rf_ht_score values of the earlier CatBoost search.
rf_parameters, rf_ht_score = hyperparameter_tunin(classifier, params,  X_res, y_res)

In [None]:
# Show the best XGBoost parameter set and its cross-validated score.
print(rf_parameters, rf_ht_score)

In [None]:
# XGBoost classifier configured with fixed, hand-entered hyper-parameters.
# Two keyword names are corrected so the settings actually take effect:
#   * `eval_metric="logloss"` (was the misspelled `eval_metrix='Log_Loss'`)
#   * `min_child_weight` (was `max_child_weight`, which is not the parameter
#     listed in the search space)
xgb = XGBClassifier(
    verbosity=0,
    objective="binary:logistic",
    eval_metric="logloss",
    min_child_weight=3,
    max_depth=8,
    learning_rate=0.1,
    gamma=0.0,
    colsample_bytree=0.5,
    random_state=121,
)

In [None]:
# Train on the resampled data, then predict classes for those same rows
# (fit returns the fitted estimator, so the two calls can be chained).
y_pred_train_xgb = xgb.fit(X_res, y_res).predict(X_res)

In [None]:
# ROC AUC of the XGBoost model on the resampled training data, computed from
# the probability of class 1 (hard 0/1 predictions would reduce the ROC curve
# to a single operating point).
auc = roc_auc_score(y_res, xgb.predict_proba(X_res)[:, 1])
print('ROC AUC: %f' % auc)

In [None]:
# Class predictions of the XGBoost model on the hold-out split.
y_pred_test_xgb=xgb.predict(X_test)


In [None]:
# Hard class predictions on the hold-out split (kept for later use).
y_pred_xgb = xgb.predict(X_test)
# ROC AUC needs a ranking score, so use the probability of class 1 instead of
# the thresholded predictions.
auc = roc_auc_score(Y_test, xgb.predict_proba(X_test)[:, 1])
print('ROC AUC: %f' % auc)