In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/beginners-classification-dataset/classification.csv


I would practice XGBoost with new data. 
Data source: https://www.kaggle.com/sveneschlbeck/beginners-classification-dataset

In [2]:
df = pd.read_csv("/kaggle/input/beginners-classification-dataset/classification.csv")

In [3]:
df.head() # only three variables. 
df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 297 entries, 0 to 296
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       297 non-null    float64
 1   interest  297 non-null    float64
 2   success   297 non-null    float64
dtypes: float64(3)
memory usage: 7.1 KB


In [4]:
df.shape # 297 rows, 3 columns (small size)

(297, 3)

In [5]:
df.isnull().sum() # no missing

age         0
interest    0
success     0
dtype: int64

In [6]:
df[df.duplicated()] # no duplicated values. 

Unnamed: 0,age,interest,success


In [7]:
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px

plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True)
plt.rc("axes", labelweight="bold", labelsize="large", titleweight="bold", titlesize=14, titlepad=10)

In [8]:
px.histogram(df, x="age", color="success", marginal = "box")

Interestingly, as one is younger, success rate seems to be lower. 

In [9]:
px.histogram(df, x="interest", color="success", marginal='box')

Interest seems to be more influential factor! 

### 0. Split data into train & test. 
- considering the small datasize, I divided data with the ratio of 8:2 (8 for train, 2 for test). 

In [10]:
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

df = shuffle(df)
X = df.drop('success', axis=1).values
y = df['success'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=1)

### 1. Logistic Regression
Let's use logistic regression as the baseline. 

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix, classification_report

from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, accuracy_score, auc, roc_curve, accuracy_score

# make get_score function
def get_scores(y, y_pred):
    data={'Accuracy': np.round(accuracy_score(y, y_pred),2),
    'Precision':np.round(precision_score(y, y_pred),2),
    'Recall':np.round(recall_score(y, y_pred),2),
    'F1':np.round(f1_score(y, y_pred),2),
    'ROC AUC':np.round(roc_auc_score(y, y_pred),2)}
    scores_df = pd.Series(data).to_frame(' ').transpose()
    return scores_df

In [12]:
model1 = LogisticRegression()
pred = cross_val_predict(model1, X_train, y_train, cv=5)
get_scores(y_train, pred)

Unnamed: 0,Accuracy,Precision,Recall,F1,ROC AUC
,0.86,0.91,0.84,0.87,0.86


For logistic regression, accuracy = 0.88, precision=0.92, recall=0.86, F1=0.89, roc-auc=0.88. 

### 2. XGBoost

In [13]:
import xgboost as xgb
from xgboost import plot_importance
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import GridSearchCV


In [14]:
xgb_model = xgb.XGBClassifier()

param = {'nthread':[4], 
         'objective':['binary:logistic'],
         'learning_rate': [0.03, 0.05, 0.07], 
         'max_depth': [6, 8, 10],
         'min_child_weight': [7, 9, 11],
         'subsample': [0.5, 0.7, 0.9],
         'colsample_bytree': [0.7, 0.9, 1.0],
         'n_estimators': [100],
         'seed': [122]}
clf = GridSearchCV(xgb_model, param_grid = param, scoring = 'accuracy', refit = True, verbose = 0)


In [15]:
xgbfit = clf.fit(X_train, y_train)



In [16]:
clf.best_params_

{'colsample_bytree': 0.7,
 'learning_rate': 0.05,
 'max_depth': 6,
 'min_child_weight': 11,
 'n_estimators': 100,
 'nthread': 4,
 'objective': 'binary:logistic',
 'seed': 122,
 'subsample': 0.5}

In [17]:
pred2 = clf.predict(X_train)

In [18]:
get_scores(y_train, pred2) # great result. 

Unnamed: 0,Accuracy,Precision,Recall,F1,ROC AUC
,0.89,0.94,0.86,0.9,0.89


Let's test the result using test dataset. 

In [19]:
# logistic regression
model1 = LogisticRegression().fit(X_train, y_train)
pred = model1.predict(X_test)
get_scores(y_test, pred)

Unnamed: 0,Accuracy,Precision,Recall,F1,ROC AUC
,0.9,0.9,0.9,0.9,0.9


In [20]:
# XGBoost
pred2 = clf.predict(X_test)
get_scores(y_test, pred2)

Unnamed: 0,Accuracy,Precision,Recall,F1,ROC AUC
,0.87,0.9,0.84,0.87,0.87


When I test the result, the recall rate of tuned XGBoost was much worser than basic logistic regression. 
Other indices were slightly better with tuned XGBoost. 