In [1]:
# 1. Run time for original code:
# Record the wall-clock start time (seconds since the epoch). A later cell
# subtracts this from the finish time to report the notebook's total run time.
import time 
start = time.time()

In [2]:
# 2. Reading Libraries
import os
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
import math
import gc
import pprint

pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')

In [3]:
# scikit-learn helpers: accuracy metric, label encoding and train/test splitting
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [4]:
from pathlib import Path
# Directory (relative to the working directory) that holds the input CSV files.
input_path = Path('data')

In [5]:
# Load Data
# Read the full training file from `input_path` (instead of a second hard-coded
# copy of the directory) and keep one row per customer: the last row for each
# customer_ID in file order.
# NOTE(review): presumably the last row is the most recent statement; that
# assumes the CSV is ordered chronologically within each customer - confirm.
train_data = (
    pd.read_csv(input_path / 'train_data.csv')
    .groupby('customer_ID')
    .tail(1)
    .set_index('customer_ID', drop=True)
    .sort_index()
)

In [6]:
# Load the target labels, indexed by customer_ID so they can be aligned with
# the per-customer feature rows. The file is resolved through `input_path`
# rather than a hard-coded copy of the data directory.
labels = pd.read_csv(input_path / 'train_labels.csv').set_index('customer_ID', drop=True)

In [7]:
categorical_features = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_68']

# Integer-encode each categorical column.
# One encoder is fitted per feature and kept in `label_encoders`, so the exact
# same mapping can later be applied to other data (e.g. a test set) or reversed
# with `inverse_transform`. Refitting a single shared encoder would overwrite
# its learned classes on every iteration and keep only the last feature's mapping.
# NOTE(review): missing values in these columns appear to be encoded as their
# own class here rather than left as NaN, so the null handling further down
# would never see them - confirm this is intended.
label_encoders = {}

# Kept so that `label_encoder` exists even if the feature list is empty.
label_encoder = LabelEncoder()

for feature in categorical_features:
    # Rebound on every pass; after the loop `label_encoder` is the encoder
    # fitted on the last feature, which matches the previous end state.
    label_encoder = LabelEncoder()
    train_data[feature] = label_encoder.fit_transform(train_data[feature])
    label_encoders[feature] = label_encoder

In [8]:
# Data processing
# Drop every column in which more than 80% of the rows are missing.
null_percentages = train_data.isna().mean() * 100
high_null_cols = null_percentages[null_percentages > 80].index.tolist()
train_data = train_data.drop(columns=high_null_cols)

In [9]:
# Remove the S_2 column from the feature frame.
train_data = train_data.drop(columns=['S_2'])

In [10]:
# Names of all columns whose dtype is pandas' categorical dtype.
# NOTE(review): nothing visible in this notebook casts a column to 'category'
# (the listed categorical features are label-encoded to integers earlier), so
# this list is presumably empty, which would make the mode fill and the
# OrdinalEncoder step below no-ops - confirm.
categories = train_data.select_dtypes(include='category').columns.tolist()

In [11]:
# Fill missing values in each categorical column with that column's most
# frequent value (the first mode).
for cat_col in categories:
    most_frequent = train_data[cat_col].mode()[0]
    train_data[cat_col] = train_data[cat_col].fillna(most_frequent)

In [12]:
# Every column that still contains at least one missing value gets its gaps
# filled with that column's median.
# NOTE(review): the median is computed on the whole frame before the
# train/test split, so held-out rows influence the fill value.
null_columns = train_data.columns[train_data.isna().any()].tolist()
for column in null_columns:
    column_median = train_data[column].median()
    train_data[column] = train_data[column].fillna(column_median)

In [13]:
from sklearn.preprocessing import OrdinalEncoder

# Ordinal-encode the columns listed in `categories` and write the encoded
# values back into the same columns.
enc = OrdinalEncoder()
encoded_categories = enc.fit_transform(train_data[categories])
train_data[categories] = encoded_categories

In [14]:
# Attach the labels to the features by matching on the index of both frames;
# only index values present in both are kept (inner join).
full_train = pd.merge(
    train_data,
    labels,
    left_index=True,
    right_index=True,
    how='inner',
)

In [15]:
# Separate the label column from the feature matrix.
y = full_train['target']
x = full_train.drop(columns='target')

In [16]:
# Hold out 22% of the rows for evaluation. The split is stratified on the
# target so both parts keep the same class proportions, and seeded so it is
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.22,
    random_state=42,
    stratify=y,
)

In [17]:
# Random Forest model setup (note: the estimator imported below is RandomForestRegressor, not a classifier)
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [18]:
# Single-point hyper-parameter grid for the random forest: each list holds
# exactly one candidate value.
parameter_space = dict(
    n_estimators=[20],
    min_samples_leaf=[31],
    min_samples_split=[2],
    max_depth=[10],
    max_features=[40],
)

# NOTE(review): the variable is named `clf` but this is a regressor optimising
# squared error, while the evaluation further down treats the target as two
# classes - confirm the intended estimator. Neither `parameter_space` nor
# `clf` is used in the visible part of the notebook.
clf = RandomForestRegressor(criterion="squared_error", n_jobs=-1, random_state=22)

In [19]:
# XGBoost model: 300 boosted trees, maximum depth 6, learning rate 0.1,
# fitted on the training split.
from xgboost import XGBClassifier

model = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.1,
)
model.fit(x_train, y_train)

In [20]:
from sklearn.metrics import classification_report

# Predict classes for the held-out split and report overall accuracy.
predict = model.predict(x_test)
accuracy = accuracy_score(y_test, predict)
print("Accuracy:", accuracy)

# Per-class precision, recall and F1 on the same held-out split.
print(classification_report(y_test, predict))

Accuracy: 0.9005457552916473
              precision    recall  f1-score   support

           0       0.93      0.94      0.93     74819
           1       0.81      0.80      0.81     26142

    accuracy                           0.90    100961
   macro avg       0.87      0.87      0.87    100961
weighted avg       0.90      0.90      0.90    100961



In [21]:
# Record the wall-clock finish time and compute the elapsed time in seconds
# since `start` was recorded in the first cell.
finish = time.time()
total_time = finish - start

In [22]:
# Total notebook run time in seconds.
print(total_time)

840.3021399974823
