In [None]:
import pandas as pd  #for data manipulation operations
import numpy as np  #for numeric operations on data
import seaborn as sns  #for data visualization operations
import matplotlib.pyplot as plt  #for data visualization operations
from sklearn.preprocessing import LabelEncoder # for encoding
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler #for standardization
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from termcolor import colored
import scipy.stats as st
from collections import Counter
%matplotlib inline

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import plot_confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
# from mlxtend.classifier import StackingClassifier, StackingCVClassifier
from prettytable import PrettyTable

# Silence every warning so library notices do not clutter the cell output
# (note: this also hides warnings that may point at real problems).
import warnings
warnings.filterwarnings("ignore")

# Make scikit-learn estimator reprs list every hyperparameter, not only the changed ones
from sklearn import set_config
set_config(print_changed_only = False)

# Let pandas display up to 30 columns before it truncates wide frames
pd.set_option('display.max_columns', 30)

# Print a green confirmation once the imports and settings above have run
print(colored("\nLIBRARIES WERE SUCCESFULLY IMPORTED...", color = "green", attrs = ["dark", "bold"]))

In [None]:
# Read the raw CSV, keep it untouched in `smoking`, and work on a copy without the "ID" column.
smoking = pd.read_csv("../data/Smoking_raw/smoking.csv")
df = smoking.drop(columns = ["ID"])

# Preview the first ten rows with an orange colour gradient.
df.head(10).style.background_gradient(cmap = "Oranges")

In [None]:
# # Compute the body-mass index (BMI)
# # bmi = kg/m^2 (weight in kilograms divided by the square of the height in metres)
# df['bmi'] = df['weight_kg'] / ((df['height_cm'] * 0.01) ** 2)
# df.head(n = 10).style.background_gradient(cmap = "Oranges")

In [None]:
# Map the raw column headers (parentheses, spaces, mixed case) to snake_case names.
snake_case_names = {
    "height(cm)": "height_cm",
    "weight(kg)": "weight_kg",
    "waist(cm)": "waist_cm",
    "eyesight(left)": "eyesight_left",
    "eyesight(right)": "eyesight_right",
    "hearing(left)": "hearing_left",
    "hearing(right)": "hearing_right",
    "fasting blood sugar": "fasting_blood_sugar",
    "Cholesterol": "cholesterol",
    "HDL": "hdl",
    "LDL": "ldl",
    "Urine protein": "urine_protein",
    "serum creatinine": "serum_creatinine",
    "AST": "ast",
    "ALT": "alt",
    "Gtp": "gtp",
    "dental caries": "dental_caries",
}

# Rename in place; headers that are not in the mapping are left as they are.
df.rename(columns = snake_case_names, inplace = True)

print(colored("\nTHE COLUMNS OF DATASET WERE SUCCESFULLY RENAMED...", color = "green", attrs = ["dark", "bold"]))

In [None]:
# Print the column names, non-null counts, dtypes and memory usage of the working frame.
df.info()

In [None]:
# Sum the per-column null counts into one grand total and report it.
total_nulls = df.isnull().sum().sum()
print(f"\nThere are totally {total_nulls} null values in the dataset")

In [None]:
# Summary statistics, transposed so that each variable is a row, shaded with a reversed coolwarm gradient.
df.describe().T.style.background_gradient(cmap = "coolwarm_r")

In [None]:
# Correlation table of the numeric variables.
# At this point the frame still holds text columns (they are label-encoded in later cells),
# and DataFrame.corr() raises on non-numeric data in pandas >= 2.0, so select the
# numeric/boolean columns explicitly. This works on old and new pandas alike.
df.select_dtypes(include = ["number", "bool"]).corr().style.background_gradient(cmap = "magma")

In [None]:
# Correlations are only defined for numeric data. The frame still holds text columns
# here (they are label-encoded in later cells) and DataFrame.corr() raises on them in
# pandas >= 2.0, so restrict the computation to numeric/boolean columns.
numeric_corr = df.select_dtypes(include = ["number", "bool"]).corr()

# Annotated heatmap of the pairwise correlations.
plt.figure(figsize = [20, 10], clear = True, facecolor = "white")
sns.heatmap(numeric_corr, annot = True, square = False, linewidths = 3,
            linecolor = "white", cmap = "Set2");

In [None]:
# Share of each class of the target variable, with the second slice pulled out.
plt.figure(figsize = [8, 8], clear = True, facecolor = "#ABB2B9")
smoking_counts = df["smoking"].value_counts()
smoking_counts.plot.pie(explode = [0, 0.15], autopct = '%1.3f%%', shadow = True);

In [None]:
# Share of each gender class, with the second slice pulled out.
plt.figure(figsize = [8, 8], clear = True, facecolor = "#ABB2B9")
gender_counts = df["gender"].value_counts()
gender_counts.plot.pie(explode = [0, 0.15], autopct = '%1.2f%%', shadow = True);


In [None]:
# 2x2 grid of class-frequency bar charts for four categorical variables.
fig, axes = plt.subplots(2, 2, figsize = (12, 12))
axes = axes.flatten()

# (column, panel title) for each panel, in plotting order.
frequency_panels = [
    ("gender", "Frequency of classes of the 'gender' variable"),
    ("tartar", "Frequency of cases of tartar"),
    ("dental_caries", "Frequency of cases of dental caries"),
    ("smoking", "Frequency of classes of the 'smoking' variable"),
]

for ax, (column, title) in zip(axes, frequency_panels):
    # Count once per panel instead of twice.
    counts = df[column].value_counts()
    # Pass plain vectors only: the counts have one entry per class, not one per row,
    # so they must not be combined with `data = df` (newer seaborn versions reject
    # vectors whose length differs from `data` - NOTE(review): confirm for your version).
    sns.barplot(ax = ax, x = counts.index, y = counts.values,
                saturation = 1).set(title = title, xlabel = column, ylabel = "count")

In [None]:
# Box plot of cholesterol per gender, split by smoking status.
cholesterol_grid = sns.catplot(data = df, x = "gender", y = "cholesterol", hue = "smoking",
                               kind = "box", color = '#F83419', saturation = 1,
                               height = 7, aspect = 1.3, margin_titles = True)
cholesterol_grid.set(title = "cholesterol level by gender and smoking");

In [None]:
# Boxen plot of body weight per gender, split by smoking status.
weight_grid = sns.catplot(data = df, x = "gender", y = "weight_kg", hue = "smoking",
                          kind = "boxen", color = '#B3EE22', saturation = 1,
                          height = 7, aspect = 1.3, margin_titles = True)
weight_grid.set(title = "weight by gender and smoking");

In [None]:
# Boxen plot of age per gender, split by smoking status.
age_grid = sns.catplot(data = df, x = "gender", y = "age", hue = "smoking",
                       kind = "boxen", color = '#468A85', saturation = 1,
                       height = 7, aspect = 1.3, margin_titles = True)
age_grid.set(title = "age by gender (male and female) and smoking");


In [None]:
# Violin plot of the "relaxation" variable per gender, split by smoking status.
relaxation_grid = sns.catplot(data = df, x = "gender", y = "relaxation", hue = "smoking",
                              kind = "violin", color = '#FB2604', saturation = 1,
                              height = 7, aspect = 1.35, margin_titles = True)
relaxation_grid.set(title = "relaxation by gender and smoking");


In [None]:
# Mean serum creatinine (error bars: standard deviation) per gender, one panel per
# smoking class, coloured by dental caries.

# Readable names for the raw gender codes.
# Assumes the raw codes are "M"/"F" - TODO confirm; any other code is shown verbatim.
gender_names = {"M": "male", "F": "female"}

# Pin the bar order explicitly. Without `order` the bars follow the order in which
# the codes first appear in the data, so hard-coded tick labels could end up swapped.
gender_order = sorted(df["gender"].dropna().unique())

g = sns.catplot(x = "gender", y = "serum_creatinine", col = "smoking",
                hue = "dental_caries",
                order = gender_order,
                data = df,
                saturation = 1,
                kind = "bar",
                ci = "sd",
                aspect = 0.99)

# Derive the tick labels from the same order that was used for the bars.
(g.set_axis_labels("", "serum creatinine")
  .set_xticklabels([gender_names.get(code, str(code)) for code in gender_order])
  .set_titles("{col_name} {col_var}").despine(left = True));

In [None]:
# 2x3 grid of scatter plots: body weight against six other measurements,
# coloured by smoking status and sized by gender.
fig, axes = plt.subplots(2, 3, figsize = (20, 12))
axes = axes.flatten()

# y-axis variable of each panel, in plotting order.
weight_targets = ["hemoglobin", "cholesterol", "urine_protein",
                  "serum_creatinine", "age", "fasting_blood_sugar"]

for panel, target in zip(axes, weight_targets):
    scatter = sns.scatterplot(ax = panel, x = "weight_kg", y = target,
                              hue = "smoking", size = "gender", sizes = (20, 100),
                              legend = "full", data = df)
    scatter.set(title = f"Relationship between 'weight_kg' and '{target}'")

In [None]:
# One 12-bin histogram per column that DataFrame.hist can plot; the trailing ';' hides the axes repr.
df.hist(figsize = (20, 20), bins = 12, legend = False);

In [None]:
# Preview the first ten rows of the working frame again with an orange gradient.
df.head(n = 10).style.background_gradient(cmap = "Oranges")

In [None]:
def outlier_detection(df, n, columns):
    """Return the index labels of rows that are IQR outliers in at least `n` columns.

    For every column in `columns`, a value is an outlier when it lies more than
    1.5 * IQR below the 25th percentile or above the 75th percentile
    (percentiles ignore NaN; NaN values themselves are never flagged).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    n : int
        Minimum number of columns in which a row must be an outlier.
    columns : iterable
        Column labels of `df` to check.

    Returns
    -------
    list
        Index labels, ordered by the first time each row was flagged.
    """
    hits = Counter()
    for name in columns:
        series = df[name]
        lower_quartile = np.nanpercentile(series, 25)
        upper_quartile = np.nanpercentile(series, 75)
        fence = 1.5 * (upper_quartile - lower_quartile)
        flagged = series.lt(lower_quartile - fence) | series.gt(upper_quartile + fence)
        # Count one hit per flagged row for this column.
        hits.update(df[flagged].index)
    return [label for label, count in hits.items() if count >= n]

# Collect the rows that are IQR outliers in at least 5 of the float/int columns.
# NOTE(review): the column selection includes every float/int column, so the target
# "smoking" is presumably among them - confirm that this is intended.
# NOTE(review): the result is only previewed here; no visible cell drops these rows.
will_drop_train = outlier_detection(df, 5, df.select_dtypes(["float", "int"]).columns)
# Preview the first five flagged index labels.
will_drop_train[0:5]

In [None]:
# Replace the "gender" categories with integer codes.
# The encoder is fitted and applied once; the earlier duplicate fit_transform call
# only recomputed the same codes and threw them away.
lbe = LabelEncoder()
df["gender"] = lbe.fit_transform(df["gender"])

In [None]:
# Replace the "tartar" categories with integer codes.
# The encoder is fitted and applied once; the earlier duplicate fit_transform call
# only recomputed the same codes and threw them away.
lbe = LabelEncoder()
df["tartar"] = lbe.fit_transform(df["tartar"])

In [None]:
# Replace the "oral" categories with integer codes.
# The encoder is fitted and applied once; the earlier duplicate fit_transform call
# only recomputed the same codes and threw them away.
lbe = LabelEncoder()
df["oral"] = lbe.fit_transform(df["oral"])

In [None]:
# Features (independent variables): every column except the target.
x = df.drop(columns = ["smoking"])

# Target (dependent variable): the smoking label.
y = df["smoking"]

In [None]:
# Check the dtypes and non-null counts of the feature frame before splitting.
x.info()

In [None]:
# Display the target Series.
y

In [None]:
# Hold out 10% of the rows for testing; shuffling with a fixed seed keeps the split reproducible.
split_options = dict(test_size = 0.10, shuffle = True, random_state = 1)
x_train, x_test, y_train, y_test = train_test_split(x, y, **split_options)

In [None]:
# Learn the min/max of each feature on the training split only, then apply the
# same scaling to both splits so no test-set information leaks into the scaler.
scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
# Display the scaled training features (the output of scaler.transform).
x_train

In [None]:
# Display the scaled test features (the output of scaler.transform).
x_test

In [None]:
import time

# Train the XGBoost classifier and report the wall-clock time of construction + fit.
start_time = time.time()
xgb_model = XGBClassifier(n_estimators = 2000, learning_rate = 0.3, max_depth = 15)
xgb_model.fit(x_train, y_train)
elapsed_seconds = time.time() - start_time
print("XGB 수행 시간: {0:.1f} 초 ".format(elapsed_seconds))

In [None]:
# Recorded run: learning_rate = 0.3 (default), max_depth = 6 (default) -> 21.8 sec
# (presumably the fit time printed by the training cell).
# NOTE(review): this cell only scores the `xgb_model` currently in memory; it does not
# apply the settings named above - retrain with them first to reproduce this result.
y_pred = xgb_model.predict(x_test)
accuracy_score(y_test, y_pred)

In [None]:
# Recorded run: learning_rate = 0.2 -> 21.6 sec (presumably the fit time printed by the training cell).
# NOTE(review): this cell only scores the `xgb_model` currently in memory; it does not
# apply the setting named above - retrain with it first to reproduce this result.
y_pred = xgb_model.predict(x_test)
accuracy_score(y_test, y_pred)

In [None]:
# Recorded run: learning_rate = 0.1 -> 21.4 sec (presumably the fit time printed by the training cell).
# NOTE(review): this cell only scores the `xgb_model` currently in memory; it does not
# apply the setting named above - retrain with it first to reproduce this result.
y_pred = xgb_model.predict(x_test)
accuracy_score(y_test, y_pred)

In [None]:
# Recorded run: learning_rate = 0.05 -> 23.3 sec (presumably the fit time printed by the training cell).
# NOTE(review): this cell only scores the `xgb_model` currently in memory; it does not
# apply the setting named above - retrain with it first to reproduce this result.
y_pred = xgb_model.predict(x_test)
accuracy_score(y_test, y_pred)

In [None]:
# Recorded run: learning_rate = 0.4 -> 21.5 sec (presumably the fit time printed by the training cell).
# NOTE(review): this cell only scores the `xgb_model` currently in memory; it does not
# apply the setting named above - retrain with it first to reproduce this result.
y_pred = xgb_model.predict(x_test)
accuracy_score(y_test, y_pred)

In [None]:
# Recorded run: max_depth = 3 -> 10.3 sec (presumably the fit time printed by the training cell).
# NOTE(review): this cell only scores the `xgb_model` currently in memory; it does not
# apply the setting named above - retrain with it first to reproduce this result.
y_pred = xgb_model.predict(x_test)
accuracy_score(y_test, y_pred)

In [None]:
# Recorded run: max_depth = 5 -> 17.9 sec (presumably the fit time printed by the training cell).
# NOTE(review): this cell only scores the `xgb_model` currently in memory; it does not
# apply the setting named above - retrain with it first to reproduce this result.
y_pred = xgb_model.predict(x_test)
accuracy_score(y_test, y_pred)

In [None]:
# Recorded run: max_depth = 7 -> 25.9 sec (presumably the fit time printed by the training cell).
# NOTE(review): this cell only scores the `xgb_model` currently in memory; it does not
# apply the setting named above - retrain with it first to reproduce this result.
y_pred = xgb_model.predict(x_test)
accuracy_score(y_test, y_pred)

In [None]:
# Recorded run: max_depth = 9 -> 33.6 sec (presumably the fit time printed by the training cell).
# NOTE(review): this cell only scores the `xgb_model` currently in memory; it does not
# apply the setting named above - retrain with it first to reproduce this result.
y_pred = xgb_model.predict(x_test)
accuracy_score(y_test, y_pred)

In [None]:
# Recorded run: max_depth = 10 -> 35.9 sec (presumably the fit time printed by the training cell).
# NOTE(review): this cell only scores the `xgb_model` currently in memory; it does not
# apply the setting named above - retrain with it first to reproduce this result.
y_pred = xgb_model.predict(x_test)
accuracy_score(y_test, y_pred)

In [None]:
# Recorded run: max_depth = 0 -> 2.1 sec (presumably the fit time printed by the training cell).
# NOTE(review): this cell only scores the `xgb_model` currently in memory; it does not
# apply the setting named above - retrain with it first to reproduce this result.
y_pred = xgb_model.predict(x_test)
accuracy_score(y_test, y_pred)

In [None]:
# Recorded run: max_depth = 15 -> 50.0 sec (presumably the fit time printed by the training cell).
# NOTE(review): this cell only scores the `xgb_model` currently in memory; it does not
# apply the setting named above - retrain with it first to reproduce this result.
y_pred = xgb_model.predict(x_test)
accuracy_score(y_test, y_pred)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score

def get_clf_eval(y_test, pred=None, pred_proba=None):
    """Print the confusion matrix and the main binary-classification metrics.

    Parameters
    ----------
    y_test : array-like
        True class labels.
    pred : array-like
        Predicted class labels. Required, although it keeps its historical
        default of None for signature compatibility.
    pred_proba : array-like, optional
        Continuous scores for the positive class (e.g. predicted probabilities).
        When given, ROC-AUC is computed from them and printed as well; when
        omitted, ROC-AUC is skipped instead of failing.

    Raises
    ------
    ValueError
        If `pred` is not supplied.
    """
    if pred is None:
        raise ValueError("get_clf_eval requires `pred` (the predicted class labels)")

    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)

    print('오차 행렬')
    print(confusion)

    if pred_proba is None:
        # No scores supplied: report the label-based metrics only.
        print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f},    F1: {3:.4f}'.format(
            accuracy, precision, recall, f1))
        return

    # ROC-AUC is computed from the continuous scores, not from the hard labels.
    roc_auc = roc_auc_score(y_test, pred_proba)
    print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f},    F1: {3:.4f}, AUC:{4:.4f}'.format(
        accuracy, precision, recall, f1, roc_auc))

In [None]:
# XGBClassifier.predict already returns hard class labels, so thresholding its output
# at 0.5 does nothing. Threshold the positive-class probabilities instead, which is
# what the 0.5 cut-off is meant for.
y_pred_proba = xgb_model.predict_proba(x_test)[:, 1]
preds = [1 if p > 0.5 else 0 for p in y_pred_proba]
print(np.round(preds[:10]))

In [None]:
# ROC-AUC must be computed from continuous scores. `y_pred` comes from
# XGBClassifier.predict, which returns hard 0/1 labels, so passing it as `pred_proba`
# distorts the AUC. Pass the positive-class probabilities instead.
get_clf_eval(y_test, preds, xgb_model.predict_proba(x_test)[:, 1])

In [None]:
import matplotlib.pyplot as plt
import xgboost
%matplotlib inline
# Create the figure and keep the axes handle so the importance plot is drawn on it.
fig, ax = plt.subplots(figsize=(10,12)) # returns the figure and its axes
# Bar chart of the feature importances of the fitted model.
# NOTE(review): the model was fitted on the array returned by scaler.transform, so the
# bars are presumably labelled generically (f0, f1, ...) rather than by column name - confirm.
xgboost.plot_importance(xgb_model, ax=ax)