In [128]:
import pandas as pd
from pandas_profiling import ProfileReport
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
import lightgbm as lgb
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.style as style
from sklearn.metrics import roc_curve, roc_auc_score, accuracy_score, classification_report, confusion_matrix

In [129]:
# Load the three competition CSVs in one pass, in the same order as before:
# the training rows, the test rows, and the sample submission (used further
# down as the source of the PatientID column for the output file).
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/train.csv",
        "data/test.csv",
        "data/sample_submission.csv",
    )
)

In [130]:
# Profiling report over the training frame. The notebook-rendering call below
# is left commented out, so this cell displays nothing by itself.
profile = ProfileReport(
    df=train,
    title="ML Olympiad - Good Health and Well Being",
)
#profile.to_notebook_iframe()

In [131]:
# Columns treated as categorical: they are cast to pandas' `category` dtype in
# both frames so the preprocessing cell can separate them from the remaining
# columns by dtype.
cols = [
    'HighBP', 'HighChol', 'CholCheck', 'Smoker', 'Stroke', 'Diabetes',
    'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare',
    'NoDocbcCost', 'GenHlth', 'DiffWalk', 'Sex',
]
train[cols] = train[cols].astype('category')
test[cols] = test[cols].astype('category')

# PatientID is excluded from the test features (the training features drop it
# as well). `errors='ignore'` keeps this cell re-runnable: `test` is rebound
# without the column, so a second execution would otherwise raise KeyError.
test = test.drop(columns='PatientID', errors='ignore')

In [132]:
# Label vector: the `target` column of the training frame.
y = train['target']

# Feature frame: the training frame without the PatientID and target columns.
X = train.drop(columns=['PatientID', 'target'])

In [133]:
# Numeric branch: standardise every non-categorical column.
num_transformer = Pipeline([
    ("scaler", StandardScaler()),
])

# Categorical branch: one-hot encode. `handle_unknown="ignore"` makes the
# encoder emit all zeros for a category value that was absent when it was
# fitted, instead of raising at transform time.
cat_transformer = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Split the feature frame by dtype. `cat_cols` / `num_cols` are sub-frames;
# only their column names are handed to the ColumnTransformer.
# NOTE: this requires `X` to still be the pandas feature frame. A later cell
# rebinds `X` to the transformed matrix, so re-running this cell after that
# one fails until `X` is rebuilt.
cat_cols = X.select_dtypes(include="category")
num_cols = X.select_dtypes(exclude="category")

# Route each group of columns through its own branch; the outputs are
# concatenated in the order listed here (numeric first, then one-hot).
preprocessor = ColumnTransformer([
    ("num", num_transformer, list(num_cols.columns)),
    ("cat", cat_transformer, list(cat_cols.columns)),
])

In [134]:
# Fit the preprocessing on the training features only, then apply that same
# fitted transformer to the test features. Calling `fit_transform` on `test`
# would re-learn the scaler statistics and the one-hot categories from the
# test rows, so the test matrix would be scaled/encoded differently from the
# data the model is trained on (and could even have a different column layout).
# `transform` raises on a category unseen during fitting unless the encoder
# was created with `handle_unknown="ignore"`.
# NOTE: `X` is rebound from the feature frame to the transformed matrix here,
# so this cell cannot be re-run without first rebuilding `X`.
X = preprocessor.fit_transform(X)
X_test = preprocessor.transform(test)

# Hold out 25% of the transformed training rows for evaluation (seeded split).
X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=0.25, random_state=0)

In [135]:
# Shuffled, seeded 10-fold stratified splitter.
# NOTE(review): no visible cell passes `skf` to the hyper-parameter search —
# confirm whether it was meant to be used as its `cv` argument.
skf = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=92,
)

In [136]:
# Base LightGBM classifier created with no explicit hyper-parameters; it is
# the estimator handed to RandomizedSearchCV in a later cell.
clf = lgb.LGBMClassifier()

In [137]:
# Candidate values for the randomized search, one list per LGBMClassifier
# hyper-parameter.
model_params = {
    'num_leaves': [20, 40, 60, 80, 100],
    'min_child_samples': [5, 10, 15],
    'max_depth': [-1, 5, 10, 20],
    'learning_rate': [0.05, 0.1, 0.2],
    'reg_alpha': [0, 0.01, 0.03],
}

# Seeded search over 100 sampled configurations of `clf`; `cv` and `scoring`
# are not passed, so scikit-learn's defaults apply.
rsearchcv = RandomizedSearchCV(
    estimator=clf,
    param_distributions=model_params,
    n_iter=100,
    random_state=1,
)

In [111]:
# Run the randomized search on the training split. Kept as the cell's final
# expression so the fitted search object is echoed as the cell output.
rsearchcv.fit(X=X_train, y=y_train)

RandomizedSearchCV(estimator=LGBMClassifier(), n_iter=100,
                   param_distributions={'learning_rate': [0.05, 0.1, 0.2],
                                        'max_depth': [-1, 5, 10, 20],
                                        'min_child_samples': [5, 10, 15],
                                        'num_leaves': [20, 40, 60, 80, 100],
                                        'reg_alpha': [0, 0.01, 0.03]},
                   random_state=1)

In [115]:
# Accuracy on the rows the search was fitted on — an optimistic figure.
print('Training accuracy {:.2f}'.format(rsearchcv.score(X_train, y_train)))
# Accuracy on the hold-out split produced by train_test_split, which was not
# used to fit the search; this is the number to judge generalisation by.
print('Hold-out accuracy {:.2f}'.format(rsearchcv.score(X_eval, y_eval)))

Training accuracy 0.91


In [112]:
# Predict labels for the preprocessed test matrix with the fitted search object.
pred = rsearchcv.predict(X=X_test)

In [113]:
# Two-column submission frame: ids copied from the sample submission, paired
# with the predictions. The explicit `index=None` is omitted because None is
# already the constructor's default.
# NOTE(review): assumes the sample submission rows are in the same order as
# the test rows — confirm.
submission = pd.DataFrame(
    data={
        'PatientID': sub["PatientID"],
        'target': pred,
    }
)

In [114]:
# Write the submission file without the DataFrame index column.
submission.to_csv(path_or_buf="submission_1.csv", index=False)