# [Problem_1] Cross Validation

In [15]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score

df = pd.read_csv('/application_train.csv')
df = df.select_dtypes('number')

cleaned_df = df.fillna(0)
cleaned_df = cleaned_df[cleaned_df.columns[~cleaned_df.isnull().all()]]

y = cleaned_df['TARGET']
X = cleaned_df.drop(['TARGET'], axis=1)

X = X.to_numpy()
kf = KFold(n_splits=2)

for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index],X[test_index]
    y_train, y_test = y[train_index],y[test_index]
    

TRAIN: [15541 15542 15543 ... 31078 31079 31080] TEST: [    0     1     2 ... 15538 15539 15540]
TRAIN: [    0     1     2 ... 15538 15539 15540] TEST: [15541 15542 15543 ... 31078 31079 31080]


In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [16]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train_trans = scaler.transform(X_train)
X_test_trans = scaler.transform(X_test)

In [17]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

model_params = {
    'random_forest':{
        'model':RandomForestClassifier(),
        'params':{
            'n_estimators': [1,5,10]
        }
    },
    'logic_regression':{
        'model': LogisticRegression(solver="liblinear",multi_class="auto"),
        'params': {
            'C': [1,5,10]
        }
    }
}
scores = []

for model_name,mp in model_params.items():
    clf = GridSearchCV(mp['model'],mp['params'], return_train_score=False)
    clf.fit(X_train_trans,y_train)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })
best_model_params = pd.DataFrame(scores,columns=['model','best_score','best_params'])
best_model_params

Unnamed: 0,model,best_score,best_params
0,random_forest,0.854898,{'n_estimators': 5}
1,logic_regression,0.920983,{'C': 1}


# [Problem 3] Survey from Kaggle Notebooks

1. Hyperparameter Tuning using Grid search
2. Gradient Boosting Machine
3. Using one type of data
4. Early stopping

# [Problem 4] Creating a model with high generalization performance

In [23]:
import lightgbm as lgb

model = lgb.LGBMClassifier()

default_params = model.get_params()

N_FOLDS = 5

train_set = lgb.Dataset(data = X_train)

cv_results = lgb.cv(default_params, train_set, num_boost_round = 10000, early_stopping_rounds = 100, metrics = 'auc', nfold = N_FOLDS, seed = 42)

print('The maximum validation ROC AUC wasa: {:.5f}.'.format(cv_results['auc-mean'][-1]))
print('The optional number of boosting rounds (estimators) was {}.'.format(len(cv_results['auc-mean'])))

Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


The maximum validation ROC AUC wasa: 1.00000.
The optional number of boosting rounds (estimators) was 1.


# [Problem 5] Final model selection

In [19]:
test_df = pd.read_csv('application_test.csv')

test_cleaned_df = test_df.fillna(0)

test_X = test_cleaned_df.select_dtypes('number')

test_scaler = StandardScaler()
test_X_test_trans = scaler.fit_transform(test_X)

# est_reg_pred = clf.predict(test_X_test_trans = scaler.fit_transform(test_X)

test_reg_pred = clf.predict(test_X_test_trans)

kgl_submission = pd.concat([test_df['SK_ID_CURR'], pd.Series(test_reg_pred, name='TARGET')], axis=1)
kgl_submission.to_csv('kggl_submission.csv', index=False)
                            