In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

In [None]:
# Both application tables are read from the same dataset directory.
datadir = "/kaggle/input/home-credit-default-risk"
trainpath = datadir + "/application_train.csv"
testpath = datadir + "/application_test.csv"

In [None]:
# Read the training table from the CSV at `trainpath`; the bare name on the
# last line makes the notebook display the resulting DataFrame.
traindata = pd.read_csv(trainpath)
traindata

In [None]:
# Read the test table from the CSV at `testpath`; the bare name on the last
# line makes the notebook display the resulting DataFrame.
testdata = pd.read_csv(testpath)
testdata

EDA

In [None]:
# Show the column labels of the training table.
traindata.columns

In [None]:
# Print the dtype of every column. The option context lifts pandas' row and
# column display limits for this block only, so the listing is not truncated.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(traindata.dtypes)

In [None]:
# Check whether the SK_ID_CURR column has the "float" dtype (evaluates to a bool).
traindata["SK_ID_CURR"].dtype == "float"

In [None]:
# Plot the distribution of every column from the third one onwards (the first
# two columns are skipped): a histogram for numeric columns and a horizontal
# bar chart of value counts for everything else.
featurecols = traindata.columns[2:]
ncols = 3
# Derive the grid height from the data instead of hard-coding 40 rows, so the
# cell keeps working when the number of columns changes.
nrows = max(1, -(-len(featurecols) // ncols))  # ceiling division
fig, ax = plt.subplots(nrows, ncols, squeeze=False)
# Same scale as before: 20in per column, 8.75in per row (60 x 350 for 40 x 3).
fig.set_size_inches(20 * ncols, 8.75 * nrows)
for i, col in enumerate(featurecols):
    cell = ax[i // ncols, i % ncols]
    cell.set_title(col)
    values = traindata[col]
    # is_numeric_dtype matches every int/float width; comparing the dtype with
    # the strings "int"/"float" only matches the platform-default widths.
    # Booleans are kept on the bar-chart path, as before.
    isnumeric = pd.api.types.is_numeric_dtype(values) and not pd.api.types.is_bool_dtype(values)
    if isnumeric:
        # Missing values carry no information for the histogram; drop them
        # rather than rely on how the plotting backend treats NaN.
        cell.hist(values.dropna())
    else:
        values.value_counts().plot(kind="barh", ax=cell)
# Hide the unused axes in the last row, if any.
for cell in ax.ravel()[len(featurecols):]:
    cell.set_visible(False)

In [None]:
from matplotlib.pyplot import figure

# Annotated correlation heatmap of all columns after the first one.
# Non-numeric columns are filtered out explicitly: DataFrame.corr() raises on
# string columns in pandas >= 2.0 instead of silently ignoring them.
figure(figsize=(75, 81))
sns.heatmap(
    traindata.iloc[:, 1:].select_dtypes(include=["number", "bool"]).corr(),
    annot=True,
    cmap='Reds',
)

In [None]:
# Pairwise correlations of all numeric columns after the first one.
# Non-numeric columns are filtered out explicitly: DataFrame.corr() raises on
# string columns in pandas >= 2.0 instead of silently ignoring them.
correlations = traindata.iloc[:, 1:].select_dtypes(include=["number", "bool"]).corr()

Choosing and deleting strongly correlated features

In [None]:
# Collect the positional index (within `correlations`) of every column that is
# strongly correlated with an earlier column. Only the upper triangle (j > i)
# is scanned, so of each correlated pair the later column is the one recorded.
# The absolute value is compared so that strong negative correlations, which
# make a feature just as redundant, are caught too. NaN entries never match.
corrthreshold = 0.9
strong = correlations.abs().to_numpy() > corrthreshold
colarray = [
    j
    for i in range(len(correlations))
    for j in range(i + 1, len(correlations))
    if strong[i, j]
]
colarray

In [None]:
# Remove repeated indices from the list of correlated column positions.
colarray = [*set(colarray)]
colarray

In [None]:
# Translate the recorded positions into column labels of `correlations`.
colstodelete = correlations.columns[colarray].tolist()
colstodelete

In [None]:
# SK_ID_CURR is scheduled for removal together with the correlated columns.
colstodelete.extend(["SK_ID_CURR"])

In [None]:
# Keep the test-set SK_ID_CURR values: the column is listed in `colstodelete`
# and is dropped from `testdata`, but the submission table needs it.
testids = testdata["SK_ID_CURR"]
testids

In [None]:
# Remove the selected columns from both tables.
traindata = traindata.drop(columns=colstodelete)
testdata = testdata.drop(columns=colstodelete)

Missing values

In [None]:
# Print the number of missing values in every column of the training table,
# with pandas' display limits lifted so the listing is not truncated.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(traindata.isnull().sum())

In [None]:
# Print the number of missing values in every column of the test table,
# with pandas' display limits lifted so the listing is not truncated.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(testdata.isnull().sum())

In [None]:
# Names of the object-dtype (categorical) columns; filled in by the imputation loop.
catcols = list()

In [None]:
# Impute or drop missing values column by column. All statistics (mode,
# median) come from the training table and are applied to both tables.
#   * more than `nullthreshold` missing values in the training table:
#       - object column  -> missing values become their own category "na"
#       - other column   -> the column is dropped from both tables
#   * otherwise:
#       - object column  -> forward-fill, then the training mode for whatever
#                           forward-fill cannot reach (leading gaps)
#       - other column   -> training median
# Every object column that is kept is recorded once in `catcols`.
nullthreshold = 100000
for c in traindata.columns:
    # Some training columns may be absent from the test table (presumably at
    # least the TARGET label); never touch a column the test table lacks.
    intest = c in testdata.columns
    istext = traindata[c].dtype == "O"
    if traindata[c].isnull().sum() > nullthreshold:
        if istext:
            traindata[c] = traindata[c].fillna("na")
            if intest:
                testdata[c] = testdata[c].fillna("na")
        else:
            traindata = traindata.drop(c, axis=1)
            if intest:
                testdata = testdata.drop(c, axis=1)
    elif istext:
        # Series.mode() returns a Series. Passing that Series to fillna aligns
        # it on the index, so only rows whose labels match the mode positions
        # would be filled. Take the first mode as a scalar instead.
        modes = traindata[c].mode()
        traindata[c] = traindata[c].ffill()
        if intest:
            testdata[c] = testdata[c].ffill()
        if not modes.empty:
            traindata[c] = traindata[c].fillna(modes.iloc[0])
            if intest:
                testdata[c] = testdata[c].fillna(modes.iloc[0])
    else:
        median = traindata[c].median()
        traindata[c] = traindata[c].fillna(median)
        if intest:
            testdata[c] = testdata[c].fillna(median)
    # Guard against duplicates so re-running this cell does not grow catcols.
    if istext and c not in catcols:
        catcols.append(c)

In [None]:
# Print summary statistics for every column of the training table
# (include='all' covers non-numeric columns too), without display truncation.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(traindata.describe(include='all'))

One-hot encoding of categorical features

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# One-hot encode the columns listed in `catcols`. The encoder is fitted on the
# training table only and then applied to the test table.
#   drop='if_binary'        -> a column with exactly two categories yields a
#                              single 0/1 column
#   handle_unknown='ignore' -> categories not seen during fit encode as zeros
enc = OneHotEncoder(handle_unknown='ignore', drop='if_binary')
trainenc = enc.fit_transform(traindata[catcols]).toarray()
testenc = enc.transform(testdata[catcols]).toarray()

# Name the output columns from the encoder's own fitted categories. The
# encoder emits categories in sorted order, so names derived from
# Series.unique() (order of first appearance) would label the wrong columns.
newcatcols = []
for c, cats in zip(catcols, enc.categories_):
    if len(cats) <= 2:
        # One output column: either a single category, or a binary column
        # collapsed by drop='if_binary'.
        newcatcols.append(c + "_enc")
    else:
        # str() keeps the concatenation safe for non-string categories.
        newcatcols.extend(c + "_" + str(u) for u in cats)

# Build the frames on the original row index so the column assignment aligns
# row-for-row even if the tables no longer carry a default RangeIndex.
traindata[newcatcols] = pd.DataFrame(trainenc, columns=newcatcols, index=traindata.index)
testdata[newcatcols] = pd.DataFrame(testenc, columns=newcatcols, index=testdata.index)

In [None]:
# Lift the column display limit for the rest of the session, then preview the table.
pd.set_option("display.max_columns", None)
traindata.head()

In [None]:
# The original categorical columns are replaced by their encoded versions.
traindata = traindata.drop(columns=catcols)
testdata = testdata.drop(columns=catcols)

In [None]:
# Preview the first rows of the training table after the categorical columns were dropped.
traindata.head()

Normalizing features

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise every column that is neither TARGET nor one of the encoded
# columns. Each scaler is fitted on the training column and then applied to
# the same column of both tables.
scalecols = [c for c in traindata.columns if c != "TARGET" and c not in newcatcols]
for c in scalecols:
    traincol = traindata[c].values.reshape(-1, 1)
    scaler = StandardScaler().fit(traincol)
    traindata[c] = scaler.transform(traincol)
    testdata[c] = scaler.transform(testdata[c].values.reshape(-1, 1))

Train/validation split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 5% of the training rows for validation.
#   random_state -> the split, and every score computed from it, is
#                   reproducible across kernel restarts
#   stratify     -> the small validation set keeps the same TARGET ratio as
#                   the training set
traindata, valdata = train_test_split(
    traindata, test_size=0.05, random_state=42, stratify=traindata["TARGET"]
)

In [None]:
# Separate the TARGET label from the feature columns; the test table is used as-is.
y_train, y_val = traindata["TARGET"], valdata["TARGET"]
X_train = traindata.drop(columns="TARGET")
X_val = valdata.drop(columns="TARGET")
X_test = testdata

Use a decision tree to predict probabilities for the validation set, then evaluate those predictions with the roc_auc_score metric.

In [None]:
from sklearn import tree
# clf = tree.DecisionTreeClassifier()
# clf = clf.fit(X_train, y_train)
# y_val_pred = clf.predict_proba(X_val)[:, 1]

In [None]:
from sklearn.metrics import roc_auc_score
# roc_auc_score(y_val, y_val_pred)

In [None]:
from sklearn.linear_model import LogisticRegression
# clf = LogisticRegression(solver="liblinear")
# clf = clf.fit(X_train, y_train)
# y_val_pred = clf.predict_proba(X_val)[:, 1]
# roc_auc_score(y_val, y_val_pred)

In [None]:
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# (display name, unfitted estimator) pairs for the quick baseline comparison.
# Every entry must support predict_proba, which the comparison loop calls.
classifiers = [
    ("Tree", tree.DecisionTreeClassifier()),
    ("Log regression", LogisticRegression(solver="liblinear")),
#     ("SVC lin", SVC(gamma='auto', kernel = 'linear')),
#     ("SVC poly", SVC(gamma='auto', kernel = 'poly')),
#     ("SVC rbf", SVC(gamma='auto', kernel = 'rbf')),
    # "log_loss" is the current name of the logistic loss; it matches the
    # spelling used in the SGD hyper-parameter search in this notebook
    # ("log" is the deprecated alias).
    ("SGD", SGDClassifier(penalty="l2", loss="log_loss")),
    ("KNN", KNeighborsClassifier(n_neighbors=3)),
    #("GPC", GaussianProcessClassifier(kernel=RBF(1.0))),
    ("GNB", GaussianNB()),
    ("Ada", AdaBoostClassifier(n_estimators=100)),
    ("RandFor", RandomForestClassifier(n_estimators=10))
]

In [None]:
# Fit each candidate on the first 40000 training rows and report ROC AUC on
# that same subset and on the validation set.
X_fit, y_fit = X_train[:40000], y_train[:40000]
for classname, classifier in classifiers:
    classifier = classifier.fit(X_fit, y_fit)
    y_train_pred = classifier.predict_proba(X_fit)[:, 1]
    y_val_pred = classifier.predict_proba(X_val)[:, 1]
    trainscore = str(roc_auc_score(y_fit, y_train_pred).round(5))
    valscore = str(roc_auc_score(y_val, y_val_pred).round(5))
    print(classname + ": Train score = " + trainscore + ", val score = " + valscore)
    

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.utils.fixes import loguniform
from scipy.stats import uniform

In [None]:
def report(results, n_top=3):
    """Print the best-ranked candidates of a hyper-parameter search.

    Parameters
    ----------
    results : mapping
        Must provide "rank_test_score", "mean_test_score", "std_test_score"
        and "params" entries, each indexable by candidate position (the
        notebook passes the `cv_results_` of a fitted search).
    n_top : int, default 3
        Highest rank to print. Ranks 1..n_top are reported in order, and every
        candidate tied on a rank is printed.

    Returns
    -------
    None. The report is written to standard output.
    """
    ranks = results["rank_test_score"]
    for rank in range(1, n_top + 1):
        # Positions of all candidates holding this rank (several on ties).
        for idx in np.flatnonzero(ranks == rank):
            print("Model with rank: {0}".format(rank))
            mean = results["mean_test_score"][idx]
            std = results["std_test_score"][idx]
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(mean, std))
            print("Parameters: {0}".format(results["params"][idx]))
            print("")

In [None]:
# Randomised hyper-parameter search for logistic regression, scored by ROC AUC.
clf = LogisticRegression(max_iter=300)

# The search space is split into two sub-spaces so that only valid
# solver/penalty combinations are sampled: liblinear does not support the
# elasticnet penalty, and elasticnet (saga only) requires an l1_ratio. Sampling
# all three penalties against both solvers produced candidates that can never
# fit, which wasted search iterations.
param_dist = [
    {
        "solver": ["liblinear", "saga"],
        "penalty": ["l1", "l2"],
        "C": loguniform(1e-3, 1e1),
    },
    {
        "solver": ["saga"],
        "penalty": ["elasticnet"],
        "l1_ratio": uniform(0, 1),
        "C": loguniform(1e-3, 1e1),
    },
]

# random_state makes the sampled candidates reproducible between runs.
random_search = RandomizedSearchCV(
    clf, param_distributions=param_dist, n_iter=20, scoring="roc_auc", random_state=42
)

random_search.fit(X_train, y_train)

report(random_search.cv_results_)

In [None]:
# Randomised hyper-parameter search for AdaBoost, scored by ROC AUC.
clf = AdaBoostClassifier()

param_dist = {
    "n_estimators": list(range(40, 200)),
    "learning_rate": loguniform(0.01, 1.0),
    "algorithm": ['SAMME', 'SAMME.R']
}

# random_state makes the sampled candidates reproducible between runs, so the
# parameters later hard-coded into the final model can be traced back to a
# specific search.
random_search = RandomizedSearchCV(
    clf, param_distributions=param_dist, n_iter=10, scoring="roc_auc", random_state=42
)

random_search.fit(X_train, y_train)

report(random_search.cv_results_)

In [None]:
# Randomised hyper-parameter search for SGDClassifier, scored by ROC AUC.
# The estimator is seeded as well, so repeated fits of the same candidate give
# the same score.
clf = SGDClassifier(random_state=42)

param_dist = {
    "loss": ['hinge', 'log_loss', 'modified_huber', 'squared_hinge', 'perceptron', 'squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
    "penalty":['l2', 'l1', 'elasticnet'],
    "alpha": loguniform(1e-6, 1e-2),
    "l1_ratio": loguniform(0.1, 0.3),
    "epsilon":loguniform(1e-2, 0.3),
    "learning_rate": ['optimal','invscaling', 'adaptive'],
    "eta0": loguniform(1e-4, 1.0)
}

# random_state makes the sampled candidates reproducible between runs.
random_search = RandomizedSearchCV(
    clf, param_distributions=param_dist, n_iter=20, scoring="roc_auc", random_state=42
)

random_search.fit(X_train, y_train)

report(random_search.cv_results_)

In [None]:
# Fit the final AdaBoost model on the full training split and report ROC AUC
# on the training and validation sets.
classifier = AdaBoostClassifier(
    n_estimators=101, learning_rate=0.82792, algorithm='SAMME.R'
).fit(X_train, y_train)
y_train_pred = classifier.predict_proba(X_train)[:, 1]
y_val_pred = classifier.predict_proba(X_val)[:, 1]
trainscore = str(roc_auc_score(y_train, y_train_pred).round(5))
valscore = str(roc_auc_score(y_val, y_val_pred).round(5))
print("Train score = " + trainscore + ", val score = " + valscore)

In [None]:
# Column 1 of predict_proba for every test row, taken from the fitted
# `classifier`; the bare name on the last line displays the array.
y_test_pred = classifier.predict_proba(X_test)[:,1]
y_test_pred

In [None]:
# Assemble the submission table: the saved test IDs next to the predicted
# TARGET probabilities.
result = pd.DataFrame({"SK_ID_CURR": testids, "TARGET": y_test_pred})
result

In [None]:
# Write the submission table to the Kaggle working directory, without the row index.
result.to_csv("/kaggle/working/ada1.csv", index=False)

My test score: 0.72560