In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
pip install creditHome

In [None]:
from creditHome.eda1 import *

In [None]:
def grab_col_names1(dataframe, cat_th=10, car_th=20):
    """
    Classify the columns of a dataframe as categorical, numerical or
    categorical-but-cardinal, print a short summary and return the three lists.

    Note: numeric-looking columns with few distinct values are counted as categorical.

    Parameters
    ------
        dataframe: dataframe
                Dataframe whose column names are to be classified
        cat_th: int, optional
                A non-object column with fewer than cat_th distinct values is
                treated as categorical
        car_th: int, optional
                An object column with more than car_th distinct values is
                treated as cardinal

    Returns
    ------
        cat_cols: list
                Categorical columns: the non-cardinal object columns first, followed by
                the numeric-but-categorical columns
        num_cols: list
                Numerical columns (non-object columns that are not numeric-but-categorical)
        cat_but_car: list
                Object columns that are cardinal

    Examples
    ------
        import seaborn as sns
        df = sns.load_dataset("iris")
        print(grab_col_names1(df))


    Notes
    ------
        cat_cols + num_cols + cat_but_car = total number of columns
        num_but_cat is only reported in the printed summary; its columns are inside cat_cols.

    """

    columns = list(dataframe.columns)

    # The dtype test and the distinct-value count are each computed once per column and
    # reused below; nunique() is a full scan of the column, so repeating it is expensive
    # on large frames.
    is_object = {col: dataframe[col].dtypes == "O" for col in columns}
    n_unique = {col: dataframe[col].nunique() for col in columns}

    # cat_cols, cat_but_car
    object_cols = [col for col in columns if is_object[col]]
    num_but_cat = [col for col in columns if n_unique[col] < cat_th and not is_object[col]]
    cat_but_car = [col for col in columns if n_unique[col] > car_th and is_object[col]]
    # Object columns keep their position ahead of the numeric-but-categorical ones
    cat_cols = [col for col in object_cols + num_but_cat if col not in cat_but_car]

    # num_cols
    num_cols = [col for col in columns if not is_object[col] and col not in num_but_cat]

    print(f"Observations: {dataframe.shape[0]}")
    print(f"Variables: {dataframe.shape[1]}")
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')
    return cat_cols, num_cols, cat_but_car

In [None]:
from sklearn.model_selection import GridSearchCV, cross_validate,RandomizedSearchCV
from lightgbm import LGBMClassifier
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np


import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas deprecation / chained-assignment warnings.
warnings.filterwarnings('ignore')


# Display options: show all columns and all rows, print floats with three decimals,
# and use a 170-character wide output.
# NOTE(review): max_rows=None prints a frame in full if one is displayed without .head().
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
pd.set_option('display.width', 170)



# Load the raw tables: bureau_balance (df1), bureau (df2) and the application
# train / test sets.
df1 = pd.read_csv("/kaggle/input/home-credit-default-risk/bureau_balance.csv")
df2 = pd.read_csv("/kaggle/input/home-credit-default-risk/bureau.csv")
train = pd.read_csv("/kaggle/input/home-credit-default-risk/application_train.csv")
test = pd.read_csv("/kaggle/input/home-credit-default-risk/application_test.csv")

# Stack train and test into one frame with a fresh 0..n-1 index.
# DataFrame.append was removed in pandas 2.0; pd.concat with ignore_index=True gives
# the same result as append(...).reset_index(drop=True). Columns missing from one
# frame (e.g. TARGET in test) are filled with NaN.
dff = pd.concat([train, test], ignore_index=True)
dff.head()



# Quick exploration of bureau_balance (df1).
# check_df, cat_summary and num_summary are not defined in this notebook; presumably
# they come from the star import of creditHome.eda1 — TODO confirm.
check_df(df1)

# Are there any missing values in the data? (bureau_balance)
# NOTE(review): this expression is not the last one in the cell, so its result is
# not displayed; wrap it in display()/print() if the output is wanted.
df1.isnull().sum()




# Split the columns of bureau_balance into categorical / numerical / cardinal groups
cat_cols, num_cols, cat_but_car = grab_col_names1(df1)

# Summarise every categorical column
for col in cat_cols:
    cat_summary(df1, col)

# Summarise every numerical column
for col in num_cols:
    num_summary(df1, col)




# One-Hot Encoder
# presumably returns the encoded frame and the list of newly created dummy column
# names (one_hot_encoder is not defined in this notebook) — TODO confirm
bb, bb_cat = one_hot_encoder(df1, nan_as_category=False)

# Bureau balance: Perform aggregations and merge with bureau.csv
bb_aggregations = {'MONTHS_BALANCE': ['min', 'max', 'size']}

# Every column listed in bb_cat is aggregated with its mean per SK_ID_BUREAU
for col in bb_cat:
    bb_aggregations[col] = ['mean']

bb_agg = bb.groupby('SK_ID_BUREAU').agg(bb_aggregations)
# Flatten the (column, statistic) pairs produced by agg into "COLUMN_STATISTIC" names
bb_agg.columns = pd.Index([e[0] + "_" + e[1].upper() for e in bb_agg.columns.tolist()])

# Row-wise sums of the per-status means, grouped as C+0, 1+2, 3+4+5 and 1..5.
# The STATUS_*_MEAN names assume the encoder produced STATUS_<value> dummy columns.
bb_agg["STATUS_C0_MEAN_SUM"] = bb_agg[["STATUS_C_MEAN", "STATUS_0_MEAN"]].sum(axis = 1)
bb_agg["STATUS_12_MEAN_SUM"] = bb_agg[["STATUS_1_MEAN", "STATUS_2_MEAN"]].sum(axis = 1)
bb_agg["STATUS_345_MEAN_SUM"] = bb_agg[["STATUS_3_MEAN", "STATUS_4_MEAN", "STATUS_5_MEAN"]].sum(axis = 1)
bb_agg["STATUS_12345_MEAN_SUM"] = bb_agg[["STATUS_1_MEAN", "STATUS_2_MEAN", "STATUS_3_MEAN", "STATUS_4_MEAN", "STATUS_5_MEAN"]].sum(axis = 1)

# For every SK_ID_BUREAU: the smallest MONTHS_BALANCE among the rows with STATUS "C",
# stored as an absolute value in MONTHS_BALANCE_FIRST_C
closed = df1[df1.STATUS == "C"]
closed = closed.groupby("SK_ID_BUREAU").MONTHS_BALANCE.min().reset_index().rename({"MONTHS_BALANCE":"MONTHS_BALANCE_FIRST_C"}, axis = 1)
closed["MONTHS_BALANCE_FIRST_C"] = np.abs(closed["MONTHS_BALANCE_FIRST_C"])
# Left merge: ids without any "C" row get NaN in MONTHS_BALANCE_FIRST_C
bb_agg = pd.merge(bb_agg, closed, how = "left", on = "SK_ID_BUREAU")
# |MONTHS_BALANCE_MIN| minus MONTHS_BALANCE_FIRST_C
bb_agg["MONTHS_BALANCE_CLOSED_DIF"] = np.abs(bb_agg.MONTHS_BALANCE_MIN) - bb_agg.MONTHS_BALANCE_FIRST_C

# Drop the intermediates that are no longer needed
del closed, bb_aggregations, bb_cat

bb_agg.shape

# Look at the bureau table (df2).
# NOTE(review): only the last expression of a cell is displayed, so the bare
# expressions in the middle of this cell produce no output.
df2.head()
df2.shape

check_df(df2)

df2["CREDIT_TYPE"].head(20)
df2["AMT_CREDIT_MAX_OVERDUE"].head(20)

# Attach the bureau_balance aggregates to every bureau record; records without
# balance history keep NaN in the aggregate columns (left join)
df = pd.merge(df2, bb_agg, how = "left", on = "SK_ID_BUREAU")
del bb_agg

df.head()

df.shape

# Correlation
# corr_plot / high_correlation are not defined in this notebook; the identifier
# columns are passed through the `remove` argument and Spearman is requested.
corr_plot(df, remove=['SK_ID_CURR','SK_ID_BUREAU'], corr_coef = "spearman")

high_correlation(df, remove=['SK_ID_CURR','SK_ID_BUREAU'], corr_coef = "spearman", corr_value = 0.7)

# How many times the applicant took a loan before: number of bureau records per SK_ID_CURR
previous_loan_counts = df.groupby('SK_ID_CURR', as_index=False)['SK_ID_BUREAU'].count().rename(columns = {'SK_ID_BUREAU': 'PREVIOUS_LOAN_COUNTS'})

dff = dff.merge(previous_loan_counts, on = 'SK_ID_CURR', how = 'left')
dff.head()

# Applicants without any bureau record get a count of 0
dff.loc[dff["PREVIOUS_LOAN_COUNTS"].isnull(),"PREVIOUS_LOAN_COUNTS"]=0

# Inspection only: bureau rows with at least one prolongation (result is not stored)
df.loc[df["CNT_CREDIT_PROLONG"]!=0]


# Total number of prolongations per applicant
A= df.groupby('SK_ID_CURR', as_index=False)['CNT_CREDIT_PROLONG'].sum().rename(columns = {'CNT_CREDIT_PROLONG': 'PRE_CNT_CREDIT_PROLONG_SUM'})

dff = dff.merge(A, on = 'SK_ID_CURR', how = 'left')
dff.head(10)

# Prolongations per previous loan; NaN for applicants without bureau records
dff["CREDIT_RATE"]=dff["PRE_CNT_CREDIT_PROLONG_SUM"]/dff["PREVIOUS_LOAN_COUNTS"]
# Total credit duration (days)
df["CREDIT_DAY_SUM"] = df["DAYS_CREDIT"]-df["DAYS_CREDIT_ENDDATE"]

A= df.groupby('SK_ID_CURR', as_index=False)["CREDIT_DAY_SUM"].sum()
dff = dff.merge(A, on = 'SK_ID_CURR', how = 'left')

# Missing sums become 0, then the sign of the whole column is flipped
dff.loc[dff["CREDIT_DAY_SUM"].isnull(),"CREDIT_DAY_SUM"]=0
dff["CREDIT_DAY_SUM"]=dff["CREDIT_DAY_SUM"]*(-1)
dff.head()

# Flag is 0 when DAYS_CREDIT_ENDDATE > 0 and 1 otherwise (NaN also yields 1,
# because NaN > 0 is False).
# NOTE(review): if a positive DAYS_CREDIT_ENDDATE means the credit is still running,
# this flag marks the NON-active credits and the condition is inverted — confirm.
df["ACTIVE_CREDIT_COUNT"]=[0 if i>0 else 1 for i in df["DAYS_CREDIT_ENDDATE"]]

A= df.groupby('SK_ID_CURR', as_index=False)["ACTIVE_CREDIT_COUNT"].sum()
dff = dff.merge(A, on = 'SK_ID_CURR', how = 'left')
dff.head()

# Share of flagged credits among all previous loans of the applicant
dff["CREDIT_ACTIVE_RATE"]=dff["ACTIVE_CREDIT_COUNT"] / dff["PREVIOUS_LOAN_COUNTS"]

# grab_col_names is NOT the grab_col_names1 defined in this notebook: it is unpacked
# into four values in a different order, so it presumably comes from the
# creditHome.eda1 star import — TODO confirm its return order.
cat_cols, cat_but_car, num_cols, num_but_cat = grab_col_names(df, car_th=10)
num_cols
# Identifier columns are not treated as numeric features
num_cols=[col for col in num_cols if col not in ["SK_ID_CURR" ,"SK_ID_BUREAU"]]


# presumably maps the CREDIT_ACTIVE labels to 0..3 and stores them in a new
# NEW_CREDIT_ACTIVE column of df in place (the return value is not used and the
# next call reads "NEW_CREDIT_ACTIVE") — TODO confirm
new_sorting(df,"CREDIT_ACTIVE",['Closed', 'Active', 'Sold', 'Bad debt'],[0,1,2,3])
df.head()


rare_analyser(df, "NEW_CREDIT_ACTIVE", cat_cols)

df=df.drop("CREDIT_CURRENCY",axis=1)
df.head()

df=df.drop("CREDIT_ACTIVE",axis=1)
df.head()

# Recompute the column groups after the two drops above
cat_cols, cat_but_car, num_cols, num_but_cat = grab_col_names(df, car_th=10)



# rare_encoder is called with the threshold 0.001 and the categorical columns;
# presumably it groups categories rarer than the threshold — TODO confirm
df=rare_encoder(df, 0.001, cat_cols)

# These two helper columns were already aggregated per applicant and merged into dff
df = df.drop("CREDIT_DAY_SUM",axis=1)
df = df.drop("ACTIVE_CREDIT_COUNT",axis=1)


# Sum every remaining non-identifier column per applicant and attach the result to dff.
# NOTE(review): the sum is applied to all remaining columns, whatever their dtype;
# columns whose names also exist in dff receive the default _x / _y merge suffixes.
col=[i for i in df.columns if i not in ["SK_ID_CURR","SK_ID_BUREAU"]]
df=df.groupby('SK_ID_CURR', as_index=False)[col].sum()
dff = dff.merge(df,on='SK_ID_CURR', how="left")

# Up to this point: Dilara and Sevval



In [None]:
from creditHome.eda2 import *

In [None]:
# Work on a copy of the merged application + bureau frame
df = dff.copy()
df.head()



# Shapes of the raw train / test sets (the printed labels are in Turkish)
print("Train veri seti boyutları:",train.shape)
print("Test veri seti boyutları:",test.shape)


# control_df is not defined in this notebook; its result is used as a mapping from
# column name to a ratio (presumably the share of missing values — TODO confirm)
rate_null_columns=control_df(df,train)
# Columns whose ratio exceeds 0.15, except TARGET.
# NOTE(review): useless_columns is not used again in the visible cells.
useless_columns=[col for col in rate_null_columns if rate_null_columns[col] > 0.15 and col != "TARGET"]

cat_cols,num_cols=grab_cat_num(df)

# Per-column analysis of the numerical columns
for col in num_cols:
    num_analysis(df,col)
    
# Per-column summary of the categorical columns (third argument True; presumably
# enables plotting — TODO confirm)
for col in cat_cols:
    cat_summary(df,col,True)


# gg is not defined in this notebook; it receives the categorical columns and the
# name of the target column
gg(df[cat_cols],"TARGET")




# --- Row filters -------------------------------------------------------------
# Gender problem: drop the rows whose CODE_GENDER contains the placeholder "XNA"
df = df[~(df.CODE_GENDER.str.contains("XNA"))]
# Drop the rows whose family status is unknown. The explicit copy makes df an
# independent frame, so the column assignments below never write into a slice.
df = df[df.NAME_FAMILY_STATUS != "Unknown"].copy()

# --- Placeholder values -> NaN -------------------------------------------------
# These replacements run BEFORE the ratio features: the ratios below divide by
# DAYS_EMPLOYED and use DAYS_LAST_PHONE_CHANGE, so they must see NaN instead of the
# placeholder values. The result is assigned back instead of calling
# replace(..., inplace=True) on df[col]: the chained in-place form does not update
# df when pandas Copy-on-Write is active.
# 365243 is not a possible value for DAYS_EMPLOYED
df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].replace(365243, np.nan)
# When did the customer last change the phone? 0 is treated as missing
df['DAYS_LAST_PHONE_CHANGE'] = df['DAYS_LAST_PHONE_CHANGE'].replace(0, np.nan)

# --- New features ------------------------------------------------------------
# How many documents were provided
flag = [x for x in df.columns if 'FLAG_DOC' in x]
df['DOCUMENT_COUNT'] = df[flag].sum(axis=1)
# Feature built from the external-source scores
df['EXT_SOURCES_PROD'] = df['EXT_SOURCE_1'] * df['EXT_SOURCE_2'] * df['EXT_SOURCE_3']
# Credit amount relative to the annuity and to the goods price.
# "AMT_ANNUITY_x" is the suffixed name this column carries in the merged frame.
df['CREDIT_TO_ANNUITY_RATIO'] = df['AMT_CREDIT'] / df["AMT_ANNUITY_x"]
df['CREDIT_TO_GOODS_RATIO'] = df['AMT_CREDIT'] / df['AMT_GOODS_PRICE']
# Annuity and total credit amount relative to income
df['ANNUITY_TO_INCOME_RATIO'] = df["AMT_ANNUITY_x"] / df['AMT_INCOME_TOTAL']
df['CREDIT_TO_INCOME_RATIO'] = df['AMT_CREDIT'] / df['AMT_INCOME_TOTAL']
# Income relative to days employed and to age in days
df['INCOME_TO_EMPLOYED_RATIO'] = df['AMT_INCOME_TOTAL'] / df['DAYS_EMPLOYED']
df['INCOME_TO_BIRTH_RATIO'] = df['AMT_INCOME_TOTAL'] / df['DAYS_BIRTH']
# Share of life spent employed
df['EMPLOYED_TO_BIRTH_RATIO'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
# Car age relative to days employed
df['CAR_TO_EMPLOYED_RATIO'] = df['OWN_CAR_AGE'] / df['DAYS_EMPLOYED']
# Days since the last phone change relative to days employed
df['PHONE_TO_EMPLOYED_RATIO'] = df['DAYS_LAST_PHONE_CHANGE'] / df['DAYS_EMPLOYED']
# Income per family member
df['CNT_FAM_INCOME_RATIO'] = df['AMT_INCOME_TOTAL']/df['CNT_FAM_MEMBERS']

# --- Column clean-up ---------------------------------------------------------
# DROP
cols = ["NAME_HOUSING_TYPE", "WEEKDAY_APPR_PROCESS_START", "FONDKAPREMONT_MODE", "WALLSMATERIAL_MODE", "HOUSETYPE_MODE","EMERGENCYSTATE_MODE","FLAG_MOBIL", "FLAG_EMP_PHONE","FLAG_WORK_PHONE", "FLAG_CONT_MOBILE", "FLAG_PHONE", "FLAG_EMAIL"]
df = df.drop(cols, axis = 1)

# REGION: the six region / city mismatch flags are summed into one feature
cols = ["REG_REGION_NOT_LIVE_REGION","REG_REGION_NOT_WORK_REGION", "LIVE_REGION_NOT_WORK_REGION", "REG_CITY_NOT_LIVE_CITY","REG_CITY_NOT_WORK_CITY","LIVE_CITY_NOT_WORK_CITY"]
df["REGION"] = df[cols].sum(axis = 1)
df = df.drop(cols, axis = 1)

# Drop FLAG_DOCUMENT (already summarised in DOCUMENT_COUNT)
df = df.drop(df.columns[df.columns.str.contains("FLAG_DOCUMENT")], axis = 1)

# --- ORGANIZATION_TYPE grouping ---------------------------------------------
# Sub-types that share a prefix are collapsed first, then related labels are grouped.
df["ORGANIZATION_TYPE"] = np.where(df.ORGANIZATION_TYPE.str.contains("Business Entity"), "Business Entity", df.ORGANIZATION_TYPE)
df["ORGANIZATION_TYPE"] = np.where(df.ORGANIZATION_TYPE.str.contains("Industry"), "Industry", df.ORGANIZATION_TYPE)
df["ORGANIZATION_TYPE"] = np.where(df.ORGANIZATION_TYPE.str.contains("Trade"), "Trade", df.ORGANIZATION_TYPE)
df["ORGANIZATION_TYPE"] = np.where(df.ORGANIZATION_TYPE.str.contains("Transport"), "Transport", df.ORGANIZATION_TYPE)
df["ORGANIZATION_TYPE"] = np.where(df.ORGANIZATION_TYPE.isin(["School", "Kindergarten", "University"]), "Education", df.ORGANIZATION_TYPE)
# NOTE(review): the original list only contained the spelling "Goverment"; the
# correctly spelled "Government" is accepted as well so the label is grouped under
# either spelling — confirm the exact label against the data.
df["ORGANIZATION_TYPE"] = np.where(df.ORGANIZATION_TYPE.isin(["Emergency","Police", "Medicine","Goverment", "Government", "Postal", "Military", "Security Ministries", "Legal Services"]), "Public", df.ORGANIZATION_TYPE)
df["ORGANIZATION_TYPE"] = np.where(df.ORGANIZATION_TYPE.isin(["Bank", "Insurance"]), "Finance", df.ORGANIZATION_TYPE)
df["ORGANIZATION_TYPE"] = np.where(df.ORGANIZATION_TYPE.isin(["Realtor", "Housing"]), "House", df.ORGANIZATION_TYPE)
df["ORGANIZATION_TYPE"] = np.where(df.ORGANIZATION_TYPE.isin(["Hotel", "Restaurant"]), "HotelRestaurant", df.ORGANIZATION_TYPE)
df["ORGANIZATION_TYPE"] = np.where(df.ORGANIZATION_TYPE.isin(["Cleaning","Electricity", "Telecom", "Mobile", "Advertising", "Religion", "Culture"]), "Other", df.ORGANIZATION_TYPE)



import re

# Column groups of the engineered frame; TARGET must not be one-hot encoded.
# NOTE(review): grab_col_names / one_hot_encoder are not defined in this notebook;
# here they are used with a 3-value return and a (frame, columns, flag) call.
cat_cols, num_cols, cat_but_car = grab_col_names(df)
cat_cols = [name for name in cat_cols if name != 'TARGET']

df = one_hot_encoder(df, cat_cols, True)

# Normalise the column names: spaces and dashes become underscores, "_/_" collapses
# to "_", everything is upper-cased, then any character outside [A-Za-z0-9_] is removed.
df.columns = [
    str(name).replace(" ", "_").replace("-", "_").replace("_/_", "_").upper()
    for name in df.columns
]
df = df.rename(columns=lambda name: re.sub('[^A-Za-z0-9_]+', '', name))

# Rows with a known TARGET form the training set, rows without it the test set.
# Note that this rebinds the names train / test that held the raw CSV frames.
train = df[df.TARGET.notnull()]
test = df[df.TARGET.isnull()]

# Model matrices: everything except the target and the applicant identifier
x_train = train.drop(["TARGET", "SK_ID_CURR"], axis=1)
x_test = test.drop(["TARGET", "SK_ID_CURR"], axis=1)
y_train = train.TARGET




In [None]:
# This whole cell is a single string literal: the modelling code inside it is disabled
# and never runs (the cell only evaluates to the string itself).
# Review notes to address before re-enabling the code:
#   * `df_2 = df.drop(..., inplace=True)` binds None to df_2 (an in-place drop returns
#     None) and mutates df.
#   * x_train_2 / x_test_2 / y_train_2 are built from train / test, not from
#     train_2 / test_2, so the second model is fitted on the unreduced feature set.
#   * plot_importance: the default `num=len(x_train)` is the number of ROWS of x_train,
#     and plt.savefig is called after plt.show().
#   * "colsample_bytree" candidates include 1.5 — presumably outside the range
#     LightGBM accepts; verify against the LightGBM parameter documentation.
#   * `silent=` in the constructor and `verbose=` in fit() may not exist in the
#     installed LightGBM version — verify.
#   * cross_validate is given the already fitted lgbm_model and refits it per fold.
"""

lgbm_model = LGBMClassifier(
                n_estimators=4000,
                learning_rate=0.01,
                max_depth = 11,
                num_leaves=58,
                colsample_bytree=0.613,
                subsample=0.708,
                max_bin=407,
                reg_alpha=3.564,
                reg_lambda=4.930,
                min_child_weight= 6,
                min_child_samples=165,
                silent=-1,
                verbose=-1,
                ).fit(x_train, y_train, eval_set=[(x_train, y_train)],
        eval_metric='auc', verbose=200)


#cv 10 katli normalde
cv_results_1 = cross_validate(lgbm_model, x_train, y_train, cv=10, scoring=["accuracy", "f1", "roc_auc"])
cv_results_1['test_accuracy'].mean()
cv_results_1['test_f1'].mean()
cv_results_1['test_roc_auc'].mean()

def plot_importance(model, features, num=len(x_train), save=False):
    feature_imp = pd.DataFrame({'Value': model.feature_importances_, 'Feature': features.columns})
    plt.figure(figsize=(10, 10))
    sns.set(font_scale=1)
    sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value",
                                                                     ascending=False)[0:num])
    plt.title('Features')
    plt.tight_layout()
    plt.show()
    if save:
        plt.savefig('importances.png')


plot_importance(lgbm_model,x_train)
plot_importance(lgbm_model,x_train,num=40)

feature_importances = pd.DataFrame({'feature':x_train.columns,
                                    'importance':lgbm_model.feature_importances_})
#  onemli
important_features = feature_importances[feature_importances['importance'] > 0]

#  onemsiz
unimportant_features = feature_importances[feature_importances['importance'] < 1]

important = important_features.sort_values(ascending=False,by='importance')

important.to_excel('important_features.xlsx')
unimportant_features.to_excel('unimportant_features.xlsx')


# unimportant featurelari dustukten sonra bi degisim olmadi. dustukten sonra hiperparemetre
# opt. yapilabilir.
df_2 = df.drop(unimportant_features['feature'],inplace = True,axis=1)

train_2 = df[df.TARGET.isnull() == False]
test_2 = df[df.TARGET.isnull()]
x_train_2 = train.drop(["TARGET", "SK_ID_CURR"], axis = 1)
x_test_2 = test.drop(["TARGET", "SK_ID_CURR"], axis = 1)
y_train_2 = train.TARGET

lgbm_model_2 = LGBMClassifier(
                n_estimators=4000,
                learning_rate=0.01,
                max_depth = 11,
                num_leaves=58,
                colsample_bytree=0.613,
                subsample=0.708,
                max_bin=407,
                reg_alpha=3.564,
                reg_lambda=4.930,
                min_child_weight= 6,
                min_child_samples=165,
                silent=-1,
                verbose=-1,
                ).fit(x_train_2, y_train_2, eval_set=[(x_train_2, y_train_2)],
        eval_metric='auc', verbose=200)

cv_results_2 = cross_validate(lgbm_model_2, x_train_2, y_train_2, cv=3, scoring=["accuracy", "f1", "roc_auc"])
cv_results_2['test_accuracy'].mean()
cv_results_2['test_f1'].mean()
cv_results_2['test_roc_auc'].mean()


lgbm = LGBMClassifier()
lgbm_params = {"learning_rate": [0.01, 0.1, 0.001,0.05,1,0.005],
               "n_estimators": [100, 300, 500, 1000],
               "colsample_bytree": [0.5, 0.7, 1,1.5],
               'num_leaves' : [31,10,15,45],
               'max_depth':[-1,1,3,5,10],
               'min_split_gain':[0.,0.1,0.5,1,0.01,0.05]}

# random search
random_lgb = RandomizedSearchCV(estimator=lgbm,
                               param_distributions=lgbm_params,
                               n_iter=100,  # denenecek parametre sayısı
                               cv=3,
                               verbose=True,
                               random_state=42,
                               n_jobs=-1).fit(x_train,y_train)
random_lgb.best_score_
random_lgb.best_params_

# grid search icin. randomdan gelen degerleri ekleyerek calistir.
lgbm_params2 = {"learning_rate": [0.01, 0.1, 0.001,0.005],
               "n_estimators": [2000,4000],
               "colsample_bytree": [0.5, 0.7, 1],
               'num_leaves' : [30,15,45,60],
               'max_depth':[-1,3,10],
               'min_split_gain':[0.,0.1,0.5,1]}


# grid search
lgbm = LGBMClassifier()
lgm_grid = GridSearchCV(lgbm, lgbm_params2, cv=3, n_jobs=-1, verbose=True).fit(x_train, y_train)
lgm_grid.best_params_



# creating final submission
submission = pd.DataFrame({
    "SK_ID_CURR": test.SK_ID_CURR,
    "TARGET": lgbm_model.predict_proba(x_test)[:,1]
})

"""