In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import PolynomialFeatures

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training and test application tables and report the number of
# observations (rows) and features (columns) of each.
app_train = pd.read_csv('../input/home-credit-default-risk/application_train.csv')
print('Shape di addestramento: ', app_train.shape)
app_test = pd.read_csv('../input/home-credit-default-risk/application_test.csv')
# This message used to repeat the training label, which made the two printed
# shapes indistinguishable in the output.
print('Shape di test: ', app_test.shape)

In [None]:
# Create a label encoder object (a single instance bound to the module-level name `le`)
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

def mylabelencoder(df, encoders=None):
    """Label-encode low-cardinality string columns of ``df`` in place.

    A column qualifies when its dtype is ``object`` and it has at most three
    distinct values, counting NaN as one of them (for example a binary
    category that may also contain missing values). Missing values are
    replaced by the empty string before encoding, so they get a code of
    their own.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    encoders : dict, optional
        ``{column: fitted LabelEncoder}`` as returned by an earlier call
        (typically the one on the training frame). When given, exactly the
        listed columns that exist in ``df`` are transformed with the stored
        encoders and nothing is fitted, so integer codes are identical across
        frames. ``LabelEncoder.transform`` raises ``ValueError`` if ``df``
        contains a category the stored encoder has never seen.

    Returns
    -------
    dict
        ``{column: fitted LabelEncoder}`` for every column that was encoded.
    """
    fitted = {}
    if encoders is None:
        for col in df:
            if df[col].dtype != 'object':
                continue
            # Same count as len(df[col].unique()): NaN is included.
            if df[col].nunique(dropna=False) > 3:
                continue
            df[col] = df[col].fillna("")
            # One encoder per column, so each fitted mapping can be kept and
            # reused instead of being overwritten by the next column.
            encoder = preprocessing.LabelEncoder()
            df[col] = encoder.fit_transform(df[col])
            fitted[col] = encoder
    else:
        for col, encoder in encoders.items():
            if col not in df.columns:
                continue
            df[col] = encoder.transform(df[col].fillna(""))
            fitted[col] = encoder

    # Report how many columns were label encoded
    print('%d columns were label encoded.' % len(fitted))
    return fitted

In [None]:
# Check that the same number of columns gets encoded on train and on test
# NOTE(review): each call fits on its own frame, so the integer codes only line up
# if both frames hold the same set of categories in every encoded column — confirm.
mylabelencoder(app_train)
mylabelencoder(app_test)

In [None]:
# One-hot encode the remaining categorical features of both frames.
app_train = app_train.pipe(pd.get_dummies)
app_test = app_test.pipe(pd.get_dummies)

In [None]:
# Column re-alignment.
# Collect the column names of both frames (leaving the train-only TARGET out)
# and display the ones that exist in train but not in test.
colset_test = set(app_test.columns)
colset_train = set(app_train.columns) - {'TARGET'}
colset_train - colset_test

In [None]:
# Add every train-only feature column to test. Default to 0: a dummy column
# that never appears in test is, by construction, never "hot" there.
feature_order = [col for col in app_train.columns if col != 'TARGET']
for k in feature_order:
    if k not in app_test.columns:
        app_test[k] = 0

# Put the test columns in the same order as the train features. Appending the
# missing columns leaves them at the end of test, and any later step that treats
# the frames positionally would then pair values with the wrong features.
# Columns that exist only in test (if any) are kept, after the shared ones.
train_features = set(feature_order)
test_only = [col for col in app_test.columns if col not in train_features]
app_test = app_test[feature_order + test_only]

print(app_train.shape)
print(app_test.shape)
set(app_train.columns).difference(set(app_test.columns))

# Test has one column fewer because it does not contain TARGET

In [None]:
# Outlier analysis
# Count how many columns there are of each dtype.
app_train.dtypes.value_counts()

In [None]:
# Summary statistics of DAYS_BIRTH divided by -365
# (presumably age in years, if DAYS_BIRTH is stored as negative days — confirm).
(app_train["DAYS_BIRTH"]/-365).describe()

In [None]:
# Summary statistics of DAYS_EMPLOYED divided by 365.
(app_train["DAYS_EMPLOYED"]/365).describe()

In [None]:
# Histogram of the raw DAYS_EMPLOYED values.
app_train["DAYS_EMPLOYED"].plot(kind="hist")

In [None]:
 # Boolean mask: True where DAYS_EMPLOYED equals 365243.
 app_train["DAYS_EMPLOYED"] == 365243

In [None]:
# Flag the rows whose DAYS_EMPLOYED holds the outlier value 365243 and blank
# that value out.
# Open question: is the outlier related to the loan defaulting, and how often?
anomalies = app_train["DAYS_EMPLOYED"] == 365243
app_train['DAYS_EMPLOYED_ANOM'] = anomalies
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: the in-place form acts on an intermediate object and is not
# guaranteed to write through to the frame (it does not under copy-on-write).
app_train['DAYS_EMPLOYED'] = app_train['DAYS_EMPLOYED'].replace({365243: np.nan})
# Clean the same sentinel in test so both frames are preprocessed identically.
# The flag column is deliberately not added to test: it is dropped from the
# training features before the models are fitted.
app_test['DAYS_EMPLOYED'] = app_test['DAYS_EMPLOYED'].replace({365243: np.nan})
(app_train['DAYS_EMPLOYED']/365).plot.hist(title = 'Days Employment Histogram');


In [None]:
# Count how many rows were flagged (True) versus not flagged (False) as anomalous.
app_train["DAYS_EMPLOYED_ANOM"].value_counts()

In [None]:
import seaborn as sns
#sns.pairplot(app_train.iloc[:,:10])

In [None]:
# Plotting as desired
# app_train.iloc[:,27:70].plot.hist(subplots=True, legend=False)

In [None]:
# Analisi delle collinearità
#correlations = app_train.corr()

In [None]:
#correlations

In [None]:
#correlations["TARGET"].sort_values().head(10)

In [None]:
#correlations["TARGET"].sort_values().tail(10)

In [None]:
# Correlation between the raw DAYS_BIRTH values and TARGET.
app_train['DAYS_BIRTH'].corr(app_train['TARGET'])

In [None]:
# Same correlation after dividing DAYS_BIRTH by -365 (the division by a negative number flips the sign).
(app_train['DAYS_BIRTH']/-365).corr(app_train['TARGET'])

In [None]:
# Histogram of EXT_SOURCE_3.
app_train["EXT_SOURCE_3"].plot(kind="hist")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

plt.style.use('fivethirtyeight')
plt.figure(figsize = (10, 8))

# Draw one density curve of DAYS_BIRTH / -365 per TARGET class.
for target_value, curve_label in ((0, 'target == 0'), (1, 'target == 1')):
    days_birth_of_class = app_train.loc[app_train['TARGET'] == target_value, 'DAYS_BIRTH']
    sns.kdeplot(days_birth_of_class / -365, label = curve_label)

# Show the legend
plt.legend()


In [None]:
# Kernel density plot of EXT_SOURCE_1 split by TARGET, to gauge how much "signal" it carries

sns.kdeplot(app_train.loc[app_train['TARGET'] == 0, 'EXT_SOURCE_1'], label = 'target == 0' )
sns.kdeplot(app_train.loc[app_train['TARGET'] == 1, 'EXT_SOURCE_1'], label = 'target == 1' )

# Show the legend
plt.legend()

In [None]:
# Kernel density plot of EXT_SOURCE_2 split by TARGET, to gauge how much "signal" it carries

sns.kdeplot(app_train.loc[app_train['TARGET'] == 0, 'EXT_SOURCE_2'], label = 'target == 0' )
sns.kdeplot(app_train.loc[app_train['TARGET'] == 1, 'EXT_SOURCE_2'], label = 'target == 1' )

# Show the legend
plt.legend()

In [None]:
# Kernel density plot of EXT_SOURCE_3 split by TARGET, to gauge how much "signal" it carries

sns.kdeplot(app_train.loc[app_train['TARGET'] == 0, 'EXT_SOURCE_3'], label = 'target == 0' )
sns.kdeplot(app_train.loc[app_train['TARGET'] == 1, 'EXT_SOURCE_3'], label = 'target == 1' )

# Show the legend
plt.legend()

In [None]:
# Extract the EXT_SOURCE variables (plus TARGET and DAYS_BIRTH) and show their pairwise correlations
ext_data = app_train[['TARGET', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']]
ext_data_corrs = ext_data.corr()
ext_data_corrs

In [None]:
plt.figure(figsize = (8, 6))

# Heatmap of correlations, annotated with the values; colour scale clipped to [-0.25, 0.6]
sns.heatmap(ext_data_corrs, cmap = plt.cm.RdYlBu_r, vmin = -0.25, annot = True, vmax = 0.6)
plt.title('Correlation Heatmap');

In [None]:
# Feature Engineering
# Polynomial combinations of the features
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html
poly_columns = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']
# Take explicit copies: a later cell assigns into poly_features, and assigning
# into a bare selection of app_train triggers SettingWithCopyWarning and leaves
# it ambiguous whether app_train itself is modified.
poly_features = app_train[poly_columns].copy()
poly_features_test = app_test[poly_columns].copy()
poly_target = app_train['TARGET']

In [None]:
# Imputation of missing values
# https://scikit-learn.org/stable/modules/impute.html
# Histogram of EXT_SOURCE_1 before imputation.
poly_features['EXT_SOURCE_1'].plot(kind="hist")

In [None]:
# Count missing (True) versus present (False) values of EXT_SOURCE_1.
poly_features['EXT_SOURCE_1'].isnull().value_counts()

In [None]:
# Put NaN in place of null values
# NOTE(review): filling missing values with NaN looks like a no-op — confirm whether this cell is still needed.
poly_features['EXT_SOURCE_1'] = poly_features['EXT_SOURCE_1'].fillna(np.nan)

In [None]:
# Display the selected features.
poly_features

In [None]:
# Use SimpleImputer to fill the empty cells automatically with a statistic (here the column median)
import numpy as np
from sklearn.impute import SimpleImputer
imp = SimpleImputer(strategy='median', missing_values=np.nan)
# Fit on the training features, then fill their gaps with the fitted medians.
poly_features_imputed = imp.fit(poly_features).transform(poly_features)
poly_features_imputed_df = pd.DataFrame(
    data=poly_features_imputed,
    columns=['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH'],
)

In [None]:
# Fill the gaps in test with the medians learned from train: `imp` was fitted
# on the training features in the previous cell, so it is only applied here.
# Re-fitting on test would impute the two frames with different statistics.
poly_features_test_imputed = imp.transform(poly_features_test)
poly_features_test_imputed_df = pd.DataFrame(poly_features_test_imputed, columns=['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH'])

In [None]:
# All the gaps have been plugged with the medians.
# Print both summaries explicitly: a notebook cell only echoes its last
# expression, so the train summary used to be silently discarded.
print(poly_features_imputed_df.isnull().value_counts())
print(poly_features_test_imputed_df.isnull().value_counts())

In [None]:
# Compute polynomial combinations of the features of a dataframe
from sklearn.preprocessing import PolynomialFeatures
def poly_transformer(df):
    """Return the degree-3 polynomial feature expansion of ``df`` as a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame without missing values (PolynomialFeatures rejects NaN).

    Returns
    -------
    pandas.DataFrame
        One column per generated feature, named by scikit-learn from the
        columns of ``df``, with the constant column ``'1'`` removed. The index
        of ``df`` is preserved.
    """
    # Not named `poly_transformer`, so the function does not shadow itself.
    expander = PolynomialFeatures(degree = 3)
    expanded = expander.fit_transform(df)
    # get_feature_names only exists in older scikit-learn releases; prefer
    # get_feature_names_out when the transformer provides it.
    names_of = getattr(expander, 'get_feature_names_out', None) or expander.get_feature_names
    poly_features_df = pd.DataFrame(expanded, columns=names_of(list(df.columns)), index=df.index)
    # '1' is the constant (bias) term generated by PolynomialFeatures.
    return poly_features_df.drop(labels='1', axis=1)

In [None]:
# Now the polynomial features can finally be used
# NOTE: this rebinds the name `poly_transformer` (defined as a function in the cell above)
# to a PolynomialFeatures instance; the function is no longer reachable by that name afterwards.
from sklearn.preprocessing import PolynomialFeatures
poly_transformer = PolynomialFeatures(degree = 3)
poly_transformer.fit(poly_features_imputed_df)

In [None]:
# Show the expanded training matrix.
poly_transformer.transform(poly_features_imputed_df)

In [None]:
# List the names of the generated features.
# get_feature_names only exists in older scikit-learn releases; prefer
# get_feature_names_out when the transformer provides it.
poly_feature_names_fn = getattr(poly_transformer, 'get_feature_names_out', None) or poly_transformer.get_feature_names
poly_feature_names_fn(['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH'])

In [None]:
# Build a DataFrame of the expanded training features, labelled with the
# generated feature names.
# get_feature_names only exists in older scikit-learn releases; prefer
# get_feature_names_out when the transformer provides it.
poly_feature_names_fn = getattr(poly_transformer, 'get_feature_names_out', None) or poly_transformer.get_feature_names
poly_features_df = pd.DataFrame(
    poly_transformer.transform(poly_features_imputed_df),
    columns=poly_feature_names_fn(['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']),
)

In [None]:
# Display the expanded training features.
poly_features_df

In [None]:
# Drop the column named '1' (the constant term listed among the generated feature names).
poly_features_df = poly_features_df.drop(labels='1', axis=1)

In [None]:
# Attach the target so correlations with it can be computed.
poly_features_df['TARGET'] = poly_target

In [None]:
# Check whether the polynomial features are more correlated with the target than the base features they were built from
poly_collinears = poly_features_df.corr()

In [None]:
# The five smallest (most negative) correlations with TARGET.
poly_collinears['TARGET'].sort_values().head(5)

In [None]:
# The five largest correlations with TARGET (TARGET itself is part of the frame).
poly_collinears['TARGET'].sort_values().tail(5)

In [None]:
# Expand the imputed test features with the transformer fitted on train.
# get_feature_names only exists in older scikit-learn releases; prefer
# get_feature_names_out when the transformer provides it.
poly_feature_names_fn = getattr(poly_transformer, 'get_feature_names_out', None) or poly_transformer.get_feature_names
poly_features_test_df = pd.DataFrame(
    poly_transformer.transform(poly_features_test_imputed_df),
    columns=poly_feature_names_fn(['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']),
)

In [None]:
# Remove the column named '1' from the expanded test features.
poly_features_test_df = poly_features_test_df.drop(columns='1')

In [None]:
#prova_poly_features

In [None]:
# Provo la mia funzione
#prova_poly_features = poly_features_imputed_df.copy() # copia per separare i df nella trasformazione
#pippo = poly_transformer(prova_poly_features)

In [None]:
# Sistemare la poly transformation in una funzione
# Creare delle caratteristiche derivate ("in modo business")


* CREDIT_INCOME_PERCENT: the percentage of the credit amount relative to a client's income
* ANNUITY_INCOME_PERCENT: the percentage of the loan annuity relative to a client's income
* CREDIT_TERM: the length of the payment in months (since the annuity is the monthly amount due)
* DAYS_EMPLOYED_PERCENT: the percentage of the days employed relative to the client's age

In [None]:
# Work on independent copies (by value, not by reference) of the train and test data.
app_train_domain, app_test_domain = app_train.copy(), app_test.copy()

In [None]:
# Function that computes new domain features
def calculate_domain_data(df):
    """Add four ratio columns to ``df`` in place; returns nothing.

    New columns (each is numerator / denominator, element-wise):
      * CREDIT_INCOME_PERCENT  = AMT_CREDIT    / AMT_INCOME_TOTAL
      * ANNUITY_INCOME_PERCENT = AMT_ANNUITY   / AMT_INCOME_TOTAL
      * CREDIT_TERM            = AMT_ANNUITY   / AMT_CREDIT
      * DAYS_EMPLOYED_PERCENT  = DAYS_EMPLOYED / DAYS_BIRTH
    """
    ratio_specs = (
        ('CREDIT_INCOME_PERCENT', 'AMT_CREDIT', 'AMT_INCOME_TOTAL'),
        ('ANNUITY_INCOME_PERCENT', 'AMT_ANNUITY', 'AMT_INCOME_TOTAL'),
        ('CREDIT_TERM', 'AMT_ANNUITY', 'AMT_CREDIT'),
        ('DAYS_EMPLOYED_PERCENT', 'DAYS_EMPLOYED', 'DAYS_BIRTH'),
    )
    for new_column, numerator, denominator in ratio_specs:
        df[new_column] = df[numerator] / df[denominator]

In [None]:
# Add the domain features to both copies: app_test_domain was created together
# with app_train_domain, and train and test need the same feature set.
calculate_domain_data(app_train_domain)
calculate_domain_data(app_test_domain)

In [None]:
# Display the training copy that carries the domain features.
app_train_domain

In [None]:
# Finally, use kernel density plots to check whether any of the newly created
# features (polynomial or business-driven) separates the target classes well
def mykdeplot(df, field, target=None):
    """Plot the kernel density of ``df[field]`` separately for target 0 and 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to plot.
    field : str
        Name of the column of ``df`` to plot.
    target : pandas.Series, optional
        Class labels used to split the rows. Defaults to ``df['TARGET']``
        when ``df`` has such a column, otherwise to the global
        ``app_train['TARGET']`` (the only source used previously, which
        silently relied on ``df`` sharing the index of ``app_train``).
    """
    if target is None:
        target = df['TARGET'] if 'TARGET' in df.columns else app_train['TARGET']

    for target_value in (0, 1):
        sns.kdeplot(df.loc[target == target_value, field], label = 'target == %d' % target_value)

    # Show the legend
    plt.legend()

In [None]:
# Density of the interaction feature 'EXT_SOURCE_2 EXT_SOURCE_3' for each target class.
mykdeplot(poly_features_df, 'EXT_SOURCE_2 EXT_SOURCE_3')

In [None]:
# Display the training frame.
app_train

In [None]:
import numpy as np
from sklearn.impute import SimpleImputer
# Median-impute every column of the training frame (the frame is passed whole,
# so whatever columns it holds at this point are included).
imp = SimpleImputer(strategy='median', missing_values=np.nan)
app_train_imputed = imp.fit(app_train).transform(app_train)
app_train_imputed_df = pd.DataFrame(data=app_train_imputed, columns=app_train.columns)

In [None]:
# Impute test with the medians of the training data instead of re-fitting on
# test, so both frames are filled with the same statistics. `imp` cannot be
# reused directly because it was fitted on all columns of app_train, including
# ones test does not have (e.g. TARGET); a dedicated imputer is therefore fitted
# on the train columns that test also has, in test's column order.
# Raises KeyError if test contains a column that train lacks.
test_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
test_imputer.fit(app_train[app_test.columns])
app_test_imputed = test_imputer.transform(app_test)
app_test_imputed_df = pd.DataFrame(app_test_imputed, columns=app_test.columns)

In [None]:
# "Baseline" model: every feature is thrown in without any selection
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(solver='liblinear', C = 0.0001)

# Labels come from the original frame; features are the imputed frame minus TARGET.
train_labels = app_train['TARGET']
train = app_train_imputed_df.drop(columns = ['TARGET'])



In [None]:
# Compare the shapes of the training features and the test frame.
print(train.shape)
print(app_test.shape)

In [None]:
# Columns present in the training features but missing from test.
set(train.columns).difference(set(app_test.columns))

In [None]:
# Drop the anomaly flag from the training features.
# NOTE: re-running this cell raises KeyError because the column is already gone.
train = train.drop('DAYS_EMPLOYED_ANOM', axis=1)

In [None]:
train.shape

In [None]:
# Fit the baseline logistic regression on the training features and labels.
log_reg.fit(train, train_labels)

In [None]:
# Use the fitted model to make the prediction
#log_reg.predict_proba(app_train_imputed_df.drop('TARGET', axis=1))
# Select the test columns by name, in the order of the training features, so
# every value is scored against the coefficient of the same feature no matter
# how the test frame's columns happen to be ordered.
log_reg.predict_proba(app_test_imputed_df[train.columns])[:,1]

In [None]:
# Start the submission frame from the SK_ID_CURR column of test (double brackets keep it a DataFrame).
submit = app_test[['SK_ID_CURR']]

In [None]:
# Add the predicted probability of class 1 as TARGET.
# - assign() returns a new frame, which avoids writing into a selection of
#   app_test (SettingWithCopyWarning) and makes the cell safe to re-run.
# - The test columns are selected by name in training order so the model sees
#   the features in the order it was fitted on.
submit = submit.assign(TARGET=log_reg.predict_proba(app_test_imputed_df[train.columns])[:,1])

In [None]:
# Display the submission frame.
submit

In [None]:
# Write the submission file without the index column.
submit.to_csv('submission.csv', index=False)

In [None]:
#from sklearn import svm
#clf = svm.SVC(kernel='linear', C=1, probability=True).fit(X_train, y_train)
#clf.score(X_test, y_test)

In [None]:
# 1) Ripulire tutto il foglio in modo che si esegua completamente dall'inizio alla fine e scriva il file "submission"
# 2) Anziché fittare il modello su tutto il df "app_train", usare train_test_split per splittare app_train in app_train_train e app_train_test
# 3) Rifittare LogisticRegression usando app_train_train
# 4) Usare score() per trovare lo score passando a score i dati app_train_test (e le relative label di TARGET)
# 5) Ripetere usando SVC (Support Vector Classifier)
# 6) RIpetere usando RandomForestClassifier
# 7) Ripetere cambiando qualcuno degli iperparametri di SVC o di RFC (es. "C" in SVC, o max_depth in RFC)

In [None]:
# Try a random forest
from sklearn.ensemble import RandomForestClassifier

# Feature names
features = list(train.columns)

# Make the random forest classifier
random_forest = RandomForestClassifier(max_depth=4, n_estimators = 100, random_state = 50, verbose = 1, n_jobs = -1)

# Train on the training data
random_forest.fit(train, train_labels)

# Extract feature importances
feature_importance_values = random_forest.feature_importances_
feature_importances = pd.DataFrame({'feature': features, 'importance': feature_importance_values})

# Make predictions on the test data. The columns are selected by name in
# training order so each value is matched with the feature the forest was
# trained on, regardless of the column order of the test frame.
predictions = random_forest.predict_proba(app_test_imputed_df[features])[:, 1]

In [None]:
# Make a submission dataframe. assign() builds a new frame instead of writing
# a column into a selection of app_test, which would raise
# SettingWithCopyWarning.
submit = app_test[['SK_ID_CURR']].assign(TARGET=predictions)

# Save the submission dataframe
submit.to_csv('random_forest_baseline.csv', index = False)
# NOTE: this overwrites the submission.csv written earlier with the logistic
# regression predictions.
submit.to_csv('submission.csv', index = False)

In [None]:
# Extract the feature importances of the fitted forest and rank them:
# highest importance first, indexed by feature name.
feature_importance_values = random_forest.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': features, 'importance': feature_importance_values})
    .sort_values('importance', ascending=False)
    .set_index('feature')
)

In [None]:
# Display the ranked feature importances.
feature_importances

In [None]:
# Horizontal bar chart of the first 20 rows (the 20 most important features after sorting).
feature_importances[:20].plot(kind="barh", )

In [None]:
# 1) Rispetto allo scoring di baseline, provare con un modello con meno caratteristiche, nell'idea di privilegiare le caratteristiche con più segnale
# 2) Mergiare altri dati che sembrano promettenti, ripulirli se necessario, fittare e sottomettere il nuovo dataset per verificare lo score

# 3) Ricchi premi e cotillions ai primi 3 score