In [1]:
import re
import json
import time as time
import numpy as np
import pandas as pd
from functools import partial
from datetime import date, timedelta

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns
import plotly.figure_factory as ff
import plotly.express as px
import plotly.offline as pyo

## Own specific functions 
from functions import *

In [2]:
def calculate_years(days):
    """
    Method used to calculate years based on date (today - quantity of days).

    Parameters:
    -----------------
        days (int): Numbers of day to rest of today

    Returns:
    -----------------
        years (int): Numbers of years
    """

    today = date.today()
    initial_date = today - timedelta(abs(days))
    years = today.year - initial_date.year - ((today.month, today.day) < (initial_date.month, initial_date.day))

    return years

In [3]:
# Columns to read on CSVs
COLUMNS = [
    "SK_ID_CURR", "CODE_GENDER", "DAYS_BIRTH", "DAYS_EMPLOYED",
    "CNT_CHILDREN", "FLAG_OWN_REALTY", "FLAG_OWN_CAR",
    "AMT_INCOME_TOTAL", "AMT_CREDIT", "TARGET"
]

In [4]:
# Reading the csv
df_clients_to_predict = pd.read_csv("datasets/df_customers_to_predict.csv")

In [5]:
df_analysis(df_clients_to_predict, "df_clients_to_predict", analysis_type="header")


Analysis Header of df_clients_to_predict dataset
--------------------------------------------------------------------------------
- Dataset shape:			 48744 rows and 796 columns
- Total of NaN values:			 0
- Percentage of NaN:			 0.0 %
- Total of infinite values:		 0
- Percentage of infinite values:	 0.0 %
- Total of full duplicates rows:	 0
- Total of empty rows:			 0
- Total of empty columns:		 0
- Unique indexes:			 True
- Memory usage:				 296.0 MB


In [6]:
# Reading the csv
df_optimized = pd.read_csv("datasets/df_optimized.csv", usecols=COLUMNS, low_memory=True)

In [7]:
df_analysis(df_optimized, "df_optimized", analysis_type="header")


Analysis Header of df_optimized dataset
--------------------------------------------------------------------------------
- Dataset shape:			 356251 rows and 10 columns
- Total of NaN values:			 48744
- Percentage of NaN:			 1.37 %
- Total of infinite values:		 0
- Percentage of infinite values:	 0.0 %
- Total of full duplicates rows:	 0
- Total of empty rows:			 0
- Total of empty columns:		 0
- Unique indexes:			 True
- Memory usage:				 27.2 MB


In [8]:
df_current_clients = df_optimized[df_optimized["TARGET"].notnull()]

In [9]:
df_analysis(df_current_clients, "df_current_clients", analysis_type="header")


Analysis Header of df_current_clients dataset
--------------------------------------------------------------------------------
- Dataset shape:			 307507 rows and 10 columns
- Total of NaN values:			 0
- Percentage of NaN:			 0.0 %
- Total of infinite values:		 0
- Percentage of infinite values:	 0.0 %
- Total of full duplicates rows:	 0
- Total of empty rows:			 0
- Total of empty columns:		 0
- Unique indexes:			 True
- Memory usage:				 25.8 MB


In [10]:
# saving the optimized dataset 
df_current_clients.to_csv("datasets\df_current_clients_reduced.csv", index=False)

In [11]:
df_analysis(df_current_clients, "df_current_clients", analysis_type="header")


Analysis Header of df_current_clients dataset
--------------------------------------------------------------------------------
- Dataset shape:			 307507 rows and 10 columns
- Total of NaN values:			 0
- Percentage of NaN:			 0.0 %
- Total of infinite values:		 0
- Percentage of infinite values:	 0.0 %
- Total of full duplicates rows:	 0
- Total of empty rows:			 0
- Total of empty columns:		 0
- Unique indexes:			 True
- Memory usage:				 33.9 MB


In [12]:
xx

NameError: name 'xx' is not defined

In [None]:
df_optimized.to_csv("datasets\df_optimized_and_reduced.csv", index=False)

In [None]:
df_optimized["AGE"] = df_optimized["DAYS_BIRTH"].apply(lambda x: calculate_years(x))

In [None]:
df_optimized.head()

In [None]:
df_customer_selected = df_clients_to_predict[df_clients_to_predict["SK_ID_CURR"]==100013]
df_customer_selected["AGE"] = df_customer_selected["DAYS_BIRTH"].apply(lambda x: calculate_years(x))
df_customer_selected

In [None]:
df_customer_selected["AGE"].values[0]

In [None]:
df_optimized_by_target_repaid = df_optimized[df_optimized["TARGET"] == 0]
df_optimized_by_target_not_repaid = df_optimized[df_optimized["TARGET"] == 1]

In [None]:
ages_data_repaid = df_optimized_by_target_repaid.groupby("AGE").size()
ages_data_repaid = pd.DataFrame(ages_data_repaid).reset_index()
ages_data_repaid.columns = ["AGE", "AMOUNT"]
ages_data_repaid = ages_data_repaid.set_index("AGE").to_dict()["AMOUNT"]


ages_data_not_repaid = df_optimized_by_target_not_repaid.groupby("AGE").size()
ages_data_not_repaid = pd.DataFrame(ages_data_not_repaid).reset_index()
ages_data_not_repaid.columns = ["AGE", "AMOUNT"]
ages_data_not_repaid = ages_data_not_repaid.set_index("AGE").to_dict()["AMOUNT"]

In [None]:
ages_data_repaid_list = [key for key, val in ages_data_repaid.items() for _ in range(val)]
ages_data_not_repaid_list = [key for key, val in ages_data_not_repaid.items() for _ in range(val)]

In [None]:
config = {
    "displayModeBar": False,
    "displaylogo": False
}

group_labels = ["Repaid", "Not repaid"]

fig_ages = ff.create_distplot([ages_data_repaid_list, ages_data_not_repaid_list], 
                              group_labels, show_hist=False, show_rug=False, 
                             colors=["Green", "Red"])
fig_ages.update_layout(
    paper_bgcolor="white",
    font={
        "family": "sans serif"
    },
    autosize=False,
    width=500,
    height=360,
    margin=dict(
        l=50, r=50, b=0, t=20, pad=0
    ),
    title={
        "text" : "Client age vs Current clients",
        "y" : 1,
        "x" : 0.45,
        "xanchor" : "center",
        "yanchor" : "top"
    },
    xaxis_title="Ages",
    yaxis_title="Density",
    legend={
        "traceorder" : "normal"
    }
)
fig_ages.add_vline(x=df_customer_selected["AGE"].values[0], line_width=3,
                   line_dash="dash", line_color="green", annotation_text="client's age")
pyo.plot(fig_ages, config=config, filename="example2.html")

In [None]:
df_optimized["YEARS_WORK"] = df_optimized["DAYS_EMPLOYED"].apply(lambda x: calculate_years(x))

In [None]:
df_optimized.head()

In [None]:
df_customer_selected = df_clients_to_predict[df_clients_to_predict["SK_ID_CURR"]==100013]
df_customer_selected["YEARS_WORK"] = df_customer_selected["DAYS_EMPLOYED"].apply(lambda x: calculate_years(x))
df_customer_selected["YEARS_WORK"].values[0]

In [None]:
df_optimized_by_target_repaid = df_optimized[df_optimized["TARGET"] == 0]
df_optimized_by_target_not_repaid = df_optimized[df_optimized["TARGET"] == 1]

In [None]:
ages_data_repaid = df_optimized_by_target_repaid.groupby("YEARS_WORK").size()
ages_data_repaid = pd.DataFrame(ages_data_repaid).reset_index()
ages_data_repaid.columns = ["YEARS_WORK", "AMOUNT"]
ages_data_repaid = ages_data_repaid.set_index("YEARS_WORK").to_dict()["AMOUNT"]


ages_data_not_repaid = df_optimized_by_target_not_repaid.groupby("YEARS_WORK").size()
ages_data_not_repaid = pd.DataFrame(ages_data_not_repaid).reset_index()
ages_data_not_repaid.columns = ["YEARS_WORK", "AMOUNT"]
ages_data_not_repaid = ages_data_not_repaid.set_index("YEARS_WORK").to_dict()["AMOUNT"]

In [None]:
ages_data_repaid_list = [key for key, val in ages_data_repaid.items() for _ in range(val)]
ages_data_not_repaid_list = [key for key, val in ages_data_not_repaid.items() for _ in range(val)]

In [None]:
config = {
    "displayModeBar": False,
    "displaylogo": False
}

group_labels = ["Repaid", "Not repaid"]

fig_ages = ff.create_distplot([ages_data_repaid_list, ages_data_not_repaid_list], 
                              group_labels, show_hist=False, show_rug=False, 
                             colors=["Green", "Red"])
fig_ages.update_layout(
    paper_bgcolor="white",
    font={
        "family": "sans serif"
    },
    autosize=False,
    width=500,
    height=360,
    margin=dict(
        l=50, r=50, b=0, t=20, pad=0
    ),
    title={
        "text" : "Client years worked vs Current clients",
        "y" : 1,
        "x" : 0.45,
        "xanchor" : "center",
        "yanchor" : "top"
    },
    xaxis_title="Years worked",
    yaxis_title="Density",
    legend={
        "traceorder" : "normal"
    }
)
fig_ages.add_vline(x=df_customer_selected["YEARS_WORK"].values[0], line_width=3,
                   line_dash="dash", line_color="green", annotation_text="years worked by the client")
pyo.plot(fig_ages, config=config, filename="example3.html")

In [None]:
df_customer_selected = df_clients_to_predict[df_clients_to_predict["SK_ID_CURR"]==100013]
df_customer_selected

In [None]:
df_optimized_by_target_repaid = df_current_clients[df_current_clients["TARGET"] == 0]
df_optimized_by_target_not_repaid = df_current_clients[df_current_clients["TARGET"] == 1]

In [None]:
print(df_optimized["AMT_INCOME_TOTAL"].max())
print(df_optimized["AMT_INCOME_TOTAL"].min())

In [None]:
df_analysis(df_current_clients, "df_optimized")

In [None]:
df_analysis(df_optimized, "df_optimized")

In [27]:
income_data = df_optimized.groupby(["AMT_INCOME_TOTAL", "TARGET"]).size()
income_data = pd.DataFrame(income_data).reset_index()
income_data.columns = ["AMT_INCOME_TOTAL", "TARGET", "AMOUNT"]
#income_data = income_data.set_index("AMT_INCOME_TOTAL").to_dict()["AMOUNT"]

In [30]:
income_data

Unnamed: 0,AMT_INCOME_TOTAL,TARGET,AMOUNT
0,25650.0,0.0,1
1,25650.0,1.0,1
2,26100.0,0.0,3
3,26460.0,0.0,1
4,26550.0,0.0,2
...,...,...,...
2861,6750000.0,0.0,1
2862,9000000.0,0.0,1
2863,13500000.0,0.0,1
2864,18000090.0,0.0,1


In [None]:
len(data_repaid)

In [None]:
data_repaid_list = [key for key, val in data_repaid.items() for _ in range(val)]
data_not_repaid_list = [key for key, val in data_not_repaid.items() for _ in range(val)]

In [32]:
config = {
    "displayModeBar": False,
    "displaylogo": False
}

group_labels = ["Repaid", "Not repaid"]

fig_income = px.histogram(new_df, x="AMT_INCOME_TOTAL", color="TARGET", 
                          color_discrete_map={0:"Green", 1:"Red"}, labels={0:"Green", 1:"Red"})

fig_income.update_layout(
    paper_bgcolor="white",
    font={
        "family": "sans serif"
    },
    autosize=False,
    width=500,
    height=360,
    margin=dict(
        l=50, r=50, b=0, t=20, pad=0
    ),
    title={
        "text" : "Client's Income vs Current clients",
        "y" : 1,
        "x" : 0.45,
        "xanchor" : "center",
        "yanchor" : "top"
    },
    xaxis_title="Income",
    legend_title_text= "",
    showlegend=False,
    xaxis_range=[25000, 300000]
)
fig_income.add_vline(x=df_customer_selected["AMT_INCOME_TOTAL"].values[0], line_width=3,
                   line_dash="dash", line_color="blue", annotation_text="Client's income")
pyo.plot(fig_income, config=config, filename="example4.html")

'example4.html'

In [22]:
example = df_current_clients[["AMT_INCOME_TOTAL", "TARGET"]].to_dict()

In [23]:
new_df = pd.DataFrame.from_dict(example)

In [24]:
new_df

Unnamed: 0,AMT_INCOME_TOTAL,TARGET
0,202500.0,1.0
1,270000.0,0.0
2,67500.0,0.0
3,135000.0,0.0
4,121500.0,0.0
...,...,...
307502,157500.0,0.0
307503,72000.0,0.0
307504,153000.0,0.0
307505,171000.0,1.0


In [13]:
df_customer_selected = df_clients_to_predict[df_clients_to_predict["SK_ID_CURR"]==100013]
df_customer_selected

Unnamed: 0,SK_ID_CURR,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,...,CC_NAME_CONTRACT_STATUS_Signed_MAX,CC_NAME_CONTRACT_STATUS_Signed_MEAN,CC_NAME_CONTRACT_STATUS_Signed_SUM,CC_NAME_CONTRACT_STATUS_Signed_VAR,CC_NAME_CONTRACT_STATUS_nan_MIN,CC_NAME_CONTRACT_STATUS_nan_MAX,CC_NAME_CONTRACT_STATUS_nan_MEAN,CC_NAME_CONTRACT_STATUS_nan_SUM,CC_NAME_CONTRACT_STATUS_nan_VAR,CC_COUNT
2,100013,0,1,0,0,202500.0,663264.0,69777.0,630000.0,0.019101,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,96.0


In [14]:
df_optimized_by_target_repaid = df_current_clients[df_current_clients["TARGET"] == 0]
df_optimized_by_target_not_repaid = df_current_clients[df_current_clients["TARGET"] == 1]

In [15]:
print(df_optimized["AMT_INCOME_TOTAL"].max())
print(df_optimized["AMT_INCOME_TOTAL"].min())

117000000.0
25650.0


In [16]:
df_analysis(df_current_clients, "df_optimized")


Analysis Header of df_optimized dataset
--------------------------------------------------------------------------------
- Dataset shape:			 307507 rows and 10 columns
- Total of NaN values:			 0
- Percentage of NaN:			 0.0 %
- Total of infinite values:		 0
- Percentage of infinite values:	 0.0 %
- Total of full duplicates rows:	 0
- Total of empty rows:			 0
- Total of empty columns:		 0
- Unique indexes:			 True
- Memory usage:				 33.9 MB

Detailed analysis of df_optimized dataset
------------------------------------------------------------------------------------------------------------------------


Unnamed: 0,name,type,records,unique,# NaN,% NaN,mean,min,25%,50%,75%,max,std
0,SK_ID_CURR,int64,307507,307507,0,0.0,278181.52726,100002.0,189146.5,278203.0,367143.5,456255.0,102790.13298
1,TARGET,float64,307507,2,0,0.0,0.08073,0.0,0.0,0.0,0.0,1.0,0.27242
2,CODE_GENDER,int64,307507,2,0,0.0,0.65835,0.0,0.0,1.0,1.0,1.0,0.47426
3,FLAG_OWN_CAR,int64,307507,2,0,0.0,0.34011,0.0,0.0,0.0,1.0,1.0,0.47375
4,FLAG_OWN_REALTY,int64,307507,2,0,0.0,0.30633,0.0,0.0,0.0,1.0,1.0,0.46097
5,CNT_CHILDREN,int64,307507,15,0,0.0,0.41705,0.0,0.0,0.0,1.0,19.0,0.72212
6,AMT_INCOME_TOTAL,float64,307507,2548,0,0.0,168797.68578,25650.0,112500.0,147150.0,202500.0,117000000.0,237124.62732
7,AMT_CREDIT,float64,307507,5603,0,0.0,599028.59673,45000.0,270000.0,513531.0,808650.0,4050000.0,402492.60186
8,DAYS_BIRTH,int64,307507,17460,0,0.0,-16037.02727,-25229.0,-19682.0,-15750.0,-12413.0,-7489.0,4363.98242
9,DAYS_EMPLOYED,float64,307507,12574,0,0.0,-2386.39926,-17912.0,-2760.0,-2219.0,-933.0,0.0,2117.355


In [17]:
df_analysis(df_optimized, "df_optimized")


Analysis Header of df_optimized dataset
--------------------------------------------------------------------------------
- Dataset shape:			 356251 rows and 10 columns
- Total of NaN values:			 48744
- Percentage of NaN:			 1.37 %
- Total of infinite values:		 0
- Percentage of infinite values:	 0.0 %
- Total of full duplicates rows:	 0
- Total of empty rows:			 0
- Total of empty columns:		 0
- Unique indexes:			 True
- Memory usage:				 27.2 MB

Detailed analysis of df_optimized dataset
------------------------------------------------------------------------------------------------------------------------


Unnamed: 0,name,type,records,unique,# NaN,% NaN,mean,min,25%,50%,75%,max,std
0,SK_ID_CURR,int64,356251,356251,0,0.0,278128.87008,100001.0,189065.5,278129.0,367192.5,456255.0,102842.06889
2,CODE_GENDER,int64,356251,2,0,0.0,0.66,0.0,0.0,1.0,1.0,1.0,0.47371
3,FLAG_OWN_CAR,int64,356251,2,0,0.0,0.3397,0.0,0.0,0.0,1.0,1.0,0.47361
4,FLAG_OWN_REALTY,int64,356251,2,0,0.0,0.30676,0.0,0.0,0.0,1.0,1.0,0.46115
5,CNT_CHILDREN,int64,356251,16,0,0.0,0.41431,0.0,0.0,0.0,1.0,20.0,0.72038
6,AMT_INCOME_TOTAL,float64,356251,2741,0,0.0,170115.87309,25650.0,112500.0,153000.0,202500.0,117000000.0,223508.01889
7,AMT_CREDIT,float64,356251,6480,0,0.0,587769.52903,45000.0,270000.0,500211.0,797557.5,4050000.0,398625.30207
8,DAYS_BIRTH,int64,356251,17467,0,0.0,-16041.27669,-25229.0,-19676.0,-15755.0,-12425.0,-7338.0,4358.79853
9,DAYS_EMPLOYED,float64,356251,12896,0,0.0,-2396.67576,-17912.0,-2781.0,-2243.0,-949.0,0.0,2112.04293
1,TARGET,float64,307507,2,48744,13.68,0.08073,0.0,0.0,0.0,0.0,1.0,0.27242


In [18]:
data_repaid = df_optimized_by_target_repaid.groupby("AMT_CREDIT").size()
data_repaid = pd.DataFrame(data_repaid).reset_index()
data_repaid.columns = ["AMT_CREDIT", "AMOUNT"]
data_repaid = data_repaid.set_index("AMT_CREDIT").to_dict()["AMOUNT"]


data_not_repaid = df_optimized_by_target_not_repaid.groupby("AMT_CREDIT").size()
data_not_repaid = pd.DataFrame(data_not_repaid).reset_index()
data_not_repaid.columns = ["AMT_CREDIT", "AMOUNT"]
data_not_repaid = data_not_repaid.set_index("AMT_CREDIT").to_dict()["AMOUNT"]

In [19]:
data_repaid_list = [key for key, val in data_repaid.items() for _ in range(val)]
data_not_repaid_list = [key for key, val in data_not_repaid.items() for _ in range(val)]

In [20]:
config = {
    "displayModeBar": False,
    "displaylogo": False
}

group_labels = ["Repaid", "Not repaid"]

fig_ages = ff.create_distplot([data_repaid_list, data_not_repaid_list], 
                              group_labels, show_hist=False, show_rug=False, 
                             colors=["Green", "Red"])
fig_ages.update_layout(
    paper_bgcolor="white",
    font={
        "family": "sans serif"
    },
    autosize=False,
    width=500,
    height=360,
    margin=dict(
        l=50, r=50, b=0, t=20, pad=0
    ),
    title={
        "text" : "Client's amt income vs Current clients",
        "y" : 1,
        "x" : 0.45,
        "xanchor" : "center",
        "yanchor" : "top"
    },
    xaxis_title="Income",
    yaxis_title="Count",
    legend={
        "traceorder" : "normal"
    },
   # xaxis_range=[25000, 300000]
)
fig_ages.add_vline(x=df_customer_selected["AMT_INCOME_TOTAL"].values[0], line_width=3,
                   line_dash="dash", line_color="green", annotation_text="Client's income")
pyo.plot(fig_ages, config=config, filename="example5.html")

'example5.html'