In [26]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt, __version__
import sys

## Technology stack

In [29]:
# Report the library and interpreter versions so the results below can be tied
# to a concrete environment. `__version__` here is the name imported from
# matplotlib in the import cell.
for label, version in (
    ("Pandas version: ", pd.__version__),
    ("Numpy version: ", np.__version__),
    ("Matplotlib version: ", __version__),
    ("Python version: ", sys.version),
):
    print(label, version)

Pandas version:  0.23.4
Numpy version:  1.15.4
Matplotlib version:  3.0.2
Python version:  3.6.6 | packaged by conda-forge | (default, Oct 12 2018, 07:24:56) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]


In [30]:
# Read the credit dataset from the notebook-relative input folder into a DataFrame.
credit_data_path = './data_in/credit-data.csv'
credit_data = pd.read_csv(credit_data_path)

In [48]:
# Preview 10 randomly chosen rows. The fixed seed makes the preview show the
# same rows on every "Restart & Run All" instead of a different draw each time.
credit_data.sample(10, random_state=42)

Unnamed: 0,borrower_id,serious_dlqin2yrs,revolving_utilization_of_unsecured_lines,age,number_of_time30-59_days_past_due_not_worse,debt_ratio,monthly_income,number_of_open_credit_lines_and_loans,number_of_times90_days_late,number_real_estate_loans_or_lines,number_of_time60-89_days_past_due_not_worse,number_of_dependents
5068,5069,0,0.07212,48,0,1121.0,,6,0,1,0,2.0
142938,142939,0,0.252689,43,0,0.374278,8653.0,8,0,1,0,5.0
62670,62671,0,0.656648,61,0,0.358636,11908.0,11,0,1,0,1.0
149737,149738,1,0.611994,51,0,0.658104,5916.0,14,0,2,0,3.0
569,570,0,0.875441,37,0,0.437717,1436.0,5,0,0,0,1.0
58213,58214,0,0.009788,50,0,0.397652,8177.0,6,0,2,0,1.0
9023,9024,0,0.599709,46,0,0.631828,4850.0,9,0,1,0,0.0
49884,49885,0,0.155792,34,0,0.268481,7750.0,8,0,1,0,0.0
42746,42747,0,0.390412,37,1,0.296876,11300.0,15,0,2,0,4.0
81310,81311,0,0.0,39,0,0.273636,10100.0,11,0,2,0,3.0


In [32]:
# Give the id column a descriptive name.
# NOTE(review): rename() silently ignores keys that are not present, so this is
# a no-op if the CSV header is not literally 'unnamed: 0' (lowercase) — confirm
# against the raw file.
id_column_rename = {'unnamed: 0': 'borrower_id'}
credit_data = credit_data.rename(columns=id_column_rename)

## Cleaning data

### Check the data types of the DataFrame columns

In [49]:
credit_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150000 entries, 0 to 149999
Data columns (total 12 columns):
borrower_id                                    150000 non-null int64
serious_dlqin2yrs                              150000 non-null int64
revolving_utilization_of_unsecured_lines       150000 non-null float64
age                                            150000 non-null int64
number_of_time30-59_days_past_due_not_worse    150000 non-null int64
debt_ratio                                     150000 non-null float64
monthly_income                                 120269 non-null float64
number_of_open_credit_lines_and_loans          150000 non-null int64
number_of_times90_days_late                    150000 non-null int64
number_real_estate_loans_or_lines              150000 non-null int64
number_of_time60-89_days_past_due_not_worse    150000 non-null int64
number_of_dependents                           150000 non-null float64
dtypes: float64(4), int64(8)
memory usage: 14.9 MB


* Column **number_of_dependents** is stored as float64, but it holds whole-number counts, so it should be of type int64.

In [51]:
# Cast number_of_dependents to int64, then print the frame summary again to
# confirm the new dtype.
# NOTE(review): astype('int64') raises if the column contains NaN; the info()
# output above reports no nulls for this column — confirm that also holds on a
# fresh kernel run.
dependents_as_int = credit_data['number_of_dependents'].astype('int64')
credit_data['number_of_dependents'] = dependents_as_int
credit_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150000 entries, 0 to 149999
Data columns (total 12 columns):
borrower_id                                    150000 non-null int64
serious_dlqin2yrs                              150000 non-null int64
revolving_utilization_of_unsecured_lines       150000 non-null float64
age                                            150000 non-null int64
number_of_time30-59_days_past_due_not_worse    150000 non-null int64
debt_ratio                                     150000 non-null float64
monthly_income                                 120269 non-null float64
number_of_open_credit_lines_and_loans          150000 non-null int64
number_of_times90_days_late                    150000 non-null int64
number_real_estate_loans_or_lines              150000 non-null int64
number_of_time60-89_days_past_due_not_worse    150000 non-null int64
number_of_dependents                           150000 non-null int64
dtypes: float64(3), int64(9)
memory usage: 14.9 MB


### Check for duplicated entries

In [45]:
# Record the (rows, columns) size of the DataFrame before dropping duplicated rows.
shape_before_dedup = credit_data.shape
print(shape_before_dedup)

(150000, 12)


In [44]:
# Drop rows that are exact duplicates of an earlier row, keeping the first
# occurrence. Reassigning (instead of inplace=True) avoids mutating the frame
# that earlier cells already displayed and keeps this cell safe to re-run.
# NOTE(review): rows are compared on every column, including borrower_id. The
# describe() output suggests borrower_id is unique per row, in which case no
# row can ever be flagged as a duplicate — consider passing `subset=` without
# the id column; confirm with the data owner.
credit_data = credit_data.drop_duplicates(keep='first')

In [46]:
# Record the (rows, columns) size of the DataFrame after dropping duplicated
# rows, for comparison with the size printed before the drop.
shape_after_dedup = credit_data.shape
print(shape_after_dedup)

(150000, 12)


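There are no duplicated entries in the dataset: its shape is (150000, 12) both before and after dropping duplicates. Note that this check compares every column, including `borrower_id`, so it only detects rows that are identical in all fields.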

## Descriptive analysis

In [33]:
# Summary statistics for the DataFrame, extended with tail percentiles
# (1% through 99%) and rounded to two decimals for display.
summary_percentiles = [.01, .05, .10, .25, .50, .75, .90, .95, .99]
credit_data.describe(percentiles=summary_percentiles).round(2)

Unnamed: 0,borrower_id,serious_dlqin2yrs,revolving_utilization_of_unsecured_lines,age,number_of_time30-59_days_past_due_not_worse,debt_ratio,monthly_income,number_of_open_credit_lines_and_loans,number_of_times90_days_late,number_real_estate_loans_or_lines,number_of_time60-89_days_past_due_not_worse,number_of_dependents
count,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,120269.0,150000.0,150000.0,150000.0,150000.0,150000.0
mean,75000.5,0.07,6.05,52.3,0.42,353.01,6670.22,8.45,0.27,1.02,0.24,0.74
std,43301.41,0.25,249.76,14.77,4.19,2037.82,14384.67,5.15,4.17,1.13,4.16,1.11
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1%,1500.99,0.0,0.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5%,7500.95,0.0,0.0,29.0,0.0,0.0,1300.0,2.0,0.0,0.0,0.0,0.0
10%,15000.9,0.0,0.0,33.0,0.0,0.03,2005.0,3.0,0.0,0.0,0.0,0.0
25%,37500.75,0.0,0.03,41.0,0.0,0.18,3400.0,5.0,0.0,0.0,0.0,0.0
50%,75000.5,0.0,0.15,52.0,0.0,0.37,5400.0,8.0,0.0,1.0,0.0,0.0
75%,112500.25,0.0,0.56,63.0,0.0,0.87,8249.0,11.0,0.0,2.0,0.0,1.0
