In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer

%matplotlib inline
%config InlineBackend.figure_format = 'svg'

In [17]:
TRAIN_DATASET_PATH = '/home/alex/Downloads/course_project_train.csv'
TEST_DATASET_PATH = '/home/alex/Downloads/course_project_test.csv'

# Обзор данных

In [48]:
df = pd.read_csv(TRAIN_DATASET_PATH)
df.head()

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Purpose,Term,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Credit Default
0,Own Home,482087.0,,0.0,11.0,26.3,685960.0,1.0,,1.0,debt consolidation,Short Term,99999999.0,47386.0,7914.0,749.0,0
1,Own Home,1025487.0,10+ years,0.0,15.0,15.3,1181730.0,0.0,,0.0,debt consolidation,Long Term,264968.0,394972.0,18373.0,737.0,1
2,Home Mortgage,751412.0,8 years,0.0,11.0,35.0,1182434.0,0.0,,0.0,debt consolidation,Short Term,99999999.0,308389.0,13651.0,742.0,0
3,Own Home,805068.0,6 years,0.0,8.0,22.5,147400.0,1.0,,1.0,debt consolidation,Short Term,121396.0,95855.0,11338.0,694.0,0
4,Rent,776264.0,8 years,0.0,13.0,13.6,385836.0,1.0,,0.0,debt consolidation,Short Term,125840.0,93309.0,7180.0,719.0,0


In [93]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 17 columns):
Home Ownership                  7500 non-null object
Annual Income                   7500 non-null float64
Years in current job            7500 non-null object
Tax Liens                       7500 non-null float64
Number of Open Accounts         7500 non-null float64
Years of Credit History         7500 non-null float64
Maximum Open Credit             7500 non-null float64
Number of Credit Problems       7500 non-null float64
Months since last delinquent    7500 non-null float64
Bankruptcies                    7500 non-null float64
Purpose                         7500 non-null object
Term                            7500 non-null object
Current Loan Amount             7500 non-null float64
Current Credit Balance          7500 non-null float64
Monthly Debt                    7500 non-null float64
Credit Score                    7500 non-null float64
Credit Default                  7

#### Целевая переменная

In [20]:
df['Credit Default'].value_counts()

0    5387
1    2113
Name: Credit Default, dtype: int64

#### Числовые переменные

In [96]:
df.describe()

Unnamed: 0,Annual Income,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Credit Default
count,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0
mean,1366392.0,0.030133,11.130933,18.317467,945153.7,0.17,34.6926,0.116933,309088.029067,289833.2,18314.454133,720.086993,0.281733
std,752481.2,0.271604,4.908924,7.041946,16026220.0,0.498598,14.642661,0.346904,171393.740131,317871.4,11926.764673,24.886214,0.449874
min,164597.0,0.0,2.0,4.0,0.0,0.0,0.0,0.0,11242.0,0.0,0.0,585.0,0.0
25%,931133.0,0.0,8.0,13.5,279229.5,0.0,34.6926,0.0,180169.0,114256.5,10067.5,715.0,0.0
50%,1366392.0,0.0,10.0,17.0,478159.0,0.0,34.6926,0.0,309567.5,209323.0,16076.5,720.086993,0.0
75%,1499974.0,0.0,14.0,21.8,793501.5,0.0,34.6926,0.0,396929.5,360406.2,23818.0,738.0,1.0
max,10149340.0,7.0,43.0,57.7,1304726000.0,7.0,118.0,4.0,789030.0,6506797.0,136679.0,751.0,1.0


#### Категориальные переменные

In [95]:
for col_name in df.select_dtypes(include='object').columns:
    print(col_name)
    print()
    print(df[col_name].value_counts())
    print()
    print()

Home Ownership

Home Mortgage    3637
Rent             3204
Own Home          647
Have Mortgage      12
Name: Home Ownership, dtype: int64


Years in current job

10+ years    2703
2 years       705
3 years       620
< 1 year      563
5 years       516
1 year        504
4 years       469
6 years       426
7 years       396
8 years       339
9 years       259
Name: Years in current job, dtype: int64


Purpose

debt consolidation      5944
other                    665
home improvements        412
business loan            129
buy a car                 96
medical bills             71
major purchase            40
take a trip               37
buy house                 34
small business            26
wedding                   15
moving                    11
educational expenses      10
vacation                   8
renewable energy           2
Name: Purpose, dtype: int64


Term

Short Term    5556
Long Term     1944
Name: Term, dtype: int64




### Обработаем явные выбросы в обучающей выборке

In [66]:
df.loc[df['Credit Score']>1000,'Credit Score']=df['Credit Score']*0.1
df.loc[df['Current Loan Amount']>1.000000e+06, 'Current Loan Amount']=df['Current Loan Amount'].median()

### Заполнение пропущенных данных

In [99]:
def fill_missing_value(df, source_df):
    df['Credit Score']=df['Credit Score'].fillna(source_df['Credit Score'].mean())
    df['Annual Income']=df['Annual Income'].fillna(source_df['Annual Income'].mean())
    df['Bankruptcies']=df['Bankruptcies'].fillna(source_df['Bankruptcies'].median())
    df['Months since last delinquent']=df['Months since last delinquent'].fillna(source_df['Months since last delinquent'].mean())
    imr = SimpleImputer(strategy="most_frequent")
    imr = imr.fit(df[['Years in current job']])
    df['Years in current job'] = imr.transform(df[['Years in current job']]).ravel()
    return df



In [100]:
df = fill_missing_value(df,df)