In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found,
# one path per line, directory by directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    listing = [os.path.join(folder, name) for name in names]
    if listing:
        print(*listing, sep='\n')

# Any results you write to the current directory are saved as output.

## Context
Improve on the state of the art in credit scoring by predicting the probability that somebody will experience financial distress in the next two years.

Banks play a crucial role in market economies. They decide who can get finance and on what terms and can make or break investment decisions. For markets and society to function, individuals and companies need access to credit. 

Credit scoring algorithms, which make a guess at the probability of default, are the method banks use to determine whether or not a loan should be granted. This competition requires participants to improve on the state of the art in credit scoring, by predicting the probability that somebody will experience financial distress in the next two years.

The goal of this competition is to build a model that borrowers can use to help make the best financial decisions.

Historical data are provided on 250,000 borrowers and the prize pool is $5,000 ($3,000 for first, $1,500 for second and $500 for third).

[https://www.kaggle.com/c/GiveMeSomeCredit](https://www.kaggle.com/c/GiveMeSomeCredit)

In [None]:
# Read the sample submission, the training set and the test set (in that
# order) from the competition's input folder.
sample, credit_train, credit_test = map(
    pd.read_csv,
    (
        '/kaggle/input/GiveMeSomeCredit/sampleEntry.csv',
        '/kaggle/input/GiveMeSomeCredit/cs-training.csv',
        '/kaggle/input/GiveMeSomeCredit/cs-test.csv',
    ),
)

## Data Exploration

In [None]:
# Column names, non-null counts and dtypes of the training frame.
credit_train.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) per numeric column.
credit_train.describe()

In [None]:
# First five rows, to eyeball the raw values.
credit_train.head()

## EDA

In [None]:
# Scatter/histogram grid over every pair of numeric columns.
# NOTE(review): this draws the full frame for every column pair, which can be
# slow on a large training set - consider plotting a sample if it is.
sns.pairplot(credit_train)

In [None]:
# Column labels of the training frame, for reference when selecting features.
credit_train.columns

In [None]:
# Draw the boolean "is missing" mask of the training frame as a heatmap so
# that columns containing NaNs stand out as streaks.
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(credit_train.isna(), yticklabels=False, cbar=False, ax=ax)

As the heatmap shows, some rows are missing values in the number-of-dependents (`NumberOfDependents`) and monthly-income (`MonthlyIncome`) columns.

In [None]:
# Joint distribution of borrower age against the delinquency target.
# jointplot is a figure-level function: it always creates its own (square)
# figure, so a preceding plt.figure(figsize=...) would only leave an empty
# extra figure behind. The size is therefore set through `height` instead.
sns.jointplot(data=credit_train, x='age', y='SeriousDlqin2yrs', height=10)

In [None]:
sns.distplot(credit_train['age'].dropna(),kde=False,color='darkred',bins=30)

In [None]:
sns.distplot(credit_train['DebtRatio'].dropna(),kde=False,color='darkred',bins=500)

In [None]:
sns.distplot(credit_train['MonthlyIncome'].dropna(),kde=False,color='darkred',bins=30)

In [None]:
# Pairwise correlation between all columns of the training frame.
credit_train.corr()

In [None]:
# First five rows again, as a reminder of the layout before imputation.
credit_train.head()

## Filling the missing data

In [None]:
# Number of missing values per column in the training set.
print(credit_train.isnull().sum())

In [None]:
# Number of missing values per column in the test set.
print(credit_test.isnull().sum())

In [None]:
credit_train['MonthlyIncome'].fillna(credit_train['MonthlyIncome'].mean(),inplace=True)

In [None]:
credit_test['MonthlyIncome'].fillna(credit_test['MonthlyIncome'].mean(),inplace=True)

In [None]:
credit_train['NumberOfDependents'].fillna(credit_train['NumberOfDependents'].mode()[0], inplace=True)

In [None]:
credit_test['NumberOfDependents'].fillna(credit_test['NumberOfDependents'].mode()[0], inplace=True)

In [None]:
credit_test['MonthlyIncome'].fillna(credit_test['MonthlyIncome'].mean(),inplace=True)

In [None]:
# Re-draw the "is missing" mask of the training frame; after imputation the
# previously affected columns should no longer show any marks.
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(credit_train.isna(), yticklabels=False, cbar=False, ax=ax)

In [None]:
# Annotated heatmap of the pairwise correlations in the training frame.
correlations = credit_train.corr()
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(correlations, annot=True, ax=ax)

In [None]:
# Display the training frame after imputation (pandas truncates long frames).
credit_train

In [None]:
# Display the test frame after imputation.
credit_test

In [None]:
# Missing-value count per training column, re-checked after the fills above.
print(credit_train.isnull().sum())

In [None]:
# Missing-value count per test column, re-checked after the fills above.
print(credit_test.isnull().sum())

## Model Training

In [None]:
# Predictor columns, declared once so the training and test matrices are
# guaranteed to use the same features in the same order.
feature_columns = [
    'RevolvingUtilizationOfUnsecuredLines',
    'age',
    'NumberOfTime30-59DaysPastDueNotWorse',
    'DebtRatio',
    'MonthlyIncome',
    'NumberOfOpenCreditLinesAndLoans',
    'NumberOfTimes90DaysLate',
    'NumberRealEstateLoansOrLines',
    'NumberOfTime60-89DaysPastDueNotWorse',
    'NumberOfDependents',
]
target_column = 'SeriousDlqin2yrs'

X_train = credit_train[feature_columns]
y_train = credit_train[target_column]
X_test = credit_test[feature_columns]

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees. random_state pins the bootstrap samples and
# feature sub-sampling so a Restart & Run All reproduces the same model and
# predictions; n_jobs=-1 builds the trees on all available cores and does not
# affect the fitted result.
rfc = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rfc.fit(X_train, y_train)

In [None]:
# Score the test features with the fitted forest: per-class probabilities
# (one column per class) and the corresponding hard class labels.
rfc_proba = rfc.predict_proba(X_test)
rfc_pred = rfc.predict(X_test)

In [None]:
df=pd.DataFrame(rfc_proba,columns=['Id','Probability'])

In [None]:
df.head()

In [None]:
ind=credit_train['Unnamed: 0']
df['Id']=ind

In [None]:
df.head()

In [None]:
export_csv = df.to_csv('credit_score_random_forest.csv',index = None,header=True)