In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

In [44]:
df = pd.read_csv('loan_cleaned.csv')

In [45]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,loan_amount,funded_amount,term(month),interest_rate,grade,emp_title,emp_length,annual_income,issue_time,loan_status,purpose,addr_state,inquries_fn,inquries_lastyear,year
0,0,5000.0,5000.0,36,10.65,B,other,10+ years,24000.0,2011-12-01,Fully Paid,credit_card,AZ,,,2011
1,1,2500.0,2500.0,60,15.27,C,Ryder,< 1 year,30000.0,2011-12-01,Charged Off,car,GA,,,2011
2,2,2400.0,2400.0,36,15.96,C,other,10+ years,12252.0,2011-12-01,Fully Paid,small_business,IL,,,2011
3,3,10000.0,10000.0,36,13.49,C,AIR RESOURCES BOARD,10+ years,49200.0,2011-12-01,Fully Paid,other,CA,,,2011
4,4,3000.0,3000.0,60,12.69,B,University Medical Group,1 year,80000.0,2011-12-01,Current,other,OR,,,2011


In [46]:
#import seaborn as sns; sns.set(color_codes=True)
fig, ax = plt.subplots(1, 2, figsize=(16,5))
sns.distplot(df['loan_amount'], ax=ax[0], color='r')
sns.distplot(df['funded_amount'], ax=ax[1], color='g')


The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg.


The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg.



<matplotlib.axes._subplots.AxesSubplot at 0x1a469aedd8>

## Good Loan & Bad Loan

In [47]:
df["loan_status"].value_counts()

Current                                                601779
Fully Paid                                             207723
Charged Off                                             45248
Late (31-120 days)                                      11591
Issued                                                   8460
In Grace Period                                          6253
Late (16-30 days)                                        2357
Does not meet the credit policy. Status:Fully Paid       1988
Default                                                  1219
Does not meet the credit policy. Status:Charged Off       761
Name: loan_status, dtype: int64

In [50]:
bad_loan = ["Charged Off",  "Late (16-30 days)", "Late (31-120 days)", "Does not meet the credit policy. Status:Charged Off", "In Grace Period"]

df['loan_evaluation'] = np.nan

def decide(status):
    if status in bad_loan:
        return 'Bad Loan'
    else:
        return 'Good Loan'
    
df['loan_evaluation'] = df['loan_status'].apply(decide)

In [51]:
df['loan_evaluation'].value_counts()

Good Loan    821169
Bad Loan      66210
Name: loan_evaluation, dtype: int64

In [52]:
sns.countplot(x="year", hue="loan_evaluation", data=df)

<matplotlib.axes._subplots.AxesSubplot at 0x1a469aedd8>

In [21]:
by_income = df.groupby(['addr_state'], as_index=False).annual_income.mean()

# Take the values to a list for visualization purposes.
states = by_income['addr_state'].values.tolist()
average_annual_income = by_income['annual_income'].values.tolist()

from collections import OrderedDict
metrics_data = OrderedDict([('state_codes', states),
                            ('annual_income', average_annual_income)])

metrics_df = pd.DataFrame.from_dict(metrics_data)
metrics_df = metrics_df.round(decimals=2)
metrics_df.head()

Unnamed: 0,state_codes,annual_income
0,AK,77009.76
1,AL,68532.35
2,AR,65818.68
3,AZ,71642.65
4,CA,79294.34


In [25]:
import plotly.plotly as py
import plotly.graph_objs as go
plotly.tools.set_credentials_file(username='xuxinzju1991', api_key='lQfhhDgTekwVCAXaNpMm')

for col in metrics_df.columns:
    metrics_df[col] = metrics_df[col].astype(str)
    
scl = [[0.0, 'rgb(210, 241, 198)'],[0.2, 'rgb(188, 236, 169)'],[0.4, 'rgb(171, 235, 145)'],\
            [0.6, 'rgb(140, 227, 105)'],[0.8, 'rgb(105, 201, 67)'],[1.0, 'rgb(59, 159, 19)']]

data = [ dict(
        type='choropleth',
        colorscale = scl,
        autocolorscale = False,
        locations = metrics_df['state_codes'],
        z = metrics_df['annual_income'], 
        locationmode = 'USA-states',
        text = metrics_df['state_codes'],
        marker = dict(
            line = dict (
                color = 'rgb(255,255,255)',
                width = 2
            ) ),
        colorbar = dict(
            title = "$s USD")
        ) ]
layout = dict(
    title = 'Lending Clubs Client Income',
    geo = dict(
        scope = 'usa',
        projection=dict(type='albers usa'),
        showlakes = True,
        lakecolor = 'rgb(255, 255, 255)')
)

fig = dict(data=data, layout=layout)
py.iplot(fig, filename='d3-cloropleth-map')

High five! You successfully sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~xuxinzju1991/0 or inside your plot.ly account where it is named 'd3-cloropleth-map'


In [53]:
df['income_category'] = np.nan
lst = [df]

for col in lst:
    col.loc[col['annual_income'] <= 100000, 'income_category'] = 'Low'
    col.loc[(col['annual_income'] > 100000) & (col['annual_income'] <= 200000), 'income_category'] = 'Medium'
    col.loc[col['annual_income'] > 200000, 'income_category'] = 'High'

In [58]:
lst = [df]
df['loan_condition_int'] = np.nan
for col in lst:
    col.loc[df['loan_evaluation'] == 'Good Loan', 'loan_condition_int'] = 0 # Negative (Bad Loan)
    col.loc[df['loan_evaluation'] == 'Bad Loan', 'loan_condition_int'] = 1 # Positive (Good Loan)
    
# Convert from float to int the column (This is our label)  
df['loan_condition_int'] = df['loan_condition_int'].astype(int)

In [64]:
fig, (ax1, ax2, ax3)= plt.subplots(nrows=1, ncols=3, figsize=(14,6))

# Change the Palette types tomorrow!

sns.violinplot(x="income_category", y="loan_amount", data=df, palette="Set2", ax=ax1 )
sns.violinplot(x="income_category", y="loan_condition_int", data=df, palette="Set2", ax=ax2)
#sns.boxplot(x="income_category", y="emp_length_int", data=df, palette="Set2", ax=ax3)
sns.boxplot(x="income_category", y="interest_rate", data=df, palette="Set2", ax=ax3)

<matplotlib.axes._subplots.AxesSubplot at 0x1a17cbf630>

In [65]:
df['purpose'].value_counts()

debt_consolidation    524215
credit_card           206182
home_improvement       51829
other                  42894
major_purchase         17277
small_business         10377
car                     8863
medical                 8540
moving                  5414
vacation                4736
house                   3707
wedding                 2347
renewable_energy         575
educational              423
Name: purpose, dtype: int64

In [88]:
round(pd.crosstab(df['loan_evaluation'], df['purpose']).apply(lambda x: x/x.sum() * 100), 2)

purpose,car,credit_card,debt_consolidation,educational,home_improvement,house,major_purchase,medical,moving,other,renewable_energy,small_business,vacation,wedding
loan_evaluation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Bad Loan,6.61,5.59,7.79,20.8,6.76,11.06,7.41,9.24,11.01,9.55,12.52,16.98,8.32,12.31
Good Loan,93.39,94.41,92.21,79.2,93.24,88.94,92.59,90.76,88.99,90.45,87.48,83.02,91.68,87.69


In [98]:
purpose_condition = round(pd.crosstab(df['loan_evaluation'], df['purpose']).apply(lambda x: x/x.sum() * 100), 2)

purpose_bad_loans = purpose_condition.values[0].tolist()
purpose_good_loans = purpose_condition.values[1].tolist()
purpose = purpose_condition.columns
bad_plot = go.Bar(
    x=purpose,
    y=purpose_bad_loans,
    name = 'Bad Loans',
    text='%',
    marker=dict(
        color='rgba(219, 64, 82, 0.7)',
        line = dict(
            color='rgba(219, 64, 82, 1.0)',
            width=2
        )
    )
)
good_plot = go.Bar(
    x=purpose,
    y=purpose_good_loans,
    name='Good Loans',
    text='%',
    marker=dict(
        color='rgba(50, 171, 96, 0.7)',
        line = dict(
            color='rgba(50, 171, 96, 1.0)',
            width=2
        )
    )
)
data = [bad_plot, good_plot]

layout = go.Layout(
    title='Condition of Loan by Purpose',
    xaxis=dict(
        title=''
    ),
    yaxis=dict(
        title='% of the Loan',
    ),
    showlegend=True
)

fig = dict(data=data, layout=layout)
py.iplot(fig, filename='condition_purposes')

In [107]:
df.to_csv('loan_for_ML.csv')