In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

# LENDING CLUB

https://www.kaggle.com/janiobachmann/lending-club-risk-analysis-and-metrics

https://www.kaggle.com/wendykan/lending-club-loan-data?select=loan.csv

https://www.lendingclub.com/business/landing/

https://help.lendingclub.com/hc/en-us/articles/216127897-What-happens-when-a-loan-is-charged-off-

In [None]:
# Load the raw Lending Club loan table. low_memory=False makes pandas parse the
# file in one pass instead of chunk-by-chunk.
df = pd.read_csv('../data/USA/LC/loan.csv', low_memory=False)

df.head()

In [None]:
# Column names, dtypes and non-null counts of the raw table.
df.info()

In [None]:
# (rows, columns) of the raw table.
df.shape

In [None]:
# Let's derive the issue year from the issue date so loans can be grouped by year.
dt_series = pd.to_datetime(df['issue_d'])
df['year'] = dt_series.dt.year

In [None]:
# Number of loans per issue year.
df.year.value_counts()

In [None]:
# Number of loans in each raw loan status.
df["loan_status"].value_counts()

In [None]:
# Statuses treated as a bad outcome. Grace-period loans are deliberately not
# listed here, so they end up in the 'Active' bucket below.
bad_loan = ["Charged Off", "Default", "Does not meet the credit policy. Status:Charged Off",
            "Late (16-30 days)", "Late (31-120 days)"]

# Only fully repaid loans count as good; current loans are not used.
good_loan = ["Fully Paid"]


def loan_condition(status):
    """Map a raw ``loan_status`` value to 'Bad Loan', 'Good Loan' or 'Active'.

    Any status listed in neither ``bad_loan`` nor ``good_loan`` is reported
    as 'Active'.
    """
    if status in bad_loan:
        return 'Bad Loan'
    if status in good_loan:
        return 'Good Loan'
    return 'Active'


# The column is created directly from the mapping; no placeholder column is needed first.
df['loan_condition'] = df['loan_status'].apply(loan_condition)

In [None]:
# Number of loans per derived condition (Good Loan / Bad Loan / Active).
df["loan_condition"].value_counts()

In [None]:
# Fixed category order: wedge labels, colours and bar hues are tied to the loan
# condition itself instead of to whatever order the counts or rows happen to have.
condition_order = ["Good Loan", "Active", "Bad Loan"]
colors = ["#3791D7", "#7FFF00", "#D72626"]
labels = ["Good Loans", "Active Loans", "Bad Loans"]
palette = dict(zip(condition_order, colors))

f, ax = plt.subplots(1, 2, figsize=(16, 6))
plt.suptitle('Information on Loan Conditions', fontsize=20)

# Left panel: share of each loan condition over the whole data set.
condition_counts = df["loan_condition"].value_counts().reindex(condition_order, fill_value=0)
condition_counts.plot.pie(explode=[0, 0.05, 0.20], autopct='%1.2f%%', ax=ax[0], shadow=True,
                          colors=colors, labels=labels, fontsize=12, startangle=70)
ax[0].set_ylabel('% of Condition of Loans', fontsize=14)

# Right panel: per year and condition, the number of loans as a percentage of all
# loans (the estimator counts rows; loan_amnt only provides the values to count).
# ax=ax[1] binds the plot to the right panel explicitly instead of relying on
# matplotlib's "current axes".
sns.barplot(x="year", y="loan_amnt", hue="loan_condition", hue_order=condition_order,
            data=df, palette=palette,
            estimator=lambda x: len(x) / len(df) * 100, ax=ax[1])
ax[1].set(ylabel="(%)")

In [None]:
# Keep only loans that are not 'Active'. reset_index() without drop=True
# keeps the previous row labels as an extra 'index' column.
df2 = df[df.loan_condition != 'Active'].copy().reset_index()

In [None]:
# Distinct borrower state codes present in df2.
df2['addr_state'].unique()

In [None]:
# Ten states with the largest total loan amount in df2, largest first.
states = df2[['addr_state', "loan_amnt"]].groupby(['addr_state']).sum().sort_values(by=['loan_amnt'], ascending=False).head(10)
states

In [None]:
f, ax = plt.subplots(1,2, figsize=(16,8))

cmap = plt.cm.coolwarm
cmap2 = plt.cm.coolwarm_r  # defined but not used in this cell

# Left panel: loan counts per grade, stacked, for the states listed in `states`.
loans_by_regGrade = df2.groupby(['addr_state', 'grade']).size()[list(states.index)]
loans_by_regGrade.unstack().plot(kind='bar', stacked=True, colormap=cmap, ax=ax[0], grid=False)

# Right panel: loan counts per loan condition, stacked, for the same states.
loans_by_regCond = df2.groupby(['addr_state', 'loan_condition']).size()[list(states.index)]
loans_by_regCond.unstack().plot(kind='bar', stacked=True, colormap=cmap, ax=ax[1], grid=False)

In [None]:
f, ax = plt.subplots(1,1, figsize=(16,8))

cmap = plt.cm.coolwarm_r

# Grade A/B loans only: counts per (state, loan condition) for the states in `states`.
loans_by_regCond = df2[df2.grade.isin(['A','B'])].groupby(['addr_state', 'loan_condition']).size()[list(states.index)]
axes = loans_by_regCond.unstack().plot(kind='bar', stacked=True, colormap=cmap, ax=ax, grid=False)

# Share of each condition within its state. Dividing by transform('sum') keeps the
# original (addr_state, loan_condition) index on every pandas version, whereas
# groupby(...).apply(...) prepends the group key on recent pandas and would make
# the level=1 lookup below fail.
state_pcts = loans_by_regCond / loans_by_regCond.groupby(level=0).transform('sum')
v = list(state_pcts.xs('Bad Loan', level=1, drop_level=False))

def autolabel(axes, v, y_offset=1000):
    """Write a percentage label above each stacked bar of a two-series bar chart.

    Parameters
    ----------
    axes : object exposing ``patches`` and ``text`` (e.g. a matplotlib Axes)
        ``axes.patches`` must hold the bars of the first series followed by
        the bars of the second series, ``len(v)`` bars each.
    v : sequence of float
        One fraction (0-1) per bar position; rendered as a percentage
        rounded to two decimals.
    y_offset : number, optional
        Vertical gap, in data units, between the top of the stack and the label.

    Raises
    ------
    IndexError
        If ``axes.patches`` holds fewer than ``2 * len(v)`` bars.
    """
    n_bars = len(v)
    for i, share in enumerate(v):
        # Height of the full stack = lower segment + upper segment at position i.
        height = axes.patches[i].get_height() + axes.patches[i + n_bars].get_height()
        # Labels go on the axes that was passed in, not on a notebook global.
        axes.text(i - 0.2, y_offset + height, '{}%'.format(round(share * 100, 2)))

# Annotate each state's bar with its share of bad loans.
autolabel(axes, v)


In [None]:
# Number of loans in df2 per borrower state.
df2.addr_state.value_counts()

Candidate features for finding similar borrowers: annual income, employment length, loan amount, home ownership, and year of the loan.

# Pre Process Functions

In [None]:
from sklearn import base

class ColumnSelectTransformer(base.BaseEstimator, base.TransformerMixin):
    """Select feature columns together with the ``loan_condition`` target.

    Unlike a conventional scikit-learn transformer, ``transform`` returns a
    ``(features, target)`` pair and may return fewer rows than it received,
    because rows with a missing value in any selected column are dropped.
    """

    def __init__(self, col_names):
        self.col_names = col_names  # feature columns picked in transform()

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return ``(X[col_names], X['loan_condition'])`` for the complete rows of ``X``.

        ``X`` must be a DataFrame containing every name in ``col_names`` plus
        ``'loan_condition'``. Both returned objects share a fresh 0..n-1 index.
        """
        # list(...) lets col_names be any sequence (tuple, Index), not only a list.
        feature_cols = list(self.col_names)
        complete = X[feature_cols + ['loan_condition']].dropna().reset_index(drop=True)
        return complete[feature_cols], complete['loan_condition']

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

class ScalerEncoderTransformer(base.BaseEstimator, base.TransformerMixin):
    """Standard-scale the non-object columns and one-hot encode the object columns.

    The column split, the scaler and the encoder are all learned in ``fit``.
    ``transform`` returns one dense array: scaled columns first, then the
    one-hot columns.
    """

    def __init__(self):
        self.orColumns = []        # all input columns seen in fit()
        self.scColumns = []        # columns passed to the StandardScaler
        self.encColumns = []       # columns passed to the OneHotEncoder
        self.scaler = None
        self.encoder = None
        self.feature_Names = None  # output column names, in output order

    def fit(self, X, y=None):
        """Learn the column split and fit the scaler and encoder on DataFrame ``X``."""
        self.orColumns = X.columns
        # Every column whose dtype is not 'object' is scaled; the rest are encoded.
        self.scColumns = [col for col, dtype in zip(X.columns, X.dtypes) if dtype != 'object']
        self.encColumns = [col for col in X.columns if col not in self.scColumns]
        self.scaler = StandardScaler()
        # handle_unknown='ignore': categories unseen in fit encode as all zeros.
        self.encoder = OneHotEncoder(handle_unknown='ignore')
        self.scaler.fit(X[self.scColumns])
        self.encoder.fit(X[self.encColumns])
        # get_feature_names was replaced by get_feature_names_out in newer
        # scikit-learn releases; support both.
        if hasattr(self.encoder, 'get_feature_names_out'):
            encoded_names = self.encoder.get_feature_names_out(self.encColumns)
        else:
            encoded_names = self.encoder.get_feature_names(self.encColumns)
        self.feature_Names = self.scColumns + list(encoded_names)
        return self

    def transform(self, X):
        """Return the scaled and one-hot encoded columns of ``X`` side by side as one array."""
        scaled = self.scaler.transform(X[self.scColumns])
        encoded = self.encoder.transform(X[self.encColumns]).toarray()
        return np.concatenate((scaled, encoded), axis=1)
       

# Predict

We can create a database y_SB, X = [default_knn_SA, SAB, DAB] 

We must build a similarity index SAB between states A, B.

We must build a direction value (AB vs. BA)

### Create Compare database

In [None]:
# Quick look at the first two rows of df2.
df2.head(2)

Let's use only the first 30 states:

In [None]:
# State codes that take part in the cross-state comparison.
possible_states = ['CA','TX','NY','FL','IL','NJ','PA','OH','GA',
    'VA','NC','MI','AZ','MD','MA','CO','WA','MN','IN','MO','TN','NV','CT','WI','AL','OR','SC','LA','KY','OK']

Let's create a comparison list cmp_list = \[index, curr_state, compare_state\], where compare_state is a randomly drawn state different from curr_state.

In [None]:
def sample_state(state):
    """Pair ``state`` with a randomly drawn, different state from ``possible_states``.

    Returns ``(state, other_state)``, or ``(None, None)`` when ``state`` is not
    one of ``possible_states``. The draw uses NumPy's global random state.
    """
    if state not in possible_states:
        return None, None
    candidates = [other for other in possible_states if other != state]
    picked = np.random.choice(candidates, 1)[0]
    return state, picked

In [None]:
# One [row label, current state, comparison state] triple per loan in df2.
cmp_list = [
    [row_label, current, compared]
    for row_label, (current, compared) in zip(df2.index, df2.addr_state.apply(sample_state))
]

In [None]:
# First ten comparison triples.
cmp_list[:10]

## Build Necessary Models


### Build a model for one person from Texas using California data

Build state databases, fit neighbors in that database, keep the reference in a dictionary:

In [None]:
# Borrower features used to search for similar people; cst selects these columns
# (plus the loan_condition target) from a loans frame.
knnvars = ['annual_inc','emp_length','loan_amnt','home_ownership','term', 'year']
cst = ColumnSelectTransformer(knnvars)

We use 100 neighbors:

In [None]:
dKnn = {}  # state code -> [fitted NearestNeighbors, targets, fitted ScalerEncoderTransformer]
n = 100  # number of neighbours retrieved per query

In [None]:
from sklearn.neighbors import NearestNeighbors
# For every state: select its loans, fit a scaler/encoder on them, and index the
# encoded rows for nearest-neighbour search. The targets y and the fitted
# scaler/encoder are stored next to the index so queries can be encoded the same way.
for s in possible_states:
    df_s = df2[df2.addr_state == s]
    X,y = cst.transform(df_s)
    cst2 = ScalerEncoderTransformer()    
    X2 = cst2.fit_transform(X)
    dKnn[s] = [NearestNeighbors(n_neighbors=n).fit(X2), y, cst2]  

Select a texas person:

In [None]:
# iloc[[10]] (list index) keeps the result a one-row DataFrame instead of a Series.
txPerson = df2[df2.addr_state == 'TX'].iloc[[10]]
txPerson

Select the features we need:

In [None]:
# Keep only the feature columns; the target returned alongside is discarded.
txPerson, _ = cst.transform(txPerson)

We standardize the features using the scaler and encoder fitted on California data:

In [None]:
# Encode with California's fitted scaler/encoder; [0] takes the single encoded row.
txPerson2 = dKnn['CA'][2].transform(txPerson)[0]

Now we select 100 people who are similar, using California data:

In [None]:
# Query California's neighbour index; kneighbors expects a 2-D input, hence the list.
distance, indices = dKnn['CA'][0].kneighbors([txPerson2])

We check the default vs non default in their outcome:

In [None]:
# Row positions of the neighbours of the single query, then their loan outcomes.
simPplndex = indices[0]
dKnn['CA'][1][simPplndex].value_counts()

The default probability is then:

In [None]:
# Fraction of the neighbours whose loan ended as 'Bad Loan'.
sum(dKnn['CA'][1][simPplndex]=='Bad Loan')/len(dKnn['CA'][1][simPplndex])

The predicted probability of default is 24%

### Build a check model  (predict Texas using 1 Texas)

First we must **adjust our df for NAs**

In [None]:
knnvars = ['annual_inc','emp_length','loan_amnt','home_ownership','term', 'year']
possible_states = ['CA','TX','NY','FL','IL','NJ','PA','OH','GA',
    'VA','NC','MI','AZ','MD','MA','CO','WA','MN','IN','MO','TN','NV','CT','WI','AL','OR','SC','LA','KY','OK']

# Remove every loan with a missing value in any of the neighbour-search features.
df2 = df2.dropna(subset=knnvars)

We set the number of neighbors to 1 and fit the model:

In [None]:
dKnn = {}  # state code -> [fitted NearestNeighbors, targets, fitted ScalerEncoderTransformer]
n = 1  # a single neighbour per query

In [None]:
from sklearn.neighbors import NearestNeighbors

cst = ColumnSelectTransformer(knnvars)
# For every state: select its loans, fit a scaler/encoder on them, and index the
# encoded rows for nearest-neighbour search, keeping targets and encoder alongside.
for s in possible_states:
    df_s = df2[df2.addr_state == s]
    X,y = cst.transform(df_s)
    cst2 = ScalerEncoderTransformer()    
    X2 = cst2.fit_transform(X)
    dKnn[s] = [NearestNeighbors(n_neighbors=n).fit(X2), y, cst2] 

We select the first 100 people from Texas (Note we extract our true **y** from them):

In [None]:
# First 100 Texas loans, split into feature columns and the true outcome y.
txPersons = df2[df2.addr_state == 'TX'].head(100)
txPersons,y = cst.transform(txPersons)

We predict using our model for these 100 people:

In [None]:
# Score each person against the Texas model: encode the row, fetch its
# neighbours, and use the fraction of 'Bad Loan' neighbours as the score.
probs = []
for i in range(txPersons.shape[0]):
    Person = txPersons.iloc[[i]]
    Person2 = dKnn['TX'][2].transform(Person)[0]
    distance, indices = dKnn['TX'][0].kneighbors([Person2])
    simPplndex = indices[0]    
    p = sum(dKnn['TX'][1][simPplndex]=='Bad Loan')/len(dKnn['TX'][1][simPplndex]) 
    probs.append(p)  
    
y_score = np.array(probs)

We transform original y to the 0-1 space

In [None]:
# Encode the outcome as 0 (good) / 1 (bad). y is rebound from a Series to an
# ndarray, so this cell cannot be re-run on its own (an ndarray has no .map).
y = np.array(y.map({'Good Loan':0, 'Bad Loan':1}))

And we construct the ROC curve:

In [None]:
from sklearn.metrics import roc_curve, auc
# ROC curve of the scores against the true outcomes; 1 ('Bad Loan') is the positive class.
fpr, tpr, thresholds = roc_curve(y, y_score, pos_label=1)
roc_auc = auc(fpr, tpr)

import matplotlib.pyplot as plt
plt.figure()
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal reference line (TPR == FPR).
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

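As expected, we obtain a perfect model: with a single neighbor, and with the queried Texas borrowers being part of the data the Texas model was fitted on, each person's nearest neighbor is normally that same person, so the prediction simply reproduces the known outcome.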

### Make a knn predict model in sklearn

Select a texas person:

In [None]:
import warnings
from sklearn.exceptions import DataConversionWarning
# Suppress scikit-learn's DataConversionWarning in the cells that follow.
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

In [None]:
from sklearn import base

class ColumnSelectTransformer(base.BaseEstimator, base.TransformerMixin):
    """Select feature columns together with the ``loan_condition`` target.

    Unlike a conventional scikit-learn transformer, ``transform`` returns a
    ``(features, target)`` pair and may return fewer rows than it received,
    because rows with a missing value in any selected column are dropped.
    """

    def __init__(self, col_names):
        self.col_names = col_names  # feature columns picked in transform()

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return ``(X[col_names], X['loan_condition'])`` for the complete rows of ``X``.

        ``X`` must be a DataFrame containing every name in ``col_names`` plus
        ``'loan_condition'``. Both returned objects share a fresh 0..n-1 index.
        """
        # list(...) lets col_names be any sequence (tuple, Index), not only a list.
        feature_cols = list(self.col_names)
        complete = X[feature_cols + ['loan_condition']].dropna().reset_index(drop=True)
        return complete[feature_cols], complete['loan_condition']

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

class ScalerEncoderTransformer(base.BaseEstimator, base.TransformerMixin):
    """Standard-scale the non-object columns and one-hot encode the object columns.

    The column split, the scaler and the encoder are all learned in ``fit``.
    ``transform`` returns one dense array: scaled columns first, then the
    one-hot columns.
    """

    def __init__(self):
        self.orColumns = []        # all input columns seen in fit()
        self.scColumns = []        # columns passed to the StandardScaler
        self.encColumns = []       # columns passed to the OneHotEncoder
        self.scaler = None
        self.encoder = None
        self.feature_Names = None  # output column names, in output order

    def fit(self, X, y=None):
        """Learn the column split and fit the scaler and encoder on DataFrame ``X``."""
        self.orColumns = X.columns
        # Every column whose dtype is not 'object' is scaled; the rest are encoded.
        self.scColumns = [col for col, dtype in zip(X.columns, X.dtypes) if dtype != 'object']
        self.encColumns = [col for col in X.columns if col not in self.scColumns]
        self.scaler = StandardScaler()
        # handle_unknown='ignore': categories unseen in fit encode as all zeros.
        self.encoder = OneHotEncoder(handle_unknown='ignore')
        self.scaler.fit(X[self.scColumns])
        self.encoder.fit(X[self.encColumns])
        # get_feature_names was replaced by get_feature_names_out in newer
        # scikit-learn releases; support both.
        if hasattr(self.encoder, 'get_feature_names_out'):
            encoded_names = self.encoder.get_feature_names_out(self.encColumns)
        else:
            encoded_names = self.encoder.get_feature_names(self.encColumns)
        self.feature_Names = self.scColumns + list(encoded_names)
        return self

    def transform(self, X):
        """Return the scaled and one-hot encoded columns of ``X`` side by side as one array."""
        scaled = self.scaler.transform(X[self.scColumns])
        encoded = self.encoder.transform(X[self.encColumns]).toarray()
        return np.concatenate((scaled, encoded), axis=1)

In [None]:
from sklearn import base
from sklearn.neighbors import NearestNeighbors
import numpy as np
from sklearn.metrics import roc_auc_score

class knnEstimator(base.BaseEstimator, base.RegressorMixin):
    """Per-state nearest-neighbour estimate of the probability of a bad loan.

    One neighbour index is fitted per state in ``possible_states``. A person is
    scored against a chosen state's index; the score is the fraction of that
    person's neighbours whose ``loan_condition`` is 'Bad Loan'.

    Parameters
    ----------
    knnVars : list of str, optional
        Feature columns used for the neighbour search.
    possible_states : list of str, optional
        State codes for which a model is fitted.
    """

    def __init__(self, knnVars=None, possible_states=None):
        # None instead of a mutable [] default, so instances never share one list.
        self.knnVars = [] if knnVars is None else knnVars
        self.possible_states = [] if possible_states is None else possible_states
        self.cst = ColumnSelectTransformer(self.knnVars)
        self.dKnn_ = {}  # state -> [NearestNeighbors, targets, ScalerEncoderTransformer]

    def fit(self, X, n):
        """Fit one ``n``-neighbour index per state from the loans frame ``X``.

        ``X`` must contain ``addr_state``, ``loan_condition`` and every column
        in ``knnVars``. Returns ``self`` so calls can be chained.
        """
        for s in self.possible_states:
            state_rows = X[X.addr_state == s]
            features, y = self.cst.transform(state_rows)
            cst2 = ScalerEncoderTransformer()
            encoded = cst2.fit_transform(features)
            self.dKnn_[s] = [NearestNeighbors(n_neighbors=n).fit(encoded), y, cst2]
        return self

    def pred_ind(self, Person, s):
        """Score the one-row DataFrame ``Person`` against the model of state ``s``."""
        knn, targets, encoder = self.dKnn_[s]
        features, _ = self.cst.transform(Person)
        # [0]: the single encoded row as a 1-D vector.
        encoded_row = encoder.transform(features)[0]
        _, indices = knn.kneighbors([encoded_row])
        # kneighbors returns row positions in the fitted data, so select by
        # position rather than by index label.
        neighbour_targets = targets.iloc[indices[0]]
        return (neighbour_targets == 'Bad Loan').mean()

    def predict(self, X, states):
        """Score row ``i`` of ``X`` against the model of ``states[i]``; returns a 1-D array."""
        probs = [self.pred_ind(X.iloc[[i]], states[i]) for i in range(X.shape[0])]
        return np.array(probs)

    def score(self, X, states, y):
        """ROC AUC of the predicted scores against the true 0/1 labels ``y``."""
        y_scores = self.predict(X, states)
        return roc_auc_score(y, y_scores)

In [None]:
# Same features and states as before, now wrapped in the estimator class.
knnvars = ['annual_inc','emp_length','loan_amnt','home_ownership','term', 'year']
possible_states = ['CA','TX','NY','FL','IL','NJ','PA','OH','GA',
    'VA','NC','MI','AZ','MD','MA','CO','WA','MN','IN','MO','TN','NV','CT','WI','AL','OR','SC','LA','KY','OK']

knnEst = knnEstimator(knnvars, possible_states)

In [None]:
# Fit the per-state models with a single neighbour per query.
knnEst.fit(df2, 1)

In [None]:
# First 100 Texas loans, each to be scored against the Texas model.
txPersons = df2[df2.addr_state == 'TX'].head(100)
states = ['TX']*len(txPersons)

In [None]:
# One bad-loan score per person.
y_score = knnEst.predict(txPersons, states)

In [None]:
# True outcomes encoded as 0 (good) / 1 (bad).
y = np.array(txPersons['loan_condition'].map({'Good Loan':0, 'Bad Loan':1}))

In [None]:
from sklearn.metrics import roc_curve, auc
# ROC curve of the scores against the true outcomes; 1 ('Bad Loan') is the positive class.
fpr, tpr, thresholds = roc_curve(y, y_score, pos_label=1)
roc_auc = auc(fpr, tpr)

In [None]:
import matplotlib.pyplot as plt
# Plot the ROC curve with its AUC in the legend.
plt.figure()
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal reference line (TPR == FPR).
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Refit the per-state models with 100 neighbours per query.
knnEst.fit(df2, 100)

In [None]:
# First 3000 Texas loans, each to be scored against the California model.
txPersons = df2[df2.addr_state == 'TX'].head(3000)
states = ['CA']*len(txPersons)

In [None]:
# One bad-loan score per person.
y_score = knnEst.predict(txPersons, states)

In [None]:
# True outcomes encoded as 0 (good) / 1 (bad).
y = np.array(txPersons['loan_condition'].map({'Good Loan':0, 'Bad Loan':1}))

In [None]:
from sklearn.metrics import roc_curve, auc
# ROC curve of the scores against the true outcomes; 1 ('Bad Loan') is the positive class.
fpr, tpr, thresholds = roc_curve(y, y_score, pos_label=1)
roc_auc = auc(fpr, tpr)

In [None]:
import matplotlib.pyplot as plt
# Plot the ROC curve with its AUC in the legend.
plt.figure()
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal reference line (TPR == FPR).
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

In [None]:
# ROC AUC computed by the estimator itself (it re-runs predict internally).
knnEst.score(txPersons, states, y)

### Fill the database using the models fitted with knn for each state

Fill using the pairs that we have

### Plots

Create plot from similarities, ex california

In [None]:
import pandas as pd
import numpy as np

# Per-state table used to compare states with each other.
df_state = pd.read_csv('../data/USA/State/state_data.csv', low_memory=False)

df_state.head(4)

In [None]:
from sklearn.preprocessing import StandardScaler
# Scaler for the state indicators; it is fitted in the next cell.
scaler = StandardScaler()

In [None]:
# Standardise the four indicator columns, then put the identifier columns back in
# front of them, so the indicators start at column position 3.
df_state2 = pd.DataFrame(scaler.fit_transform(df_state[['GDP_PC','gini','cost_living','hdi']]), 
                             columns=['GDP_PC','gini','cost_living','hdi'])
df_state2 = pd.concat([df_state[['N','State_Name','Code']], df_state2], axis=1)

In [None]:
# sklearn.externals.joblib no longer exists in current scikit-learn releases;
# prefer the standalone joblib package and fall back to the bundled copy on
# old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Save the fitted scaler so the demo app can reuse it.
# NOTE(review): the backslash-separated path presumably only resolves as intended on Windows; confirm.
joblib.dump(scaler, '..\\app\\demo1\\data\\state_scaler.pkl')

In [None]:
# NOTE(review): the backslash-separated path presumably only resolves as intended on Windows; confirm.
df_state2.to_pickle('..\\app\\demo1\\data\\df_state2.pkl')  #save it to model for using it

Functions for predicting similarity

In [None]:
from scipy.spatial import distance
# sklearn.externals.joblib no longer exists in current scikit-learn releases;
# prefer the standalone joblib package and fall back to the bundled copy on
# old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
import pandas as pd
import numpy as np

def GaussianKernel(v1, v2, sigma):
    """Gaussian similarity of two vectors: exp(-||v1 - v2||^2 / (2 * sigma^2)).

    Returns 1.0 for identical vectors and decays towards 0 as the Euclidean
    distance grows; ``sigma`` sets how quickly.
    """
    squared_dist = distance.euclidean(v1, v2) ** 2
    bandwidth = 2. * sigma ** 2
    return np.exp(-squared_dist / bandwidth)

def state_simil(df, x):
    """Return a copy of ``df`` with a ``sim_index`` column: similarity of each row to ``x``.

    Every row is compared through its values from column position 3 onwards,
    using ``GaussianKernel`` with sigma = 1. The input frame is not modified.
    """
    scored = df.copy()  # work on a copy so the caller's frame stays untouched
    similarities = [
        GaussianKernel(x, np.array(scored.loc[label][3:]), 1)
        for label in scored.index
    ]
    scored['sim_index'] = similarities
    return scored

# Reload the fitted scaler and the standardised state table saved earlier.
# NOTE(review): the backslash-separated paths presumably only resolve as intended on Windows; confirm.
scaler = joblib.load( '..\\app\\demo1\\data\\state_scaler.pkl')
df_state2 = pd.read_pickle('..\\app\\demo1\\data\\df_state2.pkl')

Lets compare California and see how the states compare to it:

In [None]:
# Unscaled indicator values for California; presumably ordered as GDP_PC, gini,
# cost_living, hdi (the column order the scaler was fitted on) — TODO confirm.
x_cal0 = np.array([74205,0.4899,148.53,5.40])  #Original entry

In [None]:
# reshape(1, -1) turns the vector into the single-row 2-D input the scaler
# expects; [0] takes the scaled row back out as a 1-D vector.
x_cal = scaler.transform(x_cal0.reshape(1, -1))[0]
x_cal

In [None]:
# Similarity of every state to the California vector, stored in 'sim_index'.
df_state3 = state_simil(df_state2, x_cal)

In [None]:
# plotly.plotly (the online API) cannot be imported on plotly >= 4 and is not
# needed for the offline rendering below, so its absence must not stop the cell.
try:
    import plotly.plotly as py
except ImportError:
    py = None
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

# Colour scale from light (low similarity) to dark blue (high similarity).
scl = [[0.0, 'rgb(242,240,247)'], [0.2, 'rgb(0,191,255)'], [0.4, 'rgb(30,144,255)'],
       [0.6, 'rgb(0,0,255)'], [0.8, 'rgb(0,0,205)'], [1.0, 'rgb(0,0,139)']]

data = [dict(
    type='choropleth',
    colorscale=scl,
    autocolorscale=False,
    locations=df_state3['Code'].astype(str),
    # Pass numbers, not strings, so the colour scale is applied to real values.
    z=df_state3['sim_index'].astype(float),
    locationmode='USA-states',
    marker=dict(
        line=dict(
            color='black',
            width=1.5
        )),
    colorbar=dict(
        title="%")
)]

layout = dict(
    title='State Similarity<br>(Hover for breakdown)',
    geo=dict(
        scope='usa',
        projection=dict(type='albers usa'),
        showlakes=True,
        lakecolor='rgb(255, 255, 255)')
)

fig = dict(data=data, layout=layout)
iplot(fig, filename='d3-cloropleth-map')

Lets do it for Texas

In [None]:
# Already-scaled indicator values (column position 3 onwards) of the row labelled 43.
# NOTE(review): row label 43 is presumably Texas — confirm against df_state2.
x_tx = np.array(df_state2.loc[43][3:])
x_tx

In [None]:
# Similarity of every state to the x_tx vector, stored in 'sim_index'.
df_state3 = state_simil(df_state2, x_tx)

In [None]:
# plotly.plotly (the online API) cannot be imported on plotly >= 4 and is not
# needed for the offline rendering below, so its absence must not stop the cell.
try:
    import plotly.plotly as py
except ImportError:
    py = None
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

# Colour scale from light (low similarity) to dark blue (high similarity).
scl = [[0.0, 'rgb(242,240,247)'], [0.2, 'rgb(0,191,255)'], [0.4, 'rgb(30,144,255)'],
       [0.6, 'rgb(0,0,255)'], [0.8, 'rgb(0,0,205)'], [1.0, 'rgb(0,0,139)']]

data = [dict(
    type='choropleth',
    colorscale=scl,
    autocolorscale=False,
    locations=df_state3['Code'].astype(str),
    # Pass numbers, not strings, so the colour scale is applied to real values.
    z=df_state3['sim_index'].astype(float),
    locationmode='USA-states',
    marker=dict(
        line=dict(
            color='black',
            width=1.5
        )),
    colorbar=dict(
        title="%")
)]

layout = dict(
    title='State Similarity<br>(Hover for breakdown)',
    geo=dict(
        scope='usa',
        projection=dict(type='albers usa'),
        showlakes=True,
        lakecolor='rgb(255, 255, 255)')
)

fig = dict(data=data, layout=layout)
iplot(fig, filename='d3-cloropleth-map')