# Predicting Loan Outcomes: Baseline Models
## CS109A
### Team members: Maryam Alireza, Hector Cordero, Regina Legaretta, Pranav Sidhwani

### Loading libraries

In [10]:
import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split

## Establish the Baseline Models 

What are the baseline models in this case? We can check off three basic models: 

1. a model that labels everything 1
2. a model that labels everything 0
3. a model that randomly guesses a label, 1 or 0
4. logistic regression model following Serrano-Cinca et al. (2015)


Before implementing anything fancy, we implement baseline models (1)-(3) and see how they do.

**Note:** Again, think about accuracy in a **meaningful** way.

Then we add Serrano-Cinca et al (2015)'s logistic regression model (4).

In their classificationn, the dependent variable is binary (e.g., default or non-default) and they include following explanatory variables:

- Borrower Assessment
    - Subgrade
    - Interest Rate
- Purpose
    - Car 
    - Credit Card 
    - Debt Consolidation
    - Educational 
    - Home Improvement
    - House
    - Major Purchase 
    - Medical
    - Moving 
    - Small Business 
    - Other
- Borrower Characteristics
    - Housing Situation: Own
    - Housing Situation: Mortgage
    - Housing Situation: Rent
    - Housing Situation: Other
    - Annual Income 
- Credit history
    - Inquiries Last 6 Months
    - Delinquency 2 Years
    - Public Records
    - Revolving
    - Utilization    
- Indebtedness
    - Loan Amount to Annual Income
    - Annual Instalment to Income*
    
*Not included (couldn't find it).

In [2]:
#Function for computing the accuracy a given model on the entire test set, the accuracy on class 0 in the test set
#and the accuracy on class 1
score = lambda model, x_test, y_test: pd.Series([model.score(x_test, y_test), 
                                                 model.score(x_test[y_test==0], y_test[y_test==0]),
                                                 model.score(x_test[y_test==1], y_test[y_test==1])],
                                                index=['overall accuracy', 'accuracy on class 0', 'accuracy on class 1'])

In [3]:
#A model that labels everything 1
class Pos_model(object):
    def predict(self, x):
        return np.array([1] * len(x))
    def score(self, x, y):
        y_pred = self.predict(x)
        y_err = y - y_pred
        return len(y_err[y_err == 0]) * 1. / len(y_err)
    
#A model that labels everything 0
class Neg_model(object):
    def predict(self, x):
        return np.array([0] * len(x))
    
    def score(self, x, y):
        y_pred = self.predict(x)
        y_err = y - y_pred
        return len(y_err[y_err == 0]) * 1. / len(y_err)

#A model that randomly labels things
class Random_model(object):
    def predict(self, x):
        return np.random.randint(0, 2, len(x))
    
    def score(self, x, y):
        y_pred = self.predict(x)
        y_err = y - y_pred
        return len(y_err[y_err == 0]) * 1. / len(y_err)

In [11]:
# Reading the data as a pandas dataframe
df = pd.read_csv('Data/loan.csv', low_memory=False)

# Looking at the first 5 observations
df.head(5)

IOError: File Data/loan.csv does not exist

In [5]:
# Keeping only inactive loans
df = df[df['loan_status'].isin(['Fully Paid', 'Charged Off', 'Default'])]

# Explanatory variables used in the logistic regression model used in Serrano-Cinca et al (2015)
df = df[['int_rate', 'sub_grade', 'purpose', 'home_ownership', 'annual_inc', 'inq_last_6mths', 'delinq_2yrs', 'pub_rec', 'revol_bal', 'revol_util', 'dti', 'loan_status']]

sub_grade_dummies = pd.get_dummies(df['sub_grade']) 
df = df.join(sub_grade_dummies)
df = df.drop('sub_grade', axis=1)

purpose_dummies = pd.get_dummies(df['purpose']) 
df = df.join(purpose_dummies)
df= df.drop('purpose', axis=1)

home_ownership_dummies = pd.get_dummies(df['home_ownership']) 
df = df.join(home_ownership_dummies)
df = df.drop('home_ownership', axis=1)


# Creating new variable for default
df['default'] = df['loan_status'].isin(['Charged Off', 'Default'])
df.default = df.default.astype(int)

df = df.drop('loan_status', axis=1)
df.columns[pd.isnull(df).sum() > 0].tolist()

df = df.drop('revol_util', axis=1)
#We are droping "revol_util" for now because it has missing values, but we should add it in the final version.

df.head(5)
print df.columns

NameError: name 'df' is not defined

In [6]:
# Dividing the data into train and test
# Get numpy array from data
x = df.values[:, :-1]
y = df.values[:, -1]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.6, random_state=42)

NameError: name 'df' is not defined

In [7]:
pos_model = Pos_model()
pos_model_scores = score(pos_model, x_test, y_test)

neg_model = Neg_model()
neg_model_scores = score(neg_model, x_test, y_test)

random_model = Random_model()
random_model_scores = score(random_model, x_test, y_test)

logit_model = LogisticRegression()
logit_model.fit(x_train, y_train)
logit_model_scores = score(logit_model, x_test, y_test)

NameError: name 'x_test' is not defined

In [8]:
#New Score Dataframe
score_df = pd.DataFrame({'pos model': pos_model_scores,
                         'neg model': neg_model_scores,
                         'random model': random_model_scores
                         #'logit model': logit_model_scores})
score_df

SyntaxError: invalid syntax (<ipython-input-8-b27bc0b829b3>, line 6)