In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
train = pd.read_csv(r'train.csv')
test = pd.read_csv(r'test.csv')

In [3]:
train['m13'].value_counts()

0    115422
1       636
Name: m13, dtype: int64

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116058 entries, 0 to 116057
Data columns (total 29 columns):
loan_id                     116058 non-null int64
source                      116058 non-null object
financial_institution       116058 non-null object
interest_rate               116058 non-null float64
unpaid_principal_bal        116058 non-null int64
loan_term                   116058 non-null int64
origination_date            116058 non-null object
first_payment_date          116058 non-null object
loan_to_value               116058 non-null int64
number_of_borrowers         116058 non-null float64
debt_to_income_ratio        116058 non-null float64
borrower_credit_score       116058 non-null float64
loan_purpose                116058 non-null object
insurance_percent           116058 non-null float64
co-borrower_credit_score    116058 non-null float64
insurance_type              116058 non-null float64
m1                          116058 non-null int64
m2                  

In [5]:
train['source'].value_counts()

X    63858
Y    37554
Z    14646
Name: source, dtype: int64

In [6]:
train['financial_institution'].value_counts()

OTHER                          49699
Browning-Hart                  31852
Swanson, Newton and Miller      6874
Edwards-Hoffman                 4857
Martinez, Duffy and Bird        4715
Miller, Mcclure and Allen       3158
Nicholson Group                 2116
Turner, Baldwin and Rhodes      1846
Suarez Inc                      1790
Cole, Brooks and Vincent        1642
Richards-Walters                1459
Taylor, Hunt and Rodriguez      1259
Sanchez-Robinson                1193
Sanchez, Hays and Wilkerson      853
Romero, Woods and Johnson        750
Thornton-Davis                   651
Anderson-Taylor                  483
Richardson Ltd                   473
Chapman-Mcmahon                  388
Name: financial_institution, dtype: int64

In [7]:
def financial_institution_process(val):
    if(val != 'OTHER' and val != 'Browning-Hart'):
        return 'OTHER-1'
    return val

In [8]:
train['origination_date'].value_counts()

2012-02-01    52334
2012-01-01    49093
2012-03-01    14631
Name: origination_date, dtype: int64

In [9]:
train['first_payment_date'].value_counts()

04/2012    52840
03/2012    47680
05/2012    15014
02/2012      524
Name: first_payment_date, dtype: int64

In [10]:
train['loan_purpose'].value_counts()

A23    58462
B12    29383
C86    28213
Name: loan_purpose, dtype: int64

In [11]:
from sklearn.metrics import classification_report, confusion_matrix, f1_score
def report(actual, predicted):
    print(f1_score(actual, pred))
    print('\n')
    print(confusion_matrix(actual, pred))
    print('\n')
    print(classification_report(actual, pred))

In [12]:
from sklearn.preprocessing import normalize,scale
def fun_preprocess(train):
    #train.drop('co-borrower_credit_score', axis =1)
    source = pd.get_dummies(train['source'], drop_first = True)
    financial_institution = train['financial_institution'].apply(financial_institution_process)
    financial_institution = pd.get_dummies(financial_institution, drop_first = True)
    origination_date = pd.get_dummies(train['origination_date'], drop_first = True)
    train[train['first_payment_date'] == '02/2012']['first_payment_date'] = '03/2012'
    first_payment_date = pd.get_dummies(train['first_payment_date'], drop_first = True)
    loan_purpose = pd.get_dummies(train['loan_purpose'], drop_first = True)
    data = pd.concat([train.drop({'source', 'financial_institution', 'origination_date', 'first_payment_date', 'loan_purpose'}, axis =1)
                      , source, financial_institution, origination_date, first_payment_date, loan_purpose], axis = 1)
    #data['unpaid_principal_bal'] = np.log(data['unpaid_principal_bal'])
    #data['loan_term'] = np.log(data['loan_term'])
    #data['loan_to_value'] = np.log(data['loan_to_value'])
    #data['number_of_borrowers'] = np.log(data['number_of_borrowers'])
    #data['borrower_credit_score'] = np.log(data['borrower_credit_score'])
    #data['insurance_percent'] = np.log(data['insurance_percent'])
    #data['co-borrower_credit_score'] = np.log(data['co-borrower_credit_score'])
    #data['insurance_type'] = np.log(data['insurance_type'])
    
    return data

In [13]:
train_1 = fun_preprocess(train)
test_1 = fun_preprocess(test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [14]:
train_1.columns

Index(['loan_id', 'interest_rate', 'unpaid_principal_bal', 'loan_term',
       'loan_to_value', 'number_of_borrowers', 'debt_to_income_ratio',
       'borrower_credit_score', 'insurance_percent',
       'co-borrower_credit_score', 'insurance_type', 'm1', 'm2', 'm3', 'm4',
       'm5', 'm6', 'm7', 'm8', 'm9', 'm10', 'm11', 'm12', 'm13', 'Y', 'Z',
       'OTHER', 'OTHER-1', '2012-02-01', '2012-03-01', '03/2012', '04/2012',
       '05/2012', 'B12', 'C86'],
      dtype='object')

In [15]:
train_1.skew()

loan_id                      0.004114
interest_rate                0.051326
unpaid_principal_bal         0.897524
loan_term                   -0.692948
loan_to_value               -0.765251
number_of_borrowers         -0.379397
debt_to_income_ratio        -0.194303
borrower_credit_score       -3.570945
insurance_percent            2.743691
co-borrower_credit_score    -0.362473
insurance_type              17.436825
m1                          23.399482
m2                          32.194304
m3                          41.842040
m4                          45.131291
m5                          38.012865
m6                          41.184750
m7                          41.899434
m8                          40.627494
m9                          41.195572
m10                         39.421632
m11                         39.150505
m12                         38.346412
m13                         13.397436
Y                            0.754198
Z                            2.251391
OTHER       

In [16]:
train_1.drop('loan_id', axis = 1, inplace = True)

In [17]:
train_1.shape

(116058, 34)

In [18]:
from sklearn.model_selection import train_test_split
train_x, test_x, train_y, test_y = train_test_split(train_1.drop('m13', axis =1), train_1['m13'], test_size = 0.3)

from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=2)
X_train_res, y_train_res = sm.fit_sample(train_x, train_y)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [19]:
X_train_res_df = pd.DataFrame(X_train_res, columns = [train_1.drop('m13', axis = 1).columns])

In [20]:
y_train_res_df = pd.DataFrame(y_train_res, columns = ['m13'])

In [21]:
train_2 = pd.concat([X_train_res_df, y_train_res_df], axis = 1)

In [22]:
train_2['m13'].value_counts()

1    80796
0    80796
Name: m13, dtype: int64

In [23]:
train_x, test_x, train_y, test_y = train_test_split(train_2.drop('m13', axis =1), train_2['m13'], test_size = 0.3, shuffle = True)

In [24]:
from sklearn.linear_model import LogisticRegression

In [25]:
parameters = {'C': np.linspace(1, 10, 10)}

In [26]:
from sklearn.model_selection import GridSearchCV
lr = LogisticRegression()
grid = GridSearchCV(lr, parameters, cv = 5, verbose=5, n_jobs=6)
grid.fit(train_x, train_y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   6 tasks      | elapsed:   17.6s
[Parallel(n_jobs=6)]: Done  50 out of  50 | elapsed:  1.1min remaining:    0.0s
[Parallel(n_jobs=6)]: Done  50 out of  50 | elapsed:  1.1min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=6,
             param_grid={'C': array([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=5)

In [27]:
lr = LogisticRegression(C = 1.0, penalty='l2', verbose=0)

In [28]:
lr.fit(train_x, train_y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [29]:
pred = lr.predict(test_x)

In [30]:
report(test_y, pred)

0.8410104668109349


[[22235  2060]
 [ 5140 19043]]


              precision    recall  f1-score   support

           0       0.81      0.92      0.86     24295
           1       0.90      0.79      0.84     24183

    accuracy                           0.85     48478
   macro avg       0.86      0.85      0.85     48478
weighted avg       0.86      0.85      0.85     48478



In [39]:
submission_lr = pd.DataFrame()
submission_lr['loan_id'] = test['loan_id']

In [40]:
submission_lr['m13'] = lr.predict(test_1)

In [41]:
submission_lr.to_csv('submission_lr.csv', index = False)