
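This notebook imports two additional scikit-learn components: `LogisticRegression` (from `sklearn.linear_model`) and `train_test_split` (from `sklearn.model_selection`).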

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [3]:
# Load the vehicle-loan features CSV, using the UNIQUEID column as the row index.
# NOTE(review): this is an absolute, machine-specific path; it must be adjusted to run elsewhere.
loan_df = pd.read_csv(
    'D:/peronal/cfi/Student_Downloads/data/vehicle_loans_feat.csv',
    index_col='UNIQUEID',
)

In [4]:
# Summarise every column of the loaded frame: non-null count and dtype
loan_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 233154 entries, 420825 to 630213
Data columns (total 31 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   DISBURSED_AMOUNT                     233154 non-null  float64
 1   ASSET_COST                           233154 non-null  float64
 2   LTV                                  233154 non-null  float64
 3   MANUFACTURER_ID                      233154 non-null  int64  
 4   EMPLOYMENT_TYPE                      233154 non-null  object 
 5   STATE_ID                             233154 non-null  int64  
 6   AADHAR_FLAG                          233154 non-null  int64  
 7   PAN_FLAG                             233154 non-null  int64  
 8   VOTERID_FLAG                         233154 non-null  int64  
 9   DRIVING_FLAG                         233154 non-null  int64  
 10  PASSPORT_FLAG                        233154 non-null  int64  
 11  PERFORM_

In [5]:
# Inspect the current dtypes of the fields that are categorical in nature
category_cols = [
    'MANUFACTURER_ID',
    'STATE_ID',
    'DISBURSAL_MONTH',
    'DISBURSED_CAT',
    'PERFORM_CNS_SCORE_DESCRIPTION',
    'EMPLOYMENT_TYPE',
]
loan_df.dtypes.loc[category_cols]

MANUFACTURER_ID                   int64
STATE_ID                          int64
DISBURSAL_MONTH                   int64
DISBURSED_CAT                    object
PERFORM_CNS_SCORE_DESCRIPTION    object
EMPLOYMENT_TYPE                  object
dtype: object



- We can encode our categorical columns with the [category](https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html) data type

In [6]:
# Cast the categorical fields to pandas' 'category' dtype, then display the dtypes to confirm.
# Note: this overwrites the listed columns of loan_df in place.
loan_df[category_cols] = loan_df[category_cols].astype('category')
loan_df[category_cols].dtypes


MANUFACTURER_ID                  category
STATE_ID                         category
DISBURSAL_MONTH                  category
DISBURSED_CAT                    category
PERFORM_CNS_SCORE_DESCRIPTION    category
EMPLOYMENT_TYPE                  category
dtype: object



- To keep the first model simple, select the following six columns: 'STATE_ID', 'LTV', 'DISBURSED_CAT', 'PERFORM_CNS_SCORE', 'DISBURSAL_MONTH' and 'LOAN_DEFAULT' (five features plus the `LOAN_DEFAULT` target).


In [8]:
# Narrow the data to a small working set: five features plus the LOAN_DEFAULT target
small_cols = [
    'STATE_ID',
    'LTV',
    'DISBURSED_CAT',
    'PERFORM_CNS_SCORE',
    'DISBURSAL_MONTH',
    'LOAN_DEFAULT',
]

loan_df_sml = loan_df[small_cols]

In [9]:
# Confirm the dimensions of the subset as (rows, columns)
loan_df_sml.shape

(233154, 6)

We still have 233154 rows but now there are only 6 columns

In [10]:
# Check that the selected columns kept their dtypes (category / float / int)
loan_df_sml.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 233154 entries, 420825 to 630213
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype   
---  ------             --------------   -----   
 0   STATE_ID           233154 non-null  category
 1   LTV                233154 non-null  float64 
 2   DISBURSED_CAT      233154 non-null  category
 3   PERFORM_CNS_SCORE  233154 non-null  float64 
 4   DISBURSAL_MONTH    233154 non-null  category
 5   LOAN_DEFAULT       233154 non-null  int64   
dtypes: category(3), float64(2), int64(1)
memory usage: 7.8 MB


In [11]:
# Separate the target (y) from the predictor columns (x)
y = loan_df_sml['LOAN_DEFAULT']
x = loan_df_sml.drop(columns=['LOAN_DEFAULT'])

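We should check the dimensions of x and y to make sure the feature/target split above is correct: x should keep every row and have one column fewer (no `LOAN_DEFAULT`), and y should have the same number of rows as x.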

In [12]:
# Report the dimensions of x and y so the feature/target split can be checked.
# len(y) is used for the row count: Series.count() skips missing values and
# would under-report the number of rows if y ever contained NaN.
print("x has {0} rows and {1} columns".format(x.shape[0], x.shape[1]))
print("y has {0} rows".format(len(y)))

x has 233154 rows and 5 columns
y has 233154 rows


In [13]:
# Inspect the feature frame: column dtypes and non-null counts
x.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 233154 entries, 420825 to 630213
Data columns (total 5 columns):
 #   Column             Non-Null Count   Dtype   
---  ------             --------------   -----   
 0   STATE_ID           233154 non-null  category
 1   LTV                233154 non-null  float64 
 2   DISBURSED_CAT      233154 non-null  category
 3   PERFORM_CNS_SCORE  233154 non-null  float64 
 4   DISBURSAL_MONTH    233154 non-null  category
dtypes: category(3), float64(2)
memory usage: 6.0 MB


In [23]:
# Check the dtype of the target series
y.dtype

dtype('int64')

In [14]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the split is reproducible
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [15]:
# Report the dimensions of each partition produced by the split.
# len() is used for the target row counts: Series.count() skips missing values
# and would under-report the number of rows if a target ever contained NaN.
print("x_train has {0} rows and {1} columns".format(x_train.shape[0], x_train.shape[1]))
print("x_test has {0} rows and {1} columns".format(x_test.shape[0], x_test.shape[1]))
print("y_train has {0} rows".format(len(y_train)))
print("y_test has {0} rows".format(len(y_test)))

x_train has 186523 rows and 5 columns
x_test has 46631 rows and 5 columns
y_train has 186523 rows
y_test has 46631 rows


In [16]:
# Inspect the training features: column dtypes and non-null counts
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 186523 entries, 633275 to 501520
Data columns (total 5 columns):
 #   Column             Non-Null Count   Dtype   
---  ------             --------------   -----   
 0   STATE_ID           186523 non-null  category
 1   LTV                186523 non-null  float64 
 2   DISBURSED_CAT      186523 non-null  category
 3   PERFORM_CNS_SCORE  186523 non-null  float64 
 4   DISBURSAL_MONTH    186523 non-null  category
dtypes: category(3), float64(2)
memory usage: 4.8 MB


In [17]:
# Peek at the first few training targets (indexed by UNIQUEID)
y_train.head()

UNIQUEID
633275    1
646002    0
591252    0
475736    0
639478    0
Name: LOAN_DEFAULT, dtype: int64

In [18]:
# Inspect the test features: column dtypes and non-null counts
x_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 46631 entries, 617183 to 626383
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   STATE_ID           46631 non-null  category
 1   LTV                46631 non-null  float64 
 2   DISBURSED_CAT      46631 non-null  category
 3   PERFORM_CNS_SCORE  46631 non-null  float64 
 4   DISBURSAL_MONTH    46631 non-null  category
dtypes: category(3), float64(2)
memory usage: 1.2 MB


In [19]:
# Peek at the first few test targets (indexed by UNIQUEID)
y_test.head()

UNIQUEID
617183    1
515702    0
466872    0
632384    0
461426    0
Name: LOAN_DEFAULT, dtype: int64

In [20]:
# Class balance of the training target, expressed as proportions (normalize=True)
y_train.value_counts(normalize=True)

0    0.783099
1    0.216901
Name: LOAN_DEFAULT, dtype: float64

In [21]:
# Class balance of the test target, expressed as proportions (normalize=True)
y_test.value_counts(normalize=True)

0    0.782248
1    0.217752
Name: LOAN_DEFAULT, dtype: float64

In [22]:
# Instantiate a logistic regression classifier with scikit-learn's default settings
logistic_model = LogisticRegression()

In [36]:
# Attempt to fit on the un-encoded training data to show why encoding is needed.
# DISBURSED_CAT still holds strings such as '60k - 75k', which the estimator
# cannot convert to floats, so fit() raises a ValueError. The error is caught
# and displayed so the notebook can still be run from top to bottom.
try:
    logistic_model.fit(x_train, y_train)
except ValueError as err:
    print("ValueError: {0}".format(err))

ValueError: could not convert string to float: '60k - 75k'

In [23]:
# One-hot encode the categorical columns: the regression model cannot handle
# strings, so each category level becomes its own 0/1 indicator column.
# drop_first=True omits the first level of each categorical column.
loan_data_dumm = pd.get_dummies(
    loan_df_sml,
    drop_first=True,
    prefix_sep='_',
)

In [25]:
# Inspect the encoded frame: the categorical columns are now expanded into indicator columns
loan_data_dumm.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 233154 entries, 420825 to 630213
Data columns (total 40 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   LTV                       233154 non-null  float64
 1   PERFORM_CNS_SCORE         233154 non-null  float64
 2   LOAN_DEFAULT              233154 non-null  int64  
 3   STATE_ID_2                233154 non-null  uint8  
 4   STATE_ID_3                233154 non-null  uint8  
 5   STATE_ID_4                233154 non-null  uint8  
 6   STATE_ID_5                233154 non-null  uint8  
 7   STATE_ID_6                233154 non-null  uint8  
 8   STATE_ID_7                233154 non-null  uint8  
 9   STATE_ID_8                233154 non-null  uint8  
 10  STATE_ID_9                233154 non-null  uint8  
 11  STATE_ID_10               233154 non-null  uint8  
 12  STATE_ID_11               233154 non-null  uint8  
 13  STATE_ID_12               233154 non-nu

In [26]:
# Spot-check two of the generated indicator columns: absolute counts first,
# then the same counts as proportions.
for dummy_col in ('STATE_ID_13', 'DISBURSAL_MONTH_10'):
    indicator = loan_data_dumm[dummy_col]
    print(indicator.value_counts())
    print(indicator.value_counts(normalize=True))

0    215270
1     17884
Name: STATE_ID_13, dtype: int64
0    0.923295
1    0.076705
Name: STATE_ID_13, dtype: float64
0    148279
1     84875
Name: DISBURSAL_MONTH_10, dtype: int64
0    0.63597
1    0.36403
Name: DISBURSAL_MONTH_10, dtype: float64


In [27]:
# Rebuild the features/target from the encoded frame and split again.
# NOTE(review): this split holds out 30% of the rows, whereas the earlier
# split on the un-encoded data held out 20% — confirm that is intended.
y = loan_data_dumm['LOAN_DEFAULT']
x = loan_data_dumm.drop(columns=['LOAN_DEFAULT'])

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)

# Show the class proportions of the training target, then of the test target
for target_part in (y_train, y_test):
    print(target_part.value_counts(normalize=True))

0    0.782975
1    0.217025
Name: LOAN_DEFAULT, dtype: float64
0    0.782821
1    0.217179
Name: LOAN_DEFAULT, dtype: float64


In [28]:
# Fit the model on the one-hot encoded training data.
# With the default iteration limit the solver stopped before converging (see
# the recorded warning), so the limit is raised via max_iter.
# NOTE(review): if the convergence warning persists, scale the numeric
# features as the warning suggests.
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [32]:
# Predict a class label for every row of the test features and display the resulting array
preds = logistic_model.predict(x_test)
preds

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [33]:
# Mean accuracy of the fitted model on the held-out test set.
# NOTE(review): the recorded value (0.7828) equals the share of class 0 in
# y_test, and the visible predictions are all 0 — the model may simply be
# predicting the majority class; verify before relying on this score.
logistic_model.score(x_test, y_test)

0.7828212789683617