## Capstone 2: Feature engineering 

### Loading in the data

In [1]:
import pandas as pd
import numpy as np
import datetime as dt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the cleaned SBA loan table from the processed-data folder,
# then list its columns.
cleaned_loans_csv = "../Data/Processed/sba_cleaned.csv"
df = pd.read_csv(cleaned_loans_csv)
df.columns

Index(['Unnamed: 0', 'ID', 'Name', 'City', 'State', 'Zip', 'Bank', 'BankState',
       'NAICS', 'ApprovalDate', 'ApprovalFY', 'Term', 'NoEmp', 'NewExist',
       'CreateJob', 'RetainedJob', 'FranchiseCode', 'UrbanRural', 'LowDoc',
       'ChgOffDate', 'DisbursementDate', 'DisbursementGross', 'BalanceGross',
       'MIS_Status', 'ChgOffPrinGr', 'GrAppv', 'SBA_Appv'],
      dtype='object')

In [3]:
## Columns carried forward into the model (MIS_Status is the label).
## High-cardinality columns, for example city and zip code, are
## left out for now; a later iteration could bring them back
## using an encoding such as feature hashing.
features = [
    'State',
    'NAICS',
    'ApprovalFY',
    'Term',
    'NoEmp',
    'NewExist',
    'FranchiseCode',
    'LowDoc',
    'MIS_Status',
    'GrAppv',
    'SBA_Appv',
]

In [4]:
# Restrict the frame to the chosen columns, in the order listed in `features`.
df = df.loc[:, features]

In [5]:
# Preview the first rows of the reduced frame.
df.head()

Unnamed: 0,State,NAICS,ApprovalFY,Term,NoEmp,NewExist,FranchiseCode,LowDoc,MIS_Status,GrAppv,SBA_Appv
0,IN,451120.0,1997-01-01,84,4,2.0,0,Y,P I F,60000.0,48000.0
1,IN,722410.0,1997-01-01,60,2,2.0,0,Y,P I F,40000.0,32000.0
2,IN,621210.0,1997-01-01,180,7,1.0,0,N,P I F,287000.0,215250.0
3,OK,,1997-01-01,60,2,1.0,0,Y,P I F,35000.0,28000.0
4,FL,,1997-01-01,240,14,1.0,0,N,P I F,229000.0,229000.0


In [6]:
# Show dtypes and non-null counts to see which columns contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 898080 entries, 0 to 898079
Data columns (total 11 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   State          898067 non-null  object 
 1   NAICS          697214 non-null  float64
 2   ApprovalFY     898080 non-null  object 
 3   Term           898080 non-null  int64  
 4   NoEmp          898080 non-null  int64  
 5   NewExist       896920 non-null  float64
 6   FranchiseCode  898080 non-null  int64  
 7   LowDoc         892073 non-null  object 
 8   MIS_Status     896106 non-null  object 
 9   GrAppv         898080 non-null  float64
 10  SBA_Appv       898080 non-null  float64
dtypes: float64(4), int64(3), object(4)
memory usage: 75.4+ MB


### Grouping categorical variables

#### Filtering Dates

We save the year from ApprovalFY as an integer, and filter out any data from before 1990 or after 2010, as decided during the EDA step.

In [7]:
def _leading_year(date_string):
    """Return the first four characters of a date string as an integer year."""
    return int(date_string[:4])


# ApprovalFY is stored as text such as '1997-01-01'; keep only the year.
df['ApprovalFY'] = df['ApprovalFY'].apply(_leading_year)

In [8]:
# Keep only loans approved in fiscal years 1990 through 2010 (both inclusive).
approved_in_window = (df['ApprovalFY'] >= 1990) & (df['ApprovalFY'] <= 2010)
df = df.loc[approved_in_window]

#### Franchise vs non-franchise businesses

We found during EDA that franchise businesses are slightly less likely to default on loans. We create a column to record whether a business is part of a franchise, and drop the high-cardinality FranchiseCode column.

In [9]:
# In the SBA loan data dictionary, FranchiseCode values 0 and 1 BOTH mean
# "no franchise"; any other value identifies a specific franchise.
# Testing only `!= 0` would flag every code-1 loan as a franchise.
df['is_franchise'] = ~df['FranchiseCode'].isin([0, 1])
# The raw code is high-cardinality, so drop it once the flag exists.
df = df.drop('FranchiseCode', axis=1)

#### Grouping industries

There are lots of industry classification codes, but the first two digits give the general industry. We store those two digits in a new `industry` column and then drop the original NAICS column.

In [10]:
def _sector_prefix(naics_code):
    """Return the first two characters of a NAICS code's string form.

    Missing values are passed through unchanged.
    """
    if pd.notna(naics_code):
        return str(naics_code)[:2]
    return naics_code


# Derive the two-character industry group, then discard the full code.
df['industry'] = df['NAICS'].apply(_sector_prefix)
df = df.drop(columns='NAICS')
df.head()

Unnamed: 0,State,ApprovalFY,Term,NoEmp,NewExist,LowDoc,MIS_Status,GrAppv,SBA_Appv,is_franchise,industry
0,IN,1997,84,4,2.0,Y,P I F,60000.0,48000.0,False,45.0
1,IN,1997,60,2,2.0,Y,P I F,40000.0,32000.0,False,72.0
2,IN,1997,180,7,1.0,N,P I F,287000.0,215250.0,False,62.0
3,OK,1997,60,2,1.0,Y,P I F,35000.0,28000.0,False,
4,FL,1997,240,14,1.0,N,P I F,229000.0,229000.0,False,


#### Grouping states into regions

We now group states into major regions of the U.S. We will keep the `State` column for now, as its cardinality is high but not overwhelming.

In [11]:
# Major U.S. regions and the two-letter state codes assigned to each.
# NOTE: the key 'planes' (presumably meant as 'plains') is kept exactly as
# spelled, because these keys become the region labels stored in the data.
region_dict = {
    'new_england': ['CT', 'ME', 'MA', 'NH', 'RI', 'VT'],
    'mid_east': ['DE', 'DC', 'MD', 'NJ', 'NY', 'PA'],
    'great_lakes': ['IL', 'IN', 'MI', 'OH', 'WI'],
    'planes': ['IA', 'KS', 'MN', 'MO', 'NE', 'ND', 'SD'],
    'southeast': [
        'AL', 'AR', 'FL', 'GA', 'KY', 'LA',
        'MS', 'NC', 'SC', 'TN', 'VA', 'WV',
    ],
    'southwest': ['AZ', 'NM', 'OK', 'TX'],
    'rocky_mountain': ['CO', 'ID', 'MT', 'UT', 'WY'],
    'far_west': ['AK', 'CA', 'HI', 'NV', 'OR', 'WA'],
}

# Invert the mapping: one entry per state code, pointing at its region.
region_by_state = {
    state_code: region_name
    for region_name, state_codes in region_dict.items()
    for state_code in state_codes
}

In [12]:
# Look up each state's region.  `map` is used rather than `replace` so that a
# state code missing from `region_by_state` becomes NaN instead of being
# copied into `region` as if it were a region name; it is also much faster
# than a dict-based `replace` on a large column.
df['region'] = df['State'].map(region_by_state)

In [13]:
# Preview the frame with the new `region` column.
df.head()

Unnamed: 0,State,ApprovalFY,Term,NoEmp,NewExist,LowDoc,MIS_Status,GrAppv,SBA_Appv,is_franchise,industry,region
0,IN,1997,84,4,2.0,Y,P I F,60000.0,48000.0,False,45.0,great_lakes
1,IN,1997,60,2,2.0,Y,P I F,40000.0,32000.0,False,72.0,great_lakes
2,IN,1997,180,7,1.0,N,P I F,287000.0,215250.0,False,62.0,great_lakes
3,OK,1997,60,2,1.0,Y,P I F,35000.0,28000.0,False,,southwest
4,FL,1997,240,14,1.0,N,P I F,229000.0,229000.0,False,,southeast


### Dummy encoding for categorical variables

Let's first check that our categorical columns have an appropriate datatype. Those that don't are cast to type `object`.

In [14]:
# List column dtypes to spot categorical columns stored as numbers or booleans.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 849867 entries, 0 to 898079
Data columns (total 12 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   State         849859 non-null  object 
 1   ApprovalFY    849867 non-null  int64  
 2   Term          849867 non-null  int64  
 3   NoEmp         849867 non-null  int64  
 4   NewExist      848799 non-null  float64
 5   LowDoc        844535 non-null  object 
 6   MIS_Status    847977 non-null  object 
 7   GrAppv        849867 non-null  float64
 8   SBA_Appv      849867 non-null  float64
 9   is_franchise  849867 non-null  bool   
 10  industry      668464 non-null  object 
 11  region        849859 non-null  object 
dtypes: bool(1), float64(3), int64(3), object(5)
memory usage: 78.6+ MB


In [15]:
## NewExist (stored as float) and is_franchise (stored as bool) are
## categorical, so store them as `object` before dummy encoding.
obj_cols = ['NewExist', 'is_franchise']
for categorical_col in obj_cols:
    df[categorical_col] = df[categorical_col].astype('object')

In [16]:
# Rows with no recorded MIS_Status have no label.  get_dummies would encode
# them as 0 in both MIS_Status_* columns, which silently turns them into
# "not charged off" examples once MIS_Status_CHGOFF is used as the target.
# Exclude them before encoding.
labelled_loans = df[df['MIS_Status'].notna()]
df_dummies = pd.get_dummies(labelled_loans)

### Standardizing columns

We scale each column of the feature matrix to zero mean and unit variance using `StandardScaler`.

In [17]:
from sklearn.preprocessing import StandardScaler

In [18]:
# Scaler used to standardize the feature matrix; it is fitted in a later cell.
scaler = StandardScaler()

In [19]:
# Both MIS_Status dummies carry the label, so neither may remain in X.
# The target is the charged-off indicator.
target_columns = ['MIS_Status_CHGOFF', 'MIS_Status_P I F']
y = df_dummies['MIS_Status_CHGOFF']
X = df_dummies.drop(columns=target_columns)

In [20]:
# Keep the feature column labels so the scaled values can be wrapped back
# into a labelled DataFrame afterwards.
names = X.columns

In [21]:
# Fit the scaler on X and transform it in one step.
# NOTE(review): this runs before the train/test split, so rows that later end
# up in the test set contribute to the scaling statistics. Consider fitting
# on the training split only.
X_scaled = scaler.fit_transform(X)

In [22]:
# Wrap the scaled values back into a DataFrame.  Reuse X's index so that
# scaled_df stays label-aligned with y; without it the frame would get a
# fresh 0..n-1 index while y keeps the filtered original row labels, and any
# label-based operation between the two would silently misalign rows.
scaled_df = pd.DataFrame(X_scaled, columns=names, index=X.index)

### Split data into training and test sets

In [23]:
from sklearn.model_selection import train_test_split

In [24]:
# We save 30% of the data for testing.  A fixed random_state makes the split,
# and therefore the CSV files written below, reproducible across kernel
# restarts.
X_train, X_test, y_train, y_test = train_test_split(
    scaled_df, y, test_size=0.3, random_state=42
)

### Save test and train data

Save the test and train data in the directory `Data/Processed` in csv format.

In [25]:
# Write the training features without the index column.
x_train_csv = '../Data/Processed/X_train.csv'
X_train.to_csv(x_train_csv, index=False)

In [26]:
# Write the training labels without the index column.
y_train_csv = '../Data/Processed/y_train.csv'
y_train.to_csv(y_train_csv, index=False)

In [27]:
# Write the test features without the index column.
x_test_csv = '../Data/Processed/X_test.csv'
X_test.to_csv(x_test_csv, index=False)

In [28]:
# Write the test labels without the index column.
y_test_csv = '../Data/Processed/y_test.csv'
y_test.to_csv(y_test_csv, index=False)