In [2]:
#update sklearn to prevent version mismatches
!pip install sklearn --upgrade



In [3]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib



In [4]:
import pandas as pd

# Read CSV & Clean 

In [5]:
# Load the loan data from the CSV file and preview the first rows.
csv_path = "clean__filtered_loans.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0.1,Unnamed: 0,loan_amnt,term,grade,emp_length,home_ownership,annual_inc,loan_status,fico_range_high,application_type,emp_length_int
0,4,10400.0,60 months,F,3 years,MORTGAGE,104433.0,Fully Paid,699.0,Individual,3.0
1,5,11950.0,36 months,C,4 years,RENT,34000.0,Fully Paid,694.0,Individual,4.0
2,8,10000.0,36 months,A,6 years,RENT,85000.0,Fully Paid,689.0,Individual,6.0
3,10,22400.0,60 months,C,6 years,MORTGAGE,95000.0,Current,714.0,Individual,6.0
4,11,16000.0,60 months,C,1 year,MORTGAGE,70000.0,Current,724.0,Individual,1.0


In [6]:
# Remove columns that are not used as model inputs: the textual emp_length
# (the numeric emp_length_int column is kept), the 'Unnamed: 0' column
# (presumably an index saved by an earlier CSV export — verify), and
# home_ownership.
df_clean = df.drop(columns=['emp_length', 'Unnamed: 0', 'home_ownership'])

# DataFrame.dropna returns a new frame, so the result has to be assigned;
# calling it without assignment leaves the missing values in place. Dropping
# them here matters because the one-hot encoding applied afterwards would turn
# a missing categorical value into an all-zero row instead of a NaN.
df_clean = df_clean.dropna(how='any')

# Keep only the loans whose status is 'Fully Paid' or 'Charged Off'.
df_clean = df_clean[df_clean['loan_status'].isin(['Fully Paid', 'Charged Off'])]
df_clean

Unnamed: 0,loan_amnt,term,grade,annual_inc,loan_status,fico_range_high,application_type,emp_length_int
0,10400.0,60 months,F,104433.0,Fully Paid,699.0,Individual,3.0
1,11950.0,36 months,C,34000.0,Fully Paid,694.0,Individual,4.0
2,10000.0,36 months,A,85000.0,Fully Paid,689.0,Individual,6.0
5,1400.0,36 months,C,64000.0,Fully Paid,704.0,Individual,3.0
6,18000.0,60 months,E,150000.0,Charged Off,669.0,Individual,7.0
...,...,...,...,...,...,...,...,...
1512684,36400.0,60 months,C,95000.0,Charged Off,724.0,Individual,5.0
1512686,18000.0,60 months,B,130000.0,Fully Paid,739.0,Individual,5.0
1512688,29400.0,60 months,C,180792.0,Fully Paid,709.0,Individual,9.0
1512689,32000.0,60 months,C,157000.0,Charged Off,739.0,Individual,3.0


In [7]:
# One-hot encode the categorical columns. drop_first=True drops the first
# level of each encoded column, which then acts as the implicit baseline.
categorical_columns = ["term", "loan_status", "application_type"]
clean_1 = pd.get_dummies(
    df_clean,
    columns=categorical_columns,
    drop_first=True,
)
clean_1

Unnamed: 0,loan_amnt,grade,annual_inc,fico_range_high,emp_length_int,term_ 60 months,loan_status_Fully Paid,application_type_Joint App
0,10400.0,F,104433.0,699.0,3.0,1,1,0
1,11950.0,C,34000.0,694.0,4.0,0,1,0
2,10000.0,A,85000.0,689.0,6.0,0,1,0
5,1400.0,C,64000.0,704.0,3.0,0,1,0
6,18000.0,E,150000.0,669.0,7.0,1,0,0
...,...,...,...,...,...,...,...,...
1512684,36400.0,C,95000.0,724.0,5.0,1,0,0
1512686,18000.0,B,130000.0,739.0,5.0,1,1,0
1512688,29400.0,C,180792.0,709.0,9.0,1,1,0
1512689,32000.0,C,157000.0,739.0,3.0,1,0,0


In [8]:
# Inspect the dtype of every column to spot the ones that are still non-numeric.
column_dtypes = clean_1.dtypes
print(column_dtypes)

loan_amnt                     float64
grade                          object
annual_inc                    float64
fico_range_high               float64
emp_length_int                float64
term_ 60 months                 uint8
loan_status_Fully Paid          uint8
application_type_Joint App      uint8
dtype: object


In [9]:
# Ordinal-encode the letter grade: 'A' -> 0, 'B' -> 1, ..., 'G' -> 6.
grade_codes = {letter: code for code, letter in enumerate('ABCDEFG')}

# Swap the letters for their codes, then discard every row that still has a
# missing value in any column.
clean_1 = clean_1.replace({'grade': grade_codes}).dropna()
clean_1

Unnamed: 0,loan_amnt,grade,annual_inc,fico_range_high,emp_length_int,term_ 60 months,loan_status_Fully Paid,application_type_Joint App
0,10400.0,5,104433.0,699.0,3.0,1,1,0
1,11950.0,2,34000.0,694.0,4.0,0,1,0
2,10000.0,0,85000.0,689.0,6.0,0,1,0
5,1400.0,2,64000.0,704.0,3.0,0,1,0
6,18000.0,4,150000.0,669.0,7.0,1,0,0
...,...,...,...,...,...,...,...,...
1512684,36400.0,2,95000.0,724.0,5.0,1,0,0
1512686,18000.0,1,130000.0,739.0,5.0,1,1,0
1512688,29400.0,2,180792.0,709.0,9.0,1,1,0
1512689,32000.0,2,157000.0,739.0,3.0,1,0,0


# Select Features (columns)

In [10]:
# Feature matrix: every column except the target column 'grade'.
# These are used as the x values in the cells that follow.
# NOTE(review): this keeps 'loan_status_Fully Paid' as an input; if grade is
# assigned before the loan outcome is known, that column leaks future
# information into the model — confirm whether it should stay.
selected_features = clean_1.drop(columns=['grade'])

# Create Train Test Split

Use the `grade` column as the target (the y values); all remaining columns are the features (the x values).

In [11]:
# set X and y variables
# Target variable: the encoded grade column.
y = clean_1['grade']
# Features: the frame with the grade column removed.
x = selected_features
# Sanity check: both must have the same number of rows.
print(x.shape, y.shape)

(824600, 7) (824600,)


In [12]:
# Split dataset into training set and test set
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import to_categorical

# Hold out part of the data for testing; the fixed random_state makes the
# split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=1,
)

In [13]:
# Preview the first five training rows (features only, still unscaled).
x_train.head(n=5)

Unnamed: 0,loan_amnt,annual_inc,fico_range_high,emp_length_int,term_ 60 months,loan_status_Fully Paid,application_type_Joint App
614635,7000.0,42264.0,699.0,1.0,0,1,0
1169108,11000.0,104000.0,664.0,1.0,0,1,0
202356,16000.0,60000.0,694.0,4.0,1,0,0
244712,20000.0,96000.0,694.0,0.0,0,1,0
38530,13075.0,150000.0,679.0,3.0,0,1,0


# Pre-Processing

In [14]:
# Scale the features to the range learned from the training data only, then
# apply that same fitted scaler to the test features.
x_scaler = MinMaxScaler()
x_train_scaled = x_scaler.fit_transform(x_train)
x_test_scaled = x_scaler.transform(x_test)

# Encode the target labels: the encoder is fitted on the training labels and
# the same fitted encoder is reused for the test labels.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

# Train the Model

In [20]:
from sklearn.ensemble import RandomForestClassifier

# A random forest is a randomized estimator, so without a fixed random_state
# every run trains a different model and reports different scores. Seeding it
# (same seed as the train/test split) makes the results and the saved model
# reproducible.
# NOTE(review): the recorded scores show a large gap between training and
# testing accuracy, i.e. the forest overfits; consider tuning n_estimators,
# max_depth or min_samples_leaf.
model = RandomForestClassifier(n_estimators=10, random_state=1)
model.fit(x_train_scaled, y_train)

# Predict the encoded grade for every row of the held-out test set.
predictions = model.predict(x_test_scaled)
print(predictions)

[2 1 1 ... 2 1 4]


In [21]:
# Mean accuracy on the data the model was fitted on and on the held-out data;
# comparing the two shows how well the model generalizes.
train_score = model.score(x_train_scaled, y_train)
test_score = model.score(x_test_scaled, y_test)
print(f"Training Data Score: {train_score}")
print(f"Testing Data Score: {test_score}")

Training Data Score: 0.7838046729727545
Testing Data Score: 0.34629638612660685


In [24]:
import joblib
# Persist the trained model to disk; joblib.dump returns the list of files it
# wrote, which is displayed as the cell output.
filename = 'random_forest_model.pkl'
joblib.dump(value=model, filename=filename)

['random_forest_model.pkl']

In [25]:
# Persist the fitted feature scaler next to the model so that new inputs can
# be scaled exactly like the training data before prediction.
scaler_path = 'scaler.pkl'
joblib.dump(x_scaler, scaler_path)

['scaler.pkl']