# Task4: AutoML

# TPOT

TPOT is a Python Automated Machine Learning (AutoML) tool that optimizes machine learning pipelines using genetic programming (inspired by the Darwinian process of natural selection, i.e. searching for the fittest possible solution to an optimization problem). It automates the most tedious parts of machine learning, i.e. data preparation, feature selection, feature engineering, model selection and validation, and hyperparameter tuning, and outputs the Python code for the best pipeline it found when it's done.

In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from tpot import TPOTRegressor

# suppress all warnings (not only those raised by pandas) to keep the notebook output readable
import warnings
warnings.filterwarnings('ignore')

# Raise pandas' display limits so wide/long frames are printed without truncation.
for _option, _limit in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(_option, _limit)

# Load the cleaned loan data from <cwd>/Data/cleaned_data.csv.
# The path is assembled with os.path.join instead of string concatenation so
# the separator is correct regardless of platform.
loans = pd.read_csv(os.path.join(os.getcwd(), "Data", "cleaned_data.csv"), low_memory=False)

# Target variable: the interest rate the regressor has to predict.
labels = loans['int_rate']

# The regressor only consumes numerical data, so drop the target itself plus
# the columns listed here as non-numerical.
features = loans.drop(['int_rate', 'grade', 'sub_grade', 'emp_length', 'home_ownership',
                       'verification_status', 'issue_d', 'loan_status', 'purpose',
                       'addr_state', 'application_type', 'earliest_cr_line'], axis=1)

# Convert to float64 arrays; this raises if any remaining column is not numeric.
X = np.array(features).astype(np.float64)
Y = np.array(labels).astype(np.float64)

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): the original carried a commented-out `stratify = 'int_rate'`
# remark; no stratification is applied here.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=42)



# Optimize pipelines for regression problems

In [None]:
# Evolve regression pipelines: 5 generations with 20 candidate pipelines each,
# printing progress as the search runs (verbosity=2).
tpot = TPOTRegressor(generations=5, population_size=20, verbosity=2)
tpot.fit(X_train, Y_train)

# NOTE(review): score() reports TPOT's configured scoring metric, not MAPE —
# confirm which metric/sign convention applies before comparing numbers.
print("TPOT Score for Testing set")
print(tpot.score(X_test, Y_test))
print("TPOT Score for Training set")
# Fixed: the training score must compare against the training targets
# (Y_train); the original passed X_train as both features and targets.
print(tpot.score(X_train, Y_train))

# Write the best pipeline found as a standalone Python script.
tpot.export('tpot_loan_pipeline.py')

In [6]:
# Predict interest rates for the held-out test rows with the fitted TPOT model.
Y_predicted = tpot.predict(X_test)

In [7]:
#Calculate the MAPE
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE) of a prediction.

    Both arguments are converted to numpy arrays. Each element's absolute
    error is divided by the corresponding true value, the ratios are
    averaged, and the mean is scaled by 100 to express it as a percentage.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_errors = np.abs((actual - predicted) / actual)
    return np.mean(relative_errors) * 100

# Report the MAPE of the TPOT predictions on the held-out test set.
mape = mean_absolute_percentage_error(Y_test, Y_predicted)
print('Mean Absolute Percentage Error:', mape)

('Mean Absolute Percentage Error:', 15.599448586333253)
