In [15]:
#Installing TPOT
!pip install tpot



In [2]:
#Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tpot import TPOTRegressor
from tpot import TPOTClassifier

In [3]:
#Reading the data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [4]:
train.head(5)

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15772896.0,Pritchard,535.0,France,Male,44.0,8.0,108393.35,1.0,1.0,1.0,127528.84,0.0
1,1,15758063.0,Nkemakolam,549.0,France,Male,38.0,8.0,0.0,2.0,1.0,1.0,146891.07,0.0
2,2,15767231.0,Genovese,641.0,France,Male,34.0,5.0,0.0,2.0,0.0,1.0,171922.72,0.0
3,3,15762543.0,Nnachetam,692.0,Germany,Female,34.0,7.0,78892.23,1.0,1.0,1.0,121592.5,0.0
4,4,15747358.0,Chinwemma,681.0,France,Male,41.0,5.0,0.0,2.0,0.0,0.0,100791.36,0.0


In [5]:
#Dropping unneccessary columns
train.drop(['CustomerId', 'Surname'], axis=1, inplace=True)

In [6]:
#createing dummies for geography,gender
train = pd.get_dummies(train, columns=['Geography', 'Gender'])


In [7]:
#rename unnamed column to id
train = train.rename(columns={'Unnamed: 0': 'id'})
train.head(5)


Unnamed: 0,id,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,0,535.0,44.0,8.0,108393.35,1.0,1.0,1.0,127528.84,0.0,True,False,False,False,True
1,1,549.0,38.0,8.0,0.0,2.0,1.0,1.0,146891.07,0.0,True,False,False,False,True
2,2,641.0,34.0,5.0,0.0,2.0,0.0,1.0,171922.72,0.0,True,False,False,False,True
3,3,692.0,34.0,7.0,78892.23,1.0,1.0,1.0,121592.5,0.0,False,True,False,True,False
4,4,681.0,41.0,5.0,0.0,2.0,0.0,0.0,100791.36,0.0,True,False,False,False,True


In [9]:
#checking for null values
train.isnull().sum()

Unnamed: 0,0
id,0
CreditScore,0
Age,0
Tenure,0
Balance,0
NumOfProducts,0
HasCrCard,0
IsActiveMember,0
EstimatedSalary,0
Exited,0


In [10]:
target_col = "Exited"
X = train.loc[:, train.columns != target_col]
y = train[target_col]

In [11]:
#train and test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [12]:
#applying tpot classifier
tpot = TPOTClassifier(generations=15, population_size=40, verbosity=2)
tpot.fit(X_train, y_train)

Optimization Progress:   0%|          | 0/640 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.9001990049751244

Generation 2 - Current best internal CV score: 0.9001990049751244

Generation 3 - Current best internal CV score: 0.9001990049751244

Generation 4 - Current best internal CV score: 0.9001990049751244

Generation 5 - Current best internal CV score: 0.900597014925373

Generation 6 - Current best internal CV score: 0.900597014925373

Generation 7 - Current best internal CV score: 0.900597014925373

Generation 8 - Current best internal CV score: 0.900597014925373

Generation 9 - Current best internal CV score: 0.900597014925373

Generation 10 - Current best internal CV score: 0.900597014925373

Generation 11 - Current best internal CV score: 0.900597014925373

Generation 12 - Current best internal CV score: 0.900597014925373

Generation 13 - Current best internal CV score: 0.9007960199004975

Generation 14 - Current best internal CV score: 0.9013930348258705

Generation 15 - Current best internal CV score: 0.90139303482587

In [13]:
#Evaluvating model performance
tpot.score(X_test, y_test)

0.9016161616161616

In [14]:
#exporting python file
tpot.export('tpot_boston_pipeline.py')

In [17]:
# TPOT auto-generated code
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from tpot.builtins import ZeroCount
from xgboost import XGBClassifier


# Average CV score on the training set was: 0.9013930348258705
exported_pipeline = make_pipeline(
    RobustScaler(),
    ZeroCount(),
    MinMaxScaler(),
    XGBClassifier(learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100, n_jobs=1, subsample=0.9500000000000001, verbosity=0)
)



exported_pipeline.fit(X_train, y_train)
results = exported_pipeline.predict(X_test)


In [18]:
test = test.rename(columns={'Unnamed: 0': 'id'})
test.head(5)

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,15000,15590011.0,Hsieh,661.0,Germany,Male,32.0,5.0,113715.36,2.0,1.0,1.0,55816.2
1,15001,15769499.0,Lo Duca,525.0,Spain,Female,37.0,3.0,0.0,2.0,0.0,1.0,160129.99
2,15002,15783097.0,T'ien,585.0,Spain,Male,42.0,7.0,0.0,2.0,0.0,1.0,126160.24
3,15003,15626985.0,Ch'en,634.0,France,Male,39.0,2.0,0.0,2.0,1.0,0.0,163714.92
4,15004,15681924.0,Yeh,546.0,France,Male,40.0,5.0,114318.23,1.0,0.0,0.0,189558.44


In [19]:
test.drop(['CustomerId', 'Surname'], axis=1, inplace=True)

In [20]:
import pandas as pd
test = pd.get_dummies(test, columns=['Geography', 'Gender'])


In [21]:
submission = tpot.predict(test)

In [22]:
# Create the submission file
final = pd.DataFrame({'id': test['id'], 'Exited': submission})
final.to_csv('submission.csv', index = False)