##### Prediction for Term Deposit Subscription

  ---- By Shreya Bhatnagar

In [1]:
# IMPORTING LIBRARIES

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np

# IGNORE WARNINGS
import warnings
warnings.filterwarnings("ignore")

In [2]:
# ACCESSING TRAINING DATA

train = pd.read_csv('training_data.csv')

In [3]:
# PRINTING DATA

train

Unnamed: 0,ID,AGE,JOB,MARITAL,EDUCATION,DEFAULT,BALANCE,HOUSING,LOAN,CONTACT,DAY,MONTH,DURATION,CAMPAIGN,PDAYS,PREVIOUS,POUTCOME,TARGET
0,cl/26988,40,services,married,secondary,no,10406,no,no,cellular,21,nov,348,2,127,4,other,no
1,cl/44045,77,retired,married,tertiary,no,1047,no,no,cellular,30,jun,108,5,-1,0,,no
2,cl/535,45,blue-collar,divorced,secondary,no,756,yes,no,,6,may,179,2,-1,0,,no
3,cl/39433,27,management,single,tertiary,no,616,yes,no,cellular,22,may,685,1,101,1,other,yes
4,cl/5241,36,management,married,tertiary,no,8564,yes,no,,23,may,125,1,-1,0,,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31642,cl/12363,53,services,married,primary,no,931,no,no,,27,jun,159,2,-1,0,,no
31643,cl/5695,50,admin.,single,secondary,no,1651,yes,no,,26,may,41,2,-1,0,,no
31644,cl/8006,33,technician,single,secondary,no,-244,yes,no,,2,jun,63,2,-1,0,,no
31645,cl/17745,52,management,divorced,tertiary,no,-299,no,no,cellular,29,jul,110,2,-1,0,,no


In [4]:
# PRINTING SHAPE

train.shape

(31647, 18)

In [5]:
# PRINTING COLUMN NAMES

train.columns

Index(['ID', 'AGE', 'JOB', 'MARITAL', 'EDUCATION', 'DEFAULT', 'BALANCE',
       'HOUSING', 'LOAN', 'CONTACT', 'DAY', 'MONTH', 'DURATION', 'CAMPAIGN',
       'PDAYS', 'PREVIOUS', 'POUTCOME', 'TARGET'],
      dtype='object')

In [6]:
# EDITING TARGET VARIABLE

train.TARGET.replace(('yes', 'no'), (1, 0), inplace=True)


In [7]:
# TRAIN TEST SPLIT
train_X_variables = ['AGE', 'JOB', 'MARITAL', 'EDUCATION', 'DEFAULT', 'BALANCE',
       'HOUSING', 'DAY', 'LOAN','CONTACT', 'MONTH', 'DURATION', 'CAMPAIGN',
       'PDAYS', 'PREVIOUS', 'POUTCOME']
train_y_variables = ['TARGET']
X_train,X_test,y_train,y_test = train_test_split(train[train_X_variables], train[train_y_variables], test_size=0.33, random_state=1)

In [8]:
print(X_train)
print(X_test)
print(y_train)
print(y_test)

       AGE            JOB  MARITAL  EDUCATION DEFAULT  BALANCE HOUSING  DAY  \
31494   36         admin.  married  secondary      no        0      no   26   
1715    40    blue-collar  married  secondary      no     1313      no   14   
13701   30         admin.   single  secondary      no       72      no    3   
30868   49     management   single   tertiary      no     7443      no   19   
21217   58    blue-collar  married    primary      no     1596     yes   12   
...    ...            ...      ...        ...     ...      ...     ...  ...   
17289   29       services   single  secondary      no      487     yes    9   
5192    26    blue-collar  married    primary      no        0     yes   21   
12172   49  self-employed  married  secondary      no     -199     yes   29   
235     50    blue-collar  married    primary      no      111     yes   15   
29733   52    blue-collar  married    unknown      no     2171     yes   17   

      LOAN    CONTACT MONTH  DURATION  CAMPAIGN  PD

###### Cleaning Data

<I><u>Getting Catagorical and Numerical Columns to impute</u></I>

In [9]:
# Getting Catagorical Columns

s = (X_train.dtypes == 'object')
object_cols = list(s[s].index)

# Getting Numerical Columns

s = (X_train.dtypes != 'object')
num_cols = list(s[s].index)

#printing to check

print(object_cols)
print(num_cols)

['JOB', 'MARITAL', 'EDUCATION', 'DEFAULT', 'HOUSING', 'LOAN', 'CONTACT', 'MONTH', 'POUTCOME']
['AGE', 'BALANCE', 'DAY', 'DURATION', 'CAMPAIGN', 'PDAYS', 'PREVIOUS']


In [10]:
# Numeric Transformer

numerical_transformer = SimpleImputer(strategy='mean')

# Catagorical Transformer

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Preprocessing

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, num_cols),
        ('cat', categorical_transformer, object_cols)
    ])


In [11]:
#!pip install catboost
from catboost import CatBoostClassifier

model_cat = CatBoostClassifier(custom_loss=['Accuracy'],random_seed=42,loss_function = 'CrossEntropy')


In [12]:
# APPLYING PIPELINE

my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model_cat)
                             ])
my_pipeline.fit(X_train, y_train)
preds = my_pipeline.predict(X_test)
preds_training = my_pipeline.predict(X_train)

print(preds)
print(sum(preds))
print(len(preds) - sum(preds))
print((sum(preds))/(len(preds) - sum(preds)))

0:	learn: 0.6589964	total: 125ms	remaining: 2m 5s
1:	learn: 0.6277992	total: 148ms	remaining: 1m 13s
2:	learn: 0.5946908	total: 164ms	remaining: 54.6s
3:	learn: 0.5693062	total: 203ms	remaining: 50.6s
4:	learn: 0.5436121	total: 222ms	remaining: 44.2s
5:	learn: 0.5211868	total: 240ms	remaining: 39.7s
6:	learn: 0.5002355	total: 257ms	remaining: 36.5s
7:	learn: 0.4832766	total: 275ms	remaining: 34.1s
8:	learn: 0.4629565	total: 292ms	remaining: 32.2s
9:	learn: 0.4462167	total: 322ms	remaining: 31.8s
10:	learn: 0.4323106	total: 341ms	remaining: 30.6s
11:	learn: 0.4174953	total: 365ms	remaining: 30.1s
12:	learn: 0.4054981	total: 387ms	remaining: 29.4s
13:	learn: 0.3949535	total: 407ms	remaining: 28.7s
14:	learn: 0.3834873	total: 427ms	remaining: 28s
15:	learn: 0.3748613	total: 447ms	remaining: 27.5s
16:	learn: 0.3678651	total: 475ms	remaining: 27.5s
17:	learn: 0.3605342	total: 504ms	remaining: 27.5s
18:	learn: 0.3522851	total: 533ms	remaining: 27.5s
19:	learn: 0.3443550	total: 553ms	remainin

168:	learn: 0.2107187	total: 3.92s	remaining: 19.3s
169:	learn: 0.2105790	total: 3.95s	remaining: 19.3s
170:	learn: 0.2104577	total: 3.97s	remaining: 19.2s
171:	learn: 0.2103221	total: 3.99s	remaining: 19.2s
172:	learn: 0.2101794	total: 4.01s	remaining: 19.2s
173:	learn: 0.2099889	total: 4.04s	remaining: 19.2s
174:	learn: 0.2096149	total: 4.06s	remaining: 19.1s
175:	learn: 0.2094806	total: 4.09s	remaining: 19.1s
176:	learn: 0.2093143	total: 4.16s	remaining: 19.3s
177:	learn: 0.2091063	total: 4.17s	remaining: 19.3s
178:	learn: 0.2089055	total: 4.19s	remaining: 19.2s
179:	learn: 0.2086187	total: 4.21s	remaining: 19.2s
180:	learn: 0.2084429	total: 4.23s	remaining: 19.1s
181:	learn: 0.2083205	total: 4.25s	remaining: 19.1s
182:	learn: 0.2082089	total: 4.27s	remaining: 19.1s
183:	learn: 0.2080125	total: 4.3s	remaining: 19.1s
184:	learn: 0.2078958	total: 4.32s	remaining: 19s
185:	learn: 0.2077525	total: 4.34s	remaining: 19s
186:	learn: 0.2074231	total: 4.36s	remaining: 18.9s
187:	learn: 0.207

331:	learn: 0.1909058	total: 7.36s	remaining: 14.8s
332:	learn: 0.1908264	total: 7.38s	remaining: 14.8s
333:	learn: 0.1906928	total: 7.4s	remaining: 14.8s
334:	learn: 0.1905588	total: 7.41s	remaining: 14.7s
335:	learn: 0.1904733	total: 7.43s	remaining: 14.7s
336:	learn: 0.1903348	total: 7.45s	remaining: 14.7s
337:	learn: 0.1902274	total: 7.47s	remaining: 14.6s
338:	learn: 0.1901535	total: 7.49s	remaining: 14.6s
339:	learn: 0.1900669	total: 7.5s	remaining: 14.6s
340:	learn: 0.1899057	total: 7.52s	remaining: 14.5s
341:	learn: 0.1898331	total: 7.54s	remaining: 14.5s
342:	learn: 0.1897086	total: 7.56s	remaining: 14.5s
343:	learn: 0.1896398	total: 7.57s	remaining: 14.4s
344:	learn: 0.1895677	total: 7.59s	remaining: 14.4s
345:	learn: 0.1894544	total: 7.61s	remaining: 14.4s
346:	learn: 0.1892977	total: 7.63s	remaining: 14.4s
347:	learn: 0.1892095	total: 7.64s	remaining: 14.3s
348:	learn: 0.1891099	total: 7.66s	remaining: 14.3s
349:	learn: 0.1890433	total: 7.68s	remaining: 14.3s
350:	learn: 0.

490:	learn: 0.1770512	total: 10.7s	remaining: 11.1s
491:	learn: 0.1769761	total: 10.7s	remaining: 11.1s
492:	learn: 0.1769175	total: 10.8s	remaining: 11.1s
493:	learn: 0.1768162	total: 10.8s	remaining: 11.1s
494:	learn: 0.1767547	total: 10.8s	remaining: 11s
495:	learn: 0.1766475	total: 10.8s	remaining: 11s
496:	learn: 0.1765764	total: 10.9s	remaining: 11s
497:	learn: 0.1765017	total: 10.9s	remaining: 11s
498:	learn: 0.1764261	total: 10.9s	remaining: 11s
499:	learn: 0.1763498	total: 10.9s	remaining: 10.9s
500:	learn: 0.1762522	total: 11s	remaining: 10.9s
501:	learn: 0.1762019	total: 11s	remaining: 10.9s
502:	learn: 0.1761467	total: 11s	remaining: 10.9s
503:	learn: 0.1760810	total: 11s	remaining: 10.8s
504:	learn: 0.1759381	total: 11s	remaining: 10.8s
505:	learn: 0.1758751	total: 11.1s	remaining: 10.8s
506:	learn: 0.1758102	total: 11.1s	remaining: 10.8s
507:	learn: 0.1757029	total: 11.1s	remaining: 10.8s
508:	learn: 0.1756168	total: 11.1s	remaining: 10.7s
509:	learn: 0.1755539	total: 11.

657:	learn: 0.1664146	total: 14.3s	remaining: 7.44s
658:	learn: 0.1663596	total: 14.3s	remaining: 7.42s
659:	learn: 0.1663125	total: 14.4s	remaining: 7.39s
660:	learn: 0.1662540	total: 14.4s	remaining: 7.37s
661:	learn: 0.1661764	total: 14.4s	remaining: 7.35s
662:	learn: 0.1661300	total: 14.4s	remaining: 7.33s
663:	learn: 0.1660466	total: 14.4s	remaining: 7.3s
664:	learn: 0.1659940	total: 14.5s	remaining: 7.28s
665:	learn: 0.1659400	total: 14.5s	remaining: 7.26s
666:	learn: 0.1658639	total: 14.5s	remaining: 7.24s
667:	learn: 0.1658005	total: 14.5s	remaining: 7.22s
668:	learn: 0.1657272	total: 14.5s	remaining: 7.2s
669:	learn: 0.1656893	total: 14.6s	remaining: 7.17s
670:	learn: 0.1656383	total: 14.6s	remaining: 7.15s
671:	learn: 0.1655826	total: 14.6s	remaining: 7.13s
672:	learn: 0.1655097	total: 14.6s	remaining: 7.11s
673:	learn: 0.1654812	total: 14.7s	remaining: 7.09s
674:	learn: 0.1654270	total: 14.7s	remaining: 7.07s
675:	learn: 0.1653838	total: 14.7s	remaining: 7.04s
676:	learn: 0.

820:	learn: 0.1583256	total: 17.7s	remaining: 3.85s
821:	learn: 0.1582875	total: 17.7s	remaining: 3.83s
822:	learn: 0.1582131	total: 17.7s	remaining: 3.81s
823:	learn: 0.1581750	total: 17.7s	remaining: 3.79s
824:	learn: 0.1581493	total: 17.8s	remaining: 3.77s
825:	learn: 0.1580994	total: 17.8s	remaining: 3.74s
826:	learn: 0.1580649	total: 17.8s	remaining: 3.72s
827:	learn: 0.1580388	total: 17.8s	remaining: 3.7s
828:	learn: 0.1579977	total: 17.8s	remaining: 3.68s
829:	learn: 0.1579512	total: 17.9s	remaining: 3.66s
830:	learn: 0.1579244	total: 17.9s	remaining: 3.63s
831:	learn: 0.1578122	total: 17.9s	remaining: 3.61s
832:	learn: 0.1577706	total: 17.9s	remaining: 3.59s
833:	learn: 0.1577195	total: 17.9s	remaining: 3.57s
834:	learn: 0.1576818	total: 18s	remaining: 3.55s
835:	learn: 0.1576413	total: 18s	remaining: 3.53s
836:	learn: 0.1575954	total: 18s	remaining: 3.51s
837:	learn: 0.1575481	total: 18s	remaining: 3.48s
838:	learn: 0.1575115	total: 18s	remaining: 3.46s
839:	learn: 0.1574653	t

981:	learn: 0.1516077	total: 21.1s	remaining: 386ms
982:	learn: 0.1515424	total: 21.1s	remaining: 365ms
983:	learn: 0.1515114	total: 21.1s	remaining: 343ms
984:	learn: 0.1514482	total: 21.1s	remaining: 322ms
985:	learn: 0.1513868	total: 21.1s	remaining: 300ms
986:	learn: 0.1513495	total: 21.2s	remaining: 279ms
987:	learn: 0.1512745	total: 21.2s	remaining: 257ms
988:	learn: 0.1512243	total: 21.2s	remaining: 236ms
989:	learn: 0.1511506	total: 21.2s	remaining: 214ms
990:	learn: 0.1510744	total: 21.2s	remaining: 193ms
991:	learn: 0.1510093	total: 21.3s	remaining: 172ms
992:	learn: 0.1509546	total: 21.3s	remaining: 150ms
993:	learn: 0.1508976	total: 21.3s	remaining: 129ms
994:	learn: 0.1508523	total: 21.3s	remaining: 107ms
995:	learn: 0.1508160	total: 21.4s	remaining: 85.7ms
996:	learn: 0.1508060	total: 21.4s	remaining: 64.3ms
997:	learn: 0.1507809	total: 21.4s	remaining: 42.9ms
998:	learn: 0.1507601	total: 21.4s	remaining: 21.4ms
999:	learn: 0.1507159	total: 21.4s	remaining: 0us
[0 0 0 ...

In [13]:
print(accuracy_score(y_test,preds))
print(accuracy_score(y_train,preds_training))

0.906644963615473
0.9438287034853559


In [14]:
# IMPORTING PREDICTION DATA

test = pd.read_csv('test_data.csv')MultiClass, MultiClassOneVsAll or custom objective object


SyntaxError: invalid syntax (<ipython-input-14-b7e083a80a9f>, line 3)

In [None]:
test

In [None]:
final_train_X = train[train_X_variables]
final_train_y = train[train_y_variables]
final_test_X = test[train_X_variables]

In [None]:
# Getting Catagorical Columns

s = (final_train_X.dtypes == 'object')
object_cols = list(s[s].index)

# Getting Numerical Columns

s = (final_train_X.dtypes != 'object')
num_cols = list(s[s].index)

#printing to check

print(object_cols)
print(num_cols)

In [None]:
# Numeric Transformer

numerical_transformer = SimpleImputer(strategy='mean')

# Catagorical Transformer

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Preprocessing

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, num_cols),
        ('cat', categorical_transformer, object_cols)
    ])


In [None]:
# APPLYING PIPELINE

my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model_cat)
                             ])
my_pipeline.fit(final_train_X, final_train_y)
preds = my_pipeline.predict(final_test_X)

