In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### 필요 모듈 임포트

In [5]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.metrics import f1_score, classification_report

# oversampling
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTEENN


# models
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier


# 데이터 확인

In [6]:
df_org = pd.read_csv('/content/drive/MyDrive/train.csv')
df_org.sample(10)

Unnamed: 0,id,age,job,marital,education,default,housing,loan,contact,month,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
15493,21763,32,admin.,divorced,university.degree,no,yes,no,cellular,aug,...,4,999,0,nonexistent,1.4,93.444,-36.1,4.964,5228.1,no
13375,15545,41,housemaid,married,high.school,no,yes,no,cellular,jul,...,4,999,0,nonexistent,1.4,93.918,-42.7,4.957,5228.1,no
15674,4034,32,admin.,married,high.school,unknown,yes,no,telephone,may,...,3,999,0,nonexistent,1.1,93.994,-36.4,4.858,5191.0,no
11733,12849,26,entrepreneur,married,professional.course,no,yes,no,cellular,jul,...,1,999,0,nonexistent,1.4,93.918,-42.7,4.962,5228.1,no
13863,24735,43,admin.,married,high.school,no,no,no,cellular,nov,...,1,999,0,nonexistent,-0.1,93.2,-42.0,4.153,5195.8,no
12085,1704,36,blue-collar,single,basic.4y,no,no,no,telephone,may,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.855,5191.0,no
26808,7542,31,services,single,high.school,no,no,no,telephone,may,...,3,999,0,nonexistent,1.1,93.994,-36.4,4.864,5191.0,no
21084,19806,36,blue-collar,married,professional.course,no,no,no,cellular,aug,...,1,999,0,nonexistent,1.4,93.444,-36.1,4.966,5228.1,no
15582,15921,22,blue-collar,single,unknown,unknown,unknown,unknown,cellular,jul,...,2,999,0,nonexistent,1.4,93.918,-42.7,4.96,5228.1,no
8643,21822,31,admin.,single,university.degree,no,no,no,cellular,aug,...,1,999,0,nonexistent,1.4,93.444,-36.1,4.964,5228.1,no


In [7]:
df_org.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32950 entries, 0 to 32949
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              32950 non-null  int64  
 1   age             32950 non-null  int64  
 2   job             32950 non-null  object 
 3   marital         32950 non-null  object 
 4   education       32950 non-null  object 
 5   default         32950 non-null  object 
 6   housing         32950 non-null  object 
 7   loan            32950 non-null  object 
 8   contact         32950 non-null  object 
 9   month           32950 non-null  object 
 10  day_of_week     32950 non-null  object 
 11  campaign        32950 non-null  int64  
 12  pdays           32950 non-null  int64  
 13  previous        32950 non-null  int64  
 14  poutcome        32950 non-null  object 
 15  emp.var.rate    32950 non-null  float64
 16  cons.price.idx  32950 non-null  float64
 17  cons.conf.idx   32950 non-null 

# Preprocessing

In [8]:
df = df_org

## Handling Missing values

In [9]:
# change 'unknown' to nan
df.replace('unknown', np.nan, inplace=True)

# delete records containing nan values
df = df.dropna()

df.isna().sum()

id                0
age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64

## Drop columns (1)

In [10]:
df.columns

Index(['id', 'age', 'job', 'marital', 'education', 'default', 'housing',
       'loan', 'contact', 'month', 'day_of_week', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'],
      dtype='object')

In [11]:
cols = ['age', 'marital', 'job','education', 'default',
       'contact', 'month','campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate',
       'cons.price.idx','euribor3m','y']

In [12]:
df = df[cols]
df.columns

Index(['age', 'marital', 'job', 'education', 'default', 'contact', 'month',
       'campaign', 'pdays', 'previous', 'poutcome', 'emp.var.rate',
       'cons.price.idx', 'euribor3m', 'y'],
      dtype='object')

## Encoding

In [13]:
y_map = {
    'no':0,
    'yes':1
}

df['y'] = df['y'].map(y_map)

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24383 entries, 1 to 32949
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             24383 non-null  int64  
 1   marital         24383 non-null  object 
 2   job             24383 non-null  object 
 3   education       24383 non-null  object 
 4   default         24383 non-null  object 
 5   contact         24383 non-null  object 
 6   month           24383 non-null  object 
 7   campaign        24383 non-null  int64  
 8   pdays           24383 non-null  int64  
 9   previous        24383 non-null  int64  
 10  poutcome        24383 non-null  object 
 11  emp.var.rate    24383 non-null  float64
 12  cons.price.idx  24383 non-null  float64
 13  euribor3m       24383 non-null  float64
 14  y               24383 non-null  int64  
dtypes: float64(3), int64(5), object(7)
memory usage: 3.0+ MB


In [15]:
obj = ['marital', 'job','education', 'default','contact','month','poutcome']

In [16]:
df = pd.get_dummies(df, columns=obj)
df = pd.DataFrame(df)

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24383 entries, 1 to 32949
Data columns (total 46 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   age                            24383 non-null  int64  
 1   campaign                       24383 non-null  int64  
 2   pdays                          24383 non-null  int64  
 3   previous                       24383 non-null  int64  
 4   emp.var.rate                   24383 non-null  float64
 5   cons.price.idx                 24383 non-null  float64
 6   euribor3m                      24383 non-null  float64
 7   y                              24383 non-null  int64  
 8   marital_divorced               24383 non-null  bool   
 9   marital_married                24383 non-null  bool   
 10  marital_single                 24383 non-null  bool   
 11  job_admin.                     24383 non-null  bool   
 12  job_blue-collar                24383 non-null  bool

## Drop Columns (2)

In [18]:
df.columns

Index(['age', 'campaign', 'pdays', 'previous', 'emp.var.rate',
       'cons.price.idx', 'euribor3m', 'y', 'marital_divorced',
       'marital_married', 'marital_single', 'job_admin.', 'job_blue-collar',
       'job_entrepreneur', 'job_housemaid', 'job_management', 'job_retired',
       'job_self-employed', 'job_services', 'job_student', 'job_technician',
       'job_unemployed', 'education_basic.4y', 'education_basic.6y',
       'education_basic.9y', 'education_high.school', 'education_illiterate',
       'education_professional.course', 'education_university.degree',
       'default_no', 'default_yes', 'contact_cellular', 'contact_telephone',
       'month_apr', 'month_aug', 'month_dec', 'month_jul', 'month_jun',
       'month_mar', 'month_may', 'month_nov', 'month_oct', 'month_sep',
       'poutcome_failure', 'poutcome_nonexistent', 'poutcome_success'],
      dtype='object')

In [19]:
cols = ['age', 'campaign', 'pdays', 'previous', 'emp.var.rate',
       'cons.price.idx', 'euribor3m', 'y', 'marital_single','job_retired','job_student',
       'education_illiterate', 'education_university.degree',
       'default_no', 'contact_cellular',
       'month_dec', 'month_mar','month_oct', 'month_sep', 'poutcome_success']

In [20]:
df = df[cols]
df.columns

Index(['age', 'campaign', 'pdays', 'previous', 'emp.var.rate',
       'cons.price.idx', 'euribor3m', 'y', 'marital_single', 'job_retired',
       'job_student', 'education_illiterate', 'education_university.degree',
       'default_no', 'contact_cellular', 'month_dec', 'month_mar', 'month_oct',
       'month_sep', 'poutcome_success'],
      dtype='object')

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24383 entries, 1 to 32949
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   age                          24383 non-null  int64  
 1   campaign                     24383 non-null  int64  
 2   pdays                        24383 non-null  int64  
 3   previous                     24383 non-null  int64  
 4   emp.var.rate                 24383 non-null  float64
 5   cons.price.idx               24383 non-null  float64
 6   euribor3m                    24383 non-null  float64
 7   y                            24383 non-null  int64  
 8   marital_single               24383 non-null  bool   
 9   job_retired                  24383 non-null  bool   
 10  job_student                  24383 non-null  bool   
 11  education_illiterate         24383 non-null  bool   
 12  education_university.degree  24383 non-null  bool   
 13  default_no           

## Scaling

In [22]:
df['pdays'] = df['pdays'].replace(999,-1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['pdays'] = df['pdays'].replace(999,-1)


In [23]:
numericals = ['age', 'campaign', 'pdays',	'previous',	'emp.var.rate',	'cons.price.idx',	'euribor3m']

In [24]:
ss = StandardScaler()
df[numericals] = ss.fit_transform(df[numericals])
df = pd.DataFrame(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[numericals] = ss.fit_transform(df[numericals])


In [25]:
df.describe()

Unnamed: 0,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,euribor3m,y
count,24383.0,24383.0,24383.0,24383.0,24383.0,24383.0,24383.0,24383.0
mean,3.287094e-16,-3.9340220000000005e-17,6.010312000000001e-17,-1.1947770000000001e-17,-2.4915480000000003e-17,6.8248e-16,5.5367720000000004e-17,0.126071
std,1.000021,1.000021,1.000021,1.000021,1.000021,1.000021,1.000021,0.331937
min,-2.138719,-0.5616367,-0.1851924,-0.3692239,-2.068215,-2.261521,-1.592084,0.0
25%,-0.7791658,-0.5616367,-0.1851924,-0.3692239,-1.075061,-0.7683522,-1.210008,0.0
50%,-0.1965,-0.1909972,-0.1851924,-0.3692239,0.7250287,-0.1379412,0.7836488,0.0
75%,0.5803876,0.1796423,-0.1851924,-0.3692239,0.911245,0.8016958,0.8427327,0.0
max,5.435935,14.63458,17.05912,12.88688,0.911245,2.122313,0.8899997,1.0


## Splitting Data & Oversampling

In [26]:
df.head()

Unnamed: 0,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,euribor3m,y,marital_single,job_retired,job_student,education_illiterate,education_university.degree,default_no,contact_cellular,month_dec,month_mar,month_oct,month_sep,poutcome_success
1,-0.682055,-0.561637,-0.185192,-0.369224,-1.757854,-0.959696,-1.238706,1,False,False,False,False,True,True,True,False,False,False,False,False
2,-0.293611,0.920921,-0.185192,-0.369224,0.911245,-0.137941,0.844421,0,True,False,False,False,True,True,True,False,False,False,False,False
3,-0.876277,-0.561637,-0.185192,3.418233,-1.075061,-1.079287,-1.217886,0,True,False,False,False,False,True,True,False,False,False,False,False
4,3.590827,-0.561637,-0.185192,1.524505,-1.012989,0.858074,-1.450283,1,False,True,False,False,False,True,True,False,False,False,False,False
5,1.551497,-0.561637,-0.185192,-0.369224,-2.068215,-1.868582,-1.533,0,False,False,False,False,False,True,True,False,False,True,False,False


In [27]:
X = df.drop('y', axis=1)
y = df['y']

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [29]:
smoteenn = SMOTEENN(random_state=12)
X_smoteenn, y_smoteenn = smoteenn.fit_resample(X_train, y_train)

In [30]:
X_smoteenn.describe()

Unnamed: 0,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,euribor3m
count,21649.0,21649.0,21649.0,21649.0,21649.0,21649.0,21649.0
mean,0.098387,-0.075588,0.385561,0.326989,-0.349868,-0.089825,-0.372013
std,1.192228,0.907459,1.623042,1.413572,1.058628,1.074218,1.063843
min,-2.138719,-0.561637,-0.185192,-0.369224,-2.068215,-2.261521,-1.592084
25%,-0.779166,-0.561637,-0.185192,-0.369224,-1.075061,-1.002005,-1.425979
50%,-0.228378,-0.263294,-0.185192,-0.369224,-0.738538,-0.137941,-1.153073
75%,0.77461,0.066243,-0.185192,1.524505,0.911245,0.801696,0.828006
max,4.85327,14.634583,17.059121,10.993148,0.911245,2.122313,0.89


# Training

## RandomForest

In [31]:
rf = RandomForestClassifier(n_estimators=300,
                            min_samples_split=5,
                            random_state=12,
                            class_weight=None,
                            criterion='entropy',
                            max_features=None)

rf.fit(X_smoteenn, y_smoteenn)
pred_rf = rf.predict(X_test)
f1_rf = f1_score(y_test, pred_rf)

print(f1_rf)
print(classification_report(y_test, pred_rf))

0.4976958525345622
              precision    recall  f1-score   support

           0       0.94      0.89      0.91      6394
           1       0.43      0.59      0.50       921

    accuracy                           0.85      7315
   macro avg       0.68      0.74      0.71      7315
weighted avg       0.87      0.85      0.86      7315



In [32]:
values, counts = np.unique(pred_rf, return_counts=True)

for value, count in zip(values, counts):
    print(f'Value: {value}, Count: {count}')

Value: 0, Count: 6066
Value: 1, Count: 1249


## GradientBoosting

In [33]:
gb = GradientBoostingClassifier(random_state=0,
                                 n_estimators= 92,
                                 learning_rate= 0.18,
                                 max_depth= 5,
                                 subsample= 0.84,
                                 min_weight_fraction_leaf=0)
gb.fit(X_smoteenn, y_smoteenn)
pred_gb = gb.predict(X_test)
f1_gb = f1_score(y_test, pred_gb)

print(f1_gb)
print(classification_report(y_test, pred_gb))

0.5119774542038515
              precision    recall  f1-score   support

           0       0.94      0.90      0.92      6394
           1       0.45      0.59      0.51       921

    accuracy                           0.86      7315
   macro avg       0.69      0.74      0.71      7315
weighted avg       0.88      0.86      0.87      7315



In [34]:
values, counts = np.unique(pred_gb, return_counts=True)

for value, count in zip(values, counts):
    print(f'Value: {value}, Count: {count}')

Value: 0, Count: 6107
Value: 1, Count: 1208


## Adaboost

In [35]:
ab = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=6),
    n_estimators=100,
    learning_rate=0.1
)
ab.fit(X_smoteenn, y_smoteenn)
pred_ab = ab.predict(X_test)
f1_ab = f1_score(y_test, pred_ab)

print(f1_ab)
print(classification_report(y_test, pred_ab))

0.5055475156777617
              precision    recall  f1-score   support

           0       0.94      0.90      0.92      6394
           1       0.45      0.57      0.51       921

    accuracy                           0.86      7315
   macro avg       0.70      0.74      0.71      7315
weighted avg       0.88      0.86      0.87      7315



In [36]:
values, counts = np.unique(pred_ab, return_counts=True)

for value, count in zip(values, counts):
    print(f'Value: {value}, Count: {count}')

Value: 0, Count: 6163
Value: 1, Count: 1152


## LightGBM

In [37]:
lg = LGBMClassifier(n_estimators=300, num_leaves=30, learning_rate=0.05,
                    reg_alpha=0.6, reg_lambda=0.6,
                    subsample=0.8, colsample_bytree=0.8,
                    verbose=0)
lg.fit(X_smoteenn, y_smoteenn)
pred_lg = lg.predict(X_test)
f1_lg = f1_score(y_test, pred_lg)

print(f1_lg)
print(classification_report(y_test, pred_lg))

0.5061611374407583
              precision    recall  f1-score   support

           0       0.94      0.90      0.92      6394
           1       0.45      0.58      0.51       921

    accuracy                           0.86      7315
   macro avg       0.69      0.74      0.71      7315
weighted avg       0.88      0.86      0.87      7315



In [38]:
values, counts = np.unique(pred_lg, return_counts=True)

for value, count in zip(values, counts):
    print(f'Value: {value}, Count: {count}')

Value: 0, Count: 6126
Value: 1, Count: 1189


# Bagging / Voting

In [39]:
voting_clf = VotingClassifier(estimators=[
    ('rf', rf),
    ('gb', gb),
    ('ab', ab),
    ('lgbm',lg)
], voting='soft', weights=[2,3,3,3])

In [40]:
voting_clf.fit(X_smoteenn, y_smoteenn)
soft_pred = voting_clf.predict(X_test)

In [41]:
f1_soft = f1_score(y_test, soft_pred)

print(f1_soft)
print(classification_report(y_test, soft_pred))

0.5164938737040529
              precision    recall  f1-score   support

           0       0.94      0.90      0.92      6394
           1       0.46      0.60      0.52       921

    accuracy                           0.86      7315
   macro avg       0.70      0.75      0.72      7315
weighted avg       0.88      0.86      0.87      7315



In [42]:
values, counts = np.unique(soft_pred, return_counts=True)

for value, count in zip(values, counts):
    print(f'Value: {value}, Count: {count}')

Value: 0, Count: 6114
Value: 1, Count: 1201


# Validation

In [43]:
scores = cross_val_score(voting_clf, X, y, cv=5)

print(scores)

[0.88661062 0.89419725 0.89112159 0.88720263 0.88474159]


In [44]:
print(scores.mean())

0.8887747362811798


# Preprocessing - test.csv

In [46]:
df_test = pd.read_csv('/content/drive/MyDrive/test_원본.csv')
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8238 entries, 0 to 8237
Data columns (total 20 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              8238 non-null   int64  
 1   age             8238 non-null   int64  
 2   job             8238 non-null   object 
 3   marital         8238 non-null   object 
 4   education       8238 non-null   object 
 5   default         8238 non-null   object 
 6   housing         8238 non-null   object 
 7   loan            8238 non-null   object 
 8   contact         8238 non-null   object 
 9   month           8238 non-null   object 
 10  day_of_week     8238 non-null   object 
 11  campaign        8238 non-null   int64  
 12  pdays           8238 non-null   int64  
 13  previous        8238 non-null   int64  
 14  poutcome        8238 non-null   object 
 15  emp.var.rate    8238 non-null   float64
 16  cons.price.idx  8238 non-null   float64
 17  cons.conf.idx   8238 non-null   f

In [47]:
df_test.columns

Index(['id', 'age', 'job', 'marital', 'education', 'default', 'housing',
       'loan', 'contact', 'month', 'day_of_week', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed'],
      dtype='object')

In [48]:
df_test=df_test.replace('unknown',np.nan)

In [49]:
df_test.isna().sum()

id                   0
age                  0
job                 73
marital             18
education          343
default           1744
housing            191
loan               191
contact              0
month                0
day_of_week          0
campaign             0
pdays                0
previous             0
poutcome             0
emp.var.rate         0
cons.price.idx       0
cons.conf.idx        0
euribor3m            0
nr.employed          0
dtype: int64

In [50]:
# column selection (1)
cols = ['age', 'marital', 'job','education', 'default',
       'contact', 'month','campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate',
       'cons.price.idx','euribor3m']

In [51]:
df_test = df_test[cols]
df_test.columns


Index(['age', 'marital', 'job', 'education', 'default', 'contact', 'month',
       'campaign', 'pdays', 'previous', 'poutcome', 'emp.var.rate',
       'cons.price.idx', 'euribor3m'],
      dtype='object')

In [52]:
# Scaling
df_test['pdays'] = df_test['pdays'].replace(999,-1)


In [53]:
numericals = ['age', 'campaign', 'pdays',	'previous',
              'emp.var.rate',	'cons.price.idx',	'euribor3m']

In [54]:
df_test[numericals] = ss.transform(df_test[numericals])
df_test.describe()

Unnamed: 0,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,euribor3m
count,8238.0,8238.0,8238.0,8238.0,8238.0,8238.0,8238.0
mean,0.097308,0.019023,-0.030366,-0.040729,0.089977,0.080537,0.086116
std,1.019619,1.031204,0.915317,0.91265,0.972714,0.989731,0.975259
min,-2.138719,-0.561637,-0.185192,-0.369224,-2.068215,-2.261521,-1.591521
25%,-0.682055,-0.561637,-0.185192,-0.369224,-1.075061,-0.768352,-1.198192
50%,-0.099389,-0.190997,-0.185192,-0.369224,0.725029,-0.137941,0.784212
75%,0.77461,0.179642,-0.185192,-0.369224,0.911245,0.801696,0.842733
max,5.338824,15.005222,15.827384,9.099419,0.911245,2.122313,0.89


In [55]:
obj = ['marital', 'job','education', 'default','contact','month','poutcome']

In [56]:
df_test = pd.get_dummies(df_test, columns=obj)
df_test = pd.DataFrame(df_test)


In [57]:
# column selection (2)
df_test.columns


Index(['age', 'campaign', 'pdays', 'previous', 'emp.var.rate',
       'cons.price.idx', 'euribor3m', 'marital_divorced', 'marital_married',
       'marital_single', 'job_admin.', 'job_blue-collar', 'job_entrepreneur',
       'job_housemaid', 'job_management', 'job_retired', 'job_self-employed',
       'job_services', 'job_student', 'job_technician', 'job_unemployed',
       'education_basic.4y', 'education_basic.6y', 'education_basic.9y',
       'education_high.school', 'education_illiterate',
       'education_professional.course', 'education_university.degree',
       'default_no', 'contact_cellular', 'contact_telephone', 'month_apr',
       'month_aug', 'month_dec', 'month_jul', 'month_jun', 'month_mar',
       'month_may', 'month_nov', 'month_oct', 'month_sep', 'poutcome_failure',
       'poutcome_nonexistent', 'poutcome_success'],
      dtype='object')

In [58]:
cols = ['age', 'campaign', 'pdays', 'previous', 'emp.var.rate',
       'cons.price.idx', 'euribor3m', 'marital_single','job_retired','job_student',
       'education_illiterate', 'education_university.degree',
       'default_no', 'contact_cellular',
       'month_dec', 'month_mar','month_oct', 'month_sep', 'poutcome_success']

In [59]:
df_test = df_test[cols]
df_test.columns


Index(['age', 'campaign', 'pdays', 'previous', 'emp.var.rate',
       'cons.price.idx', 'euribor3m', 'marital_single', 'job_retired',
       'job_student', 'education_illiterate', 'education_university.degree',
       'default_no', 'contact_cellular', 'month_dec', 'month_mar', 'month_oct',
       'month_sep', 'poutcome_success'],
      dtype='object')

In [60]:
df_test.describe()

Unnamed: 0,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,euribor3m
count,8238.0,8238.0,8238.0,8238.0,8238.0,8238.0,8238.0
mean,0.097308,0.019023,-0.030366,-0.040729,0.089977,0.080537,0.086116
std,1.019619,1.031204,0.915317,0.91265,0.972714,0.989731,0.975259
min,-2.138719,-0.561637,-0.185192,-0.369224,-2.068215,-2.261521,-1.591521
25%,-0.682055,-0.561637,-0.185192,-0.369224,-1.075061,-0.768352,-1.198192
50%,-0.099389,-0.190997,-0.185192,-0.369224,0.725029,-0.137941,0.784212
75%,0.77461,0.179642,-0.185192,-0.369224,0.911245,0.801696,0.842733
max,5.338824,15.005222,15.827384,9.099419,0.911245,2.122313,0.89


# Training - test.csv

In [66]:
final_pred = voting_clf.predict(df_test)


In [77]:
values, counts = np.unique(final_pred, return_counts=True)

for value, count in zip(values, counts):
    print(f'Value: {value}, Count: {count}')

Value: 0, Count: 7121
Value: 1, Count: 1117


# Submit

In [68]:
df_submit = pd.read_csv('/content/drive/MyDrive/test.csv')
df_submit['y'] = final_pred

In [69]:
df_submit['y']

0       0
1       0
2       0
3       0
4       0
       ..
8233    0
8234    0
8235    0
8236    0
8237    1
Name: y, Length: 8238, dtype: int64

In [72]:
df_submit.groupby('y').size()

y
0    7121
1    1117
dtype: int64

In [76]:
df_submit.to_csv('/content/drive/MyDrive/example2.csv', index=False)