In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### 필요 모듈 임포트

In [2]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.metrics import f1_score, classification_report

# oversampling
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTEENN


# models
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier


# 데이터 확인

In [3]:
df_org = pd.read_csv('/content/drive/MyDrive/패턴인식/Pr_project/train.csv')
df_org.sample(10)

Unnamed: 0,id,age,job,marital,education,default,housing,loan,contact,month,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
5727,11223,37,technician,single,high.school,no,no,no,telephone,jun,...,1,999,0,nonexistent,1.4,94.465,-41.8,4.961,5228.1,no
2823,26191,45,technician,divorced,professional.course,no,no,no,cellular,nov,...,2,999,0,nonexistent,-0.1,93.2,-42.0,4.076,5195.8,no
15923,6498,27,admin.,single,university.degree,no,yes,no,telephone,may,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
10533,29045,38,technician,single,high.school,no,yes,no,cellular,apr,...,1,999,0,nonexistent,-1.8,93.075,-47.1,1.405,5099.1,no
12296,34083,36,admin.,single,university.degree,no,yes,no,cellular,may,...,3,999,0,nonexistent,-1.8,92.893,-46.2,1.281,5099.1,no
15257,13552,35,admin.,single,university.degree,unknown,no,no,cellular,jul,...,1,999,0,nonexistent,1.4,93.918,-42.7,4.963,5228.1,yes
31940,4455,31,blue-collar,unknown,unknown,unknown,no,no,telephone,may,...,2,999,0,nonexistent,1.1,93.994,-36.4,4.856,5191.0,no
29507,11025,26,technician,single,professional.course,no,no,no,telephone,jun,...,5,999,0,nonexistent,1.4,94.465,-41.8,4.962,5228.1,no
23074,17040,48,admin.,single,high.school,no,no,no,cellular,jul,...,3,999,0,nonexistent,1.4,93.918,-42.7,4.962,5228.1,no
25285,5569,45,housemaid,married,professional.course,unknown,no,no,telephone,may,...,2,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [4]:
df_org.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32950 entries, 0 to 32949
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              32950 non-null  int64  
 1   age             32950 non-null  int64  
 2   job             32950 non-null  object 
 3   marital         32950 non-null  object 
 4   education       32950 non-null  object 
 5   default         32950 non-null  object 
 6   housing         32950 non-null  object 
 7   loan            32950 non-null  object 
 8   contact         32950 non-null  object 
 9   month           32950 non-null  object 
 10  day_of_week     32950 non-null  object 
 11  campaign        32950 non-null  int64  
 12  pdays           32950 non-null  int64  
 13  previous        32950 non-null  int64  
 14  poutcome        32950 non-null  object 
 15  emp.var.rate    32950 non-null  float64
 16  cons.price.idx  32950 non-null  float64
 17  cons.conf.idx   32950 non-null 

# Preprocessing

In [5]:
df = df_org

## Drop columns (1)

In [6]:
df.columns

Index(['id', 'age', 'job', 'marital', 'education', 'default', 'housing',
       'loan', 'contact', 'month', 'day_of_week', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'],
      dtype='object')

In [7]:
cols = ['age', 'marital', 'job','education', 'default',
       'contact', 'month','campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate',
       'cons.price.idx','euribor3m','y']

In [8]:
df = df[cols]
df.columns

Index(['age', 'marital', 'job', 'education', 'default', 'contact', 'month',
       'campaign', 'pdays', 'previous', 'poutcome', 'emp.var.rate',
       'cons.price.idx', 'euribor3m', 'y'],
      dtype='object')

## Encoding

In [9]:
y_map = {
    'no':0,
    'yes':1
}

df['y'] = df['y'].map(y_map)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['y'] = df['y'].map(y_map)


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32950 entries, 0 to 32949
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             32950 non-null  int64  
 1   marital         32950 non-null  object 
 2   job             32950 non-null  object 
 3   education       32950 non-null  object 
 4   default         32950 non-null  object 
 5   contact         32950 non-null  object 
 6   month           32950 non-null  object 
 7   campaign        32950 non-null  int64  
 8   pdays           32950 non-null  int64  
 9   previous        32950 non-null  int64  
 10  poutcome        32950 non-null  object 
 11  emp.var.rate    32950 non-null  float64
 12  cons.price.idx  32950 non-null  float64
 13  euribor3m       32950 non-null  float64
 14  y               32950 non-null  int64  
dtypes: float64(3), int64(5), object(7)
memory usage: 3.8+ MB


In [11]:
obj = ['marital', 'job','education', 'default','contact','month','poutcome']

In [12]:
df = pd.get_dummies(df, columns=obj)
df = pd.DataFrame(df)

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32950 entries, 0 to 32949
Data columns (total 50 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   age                            32950 non-null  int64  
 1   campaign                       32950 non-null  int64  
 2   pdays                          32950 non-null  int64  
 3   previous                       32950 non-null  int64  
 4   emp.var.rate                   32950 non-null  float64
 5   cons.price.idx                 32950 non-null  float64
 6   euribor3m                      32950 non-null  float64
 7   y                              32950 non-null  int64  
 8   marital_divorced               32950 non-null  bool   
 9   marital_married                32950 non-null  bool   
 10  marital_single                 32950 non-null  bool   
 11  marital_unknown                32950 non-null  bool   
 12  job_admin.                     32950 non-null 

## Drop Columns (2)

In [14]:
df.columns

Index(['age', 'campaign', 'pdays', 'previous', 'emp.var.rate',
       'cons.price.idx', 'euribor3m', 'y', 'marital_divorced',
       'marital_married', 'marital_single', 'marital_unknown', 'job_admin.',
       'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
       'education_basic.4y', 'education_basic.6y', 'education_basic.9y',
       'education_high.school', 'education_illiterate',
       'education_professional.course', 'education_university.degree',
       'education_unknown', 'default_no', 'default_unknown', 'default_yes',
       'contact_cellular', 'contact_telephone', 'month_apr', 'month_aug',
       'month_dec', 'month_jul', 'month_jun', 'month_mar', 'month_may',
       'month_nov', 'month_oct', 'month_sep', 'poutcome_failure',
       'poutcome_nonexistent', 'poutcome_success'],
      dtype='object')

In [15]:
cols = ['age', 'campaign', 'pdays', 'previous', 'emp.var.rate',
       'cons.price.idx', 'euribor3m', 'y', 'marital_single','job_retired','job_student',
       'education_illiterate', 'education_university.degree',
       'default_no', 'contact_cellular',
       'month_dec', 'month_mar','month_oct', 'month_sep', 'poutcome_success']

In [16]:
df = df[cols]
df.columns

Index(['age', 'campaign', 'pdays', 'previous', 'emp.var.rate',
       'cons.price.idx', 'euribor3m', 'y', 'marital_single', 'job_retired',
       'job_student', 'education_illiterate', 'education_university.degree',
       'default_no', 'contact_cellular', 'month_dec', 'month_mar', 'month_oct',
       'month_sep', 'poutcome_success'],
      dtype='object')

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32950 entries, 0 to 32949
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   age                          32950 non-null  int64  
 1   campaign                     32950 non-null  int64  
 2   pdays                        32950 non-null  int64  
 3   previous                     32950 non-null  int64  
 4   emp.var.rate                 32950 non-null  float64
 5   cons.price.idx               32950 non-null  float64
 6   euribor3m                    32950 non-null  float64
 7   y                            32950 non-null  int64  
 8   marital_single               32950 non-null  bool   
 9   job_retired                  32950 non-null  bool   
 10  job_student                  32950 non-null  bool   
 11  education_illiterate         32950 non-null  bool   
 12  education_university.degree  32950 non-null  bool   
 13  default_no      

## Scaling

In [18]:
df['pdays'] = df['pdays'].replace(999,-1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['pdays'] = df['pdays'].replace(999,-1)


In [19]:
numericals = ['age', 'campaign', 'pdays',	'previous',	'emp.var.rate',	'cons.price.idx',	'euribor3m']

In [20]:
mm = MinMaxScaler()

df[numericals] = mm.fit_transform(df[numericals])

In [21]:
df.describe()

Unnamed: 0,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,euribor3m,y
count,32950.0,32950.0,32950.0,32950.0,32950.0,32950.0,32950.0,32950.0
mean,0.284243,0.028506,0.009274,0.024691,0.725652,0.536091,0.677514,0.112656
std,0.128417,0.050309,0.054154,0.071157,0.32749,0.225536,0.393288,0.316176
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.185185,0.0,0.0,0.0,0.333333,0.340608,0.160961,0.0
50%,0.259259,0.018182,0.0,0.0,0.9375,0.603274,0.957379,0.0
75%,0.37037,0.036364,0.0,0.0,1.0,0.698753,0.980957,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Splitting Data & Oversampling

In [22]:
X = df.drop('y', axis=1)
y = df['y']

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [24]:
smoteenn = SMOTEENN(random_state=12)
X_smoteenn, y_smoteenn = smoteenn.fit_resample(X_train, y_train)

In [25]:
X_smoteenn.describe()

Unnamed: 0,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,euribor3m
count,28925.0,28925.0,28925.0,28925.0,28925.0,28925.0,28925.0
mean,0.294151,0.023728,0.033604,0.049634,0.600065,0.509311,0.517745
std,0.152361,0.043692,0.093066,0.101534,0.355981,0.243895,0.430807
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.185006,0.0,0.0,0.0,0.333333,0.29696,0.081387
50%,0.259259,0.014382,0.0,0.0,0.479167,0.484412,0.24694
75%,0.382716,0.031465,0.0,0.025836,1.0,0.698753,0.98005
max,1.0,0.763636,1.0,0.857143,1.0,1.0,1.0


# Training

## RandomForest

In [26]:
rf = RandomForestClassifier(n_estimators=300,
                            min_samples_split=5,
                            random_state=12,
                            class_weight=None,
                            criterion='entropy',
                            max_features=None)

rf.fit(X_smoteenn, y_smoteenn)
pred_rf = rf.predict(X_test)
f1_rf = f1_score(y_test, pred_rf)

print(f1_rf)
print(classification_report(y_test, pred_rf))

0.4621643612693246
              precision    recall  f1-score   support

           0       0.94      0.90      0.92      8835
           1       0.40      0.54      0.46      1050

    accuracy                           0.87      9885
   macro avg       0.67      0.72      0.69      9885
weighted avg       0.89      0.87      0.87      9885



In [27]:
values, counts = np.unique(pred_rf, return_counts=True)

for value, count in zip(values, counts):
    print(f'Value: {value}, Count: {count}')

Value: 0, Count: 8477
Value: 1, Count: 1408


## GradientBoosting

In [28]:
gb = GradientBoostingClassifier(random_state=0,
                                 n_estimators= 92,
                                 learning_rate= 0.18,
                                 max_depth= 5,
                                 subsample= 0.84,
                                 min_weight_fraction_leaf=0)
gb.fit(X_smoteenn, y_smoteenn)
pred_gb = gb.predict(X_test)
f1_gb = f1_score(y_test, pred_gb)

print(f1_gb)
print(classification_report(y_test, pred_gb))

0.4828711256117455
              precision    recall  f1-score   support

           0       0.95      0.91      0.93      8835
           1       0.42      0.56      0.48      1050

    accuracy                           0.87      9885
   macro avg       0.68      0.74      0.70      9885
weighted avg       0.89      0.87      0.88      9885



In [29]:
values, counts = np.unique(pred_gb, return_counts=True)

for value, count in zip(values, counts):
    print(f'Value: {value}, Count: {count}')

Value: 0, Count: 8483
Value: 1, Count: 1402


## Adaboost

In [30]:
ab = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=6),
    n_estimators=100,
    learning_rate=0.1
)
ab.fit(X_smoteenn, y_smoteenn)
pred_ab = ab.predict(X_test)
f1_ab = f1_score(y_test, pred_ab)

print(f1_ab)
print(classification_report(y_test, pred_ab))

0.4799666110183639
              precision    recall  f1-score   support

           0       0.94      0.91      0.93      8835
           1       0.43      0.55      0.48      1050

    accuracy                           0.87      9885
   macro avg       0.69      0.73      0.70      9885
weighted avg       0.89      0.87      0.88      9885



In [31]:
values, counts = np.unique(pred_ab, return_counts=True)

for value, count in zip(values, counts):
    print(f'Value: {value}, Count: {count}')

Value: 0, Count: 8539
Value: 1, Count: 1346


## LightGBM

In [32]:
lg = LGBMClassifier(n_estimators=300, num_leaves=30, learning_rate=0.05,
                    reg_alpha=0.6, reg_lambda=0.6,
                    subsample=0.8, colsample_bytree=0.8,
                    verbose=0)
lg.fit(X_smoteenn, y_smoteenn)
pred_lg = lg.predict(X_test)
f1_lg = f1_score(y_test, pred_lg)

print(f1_lg)
print(classification_report(y_test, pred_lg))

0.4835895305359369
              precision    recall  f1-score   support

           0       0.95      0.91      0.93      8835
           1       0.43      0.55      0.48      1050

    accuracy                           0.87      9885
   macro avg       0.69      0.73      0.71      9885
weighted avg       0.89      0.87      0.88      9885



In [33]:
values, counts = np.unique(pred_lg, return_counts=True)

for value, count in zip(values, counts):
    print(f'Value: {value}, Count: {count}')

Value: 0, Count: 8528
Value: 1, Count: 1357


# Bagging / Voting

In [34]:
voting_clf = VotingClassifier(estimators=[
    ('rf', rf),
    ('gb', gb),
    ('ab', ab),
    ('lgbm',lg)
], voting='soft', weights=[2,3,3,3])

In [35]:
voting_clf.fit(X_smoteenn, y_smoteenn)
soft_pred = voting_clf.predict(X_test)

In [36]:
f1_soft = f1_score(y_test, soft_pred)

print(f1_soft)
print(classification_report(y_test, soft_pred))

0.4849979449239622
              precision    recall  f1-score   support

           0       0.95      0.91      0.93      8835
           1       0.43      0.56      0.48      1050

    accuracy                           0.87      9885
   macro avg       0.69      0.74      0.71      9885
weighted avg       0.89      0.87      0.88      9885



In [37]:
values, counts = np.unique(soft_pred, return_counts=True)

for value, count in zip(values, counts):
    print(f'Value: {value}, Count: {count}')

Value: 0, Count: 8502
Value: 1, Count: 1383


# Validation

In [38]:
scores = cross_val_score(voting_clf, X, y, cv=5)

print(scores)

[0.89939302 0.90242792 0.89848255 0.8983308  0.89711684]


In [39]:
print(scores.mean())

0.8991502276176024


# Preprocessing - test.csv

In [57]:
df_test = pd.read_csv('/content/drive/MyDrive/패턴인식/Pr_project/test.csv')
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8238 entries, 0 to 8237
Data columns (total 20 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              8238 non-null   int64  
 1   age             8238 non-null   int64  
 2   job             8238 non-null   object 
 3   marital         8238 non-null   object 
 4   education       8238 non-null   object 
 5   default         8238 non-null   object 
 6   housing         8238 non-null   object 
 7   loan            8238 non-null   object 
 8   contact         8238 non-null   object 
 9   month           8238 non-null   object 
 10  day_of_week     8238 non-null   object 
 11  campaign        8238 non-null   int64  
 12  pdays           8238 non-null   int64  
 13  previous        8238 non-null   int64  
 14  poutcome        8238 non-null   object 
 15  emp.var.rate    8238 non-null   float64
 16  cons.price.idx  8238 non-null   float64
 17  cons.conf.idx   8238 non-null   f

In [58]:
df_test.columns

Index(['id', 'age', 'job', 'marital', 'education', 'default', 'housing',
       'loan', 'contact', 'month', 'day_of_week', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed'],
      dtype='object')

In [59]:
# column selection (1)
cols = ['age', 'marital', 'job','education', 'default',
       'contact', 'month','campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate',
       'cons.price.idx','euribor3m']

In [60]:
df_test = df_test[cols]
df_test.columns


Index(['age', 'marital', 'job', 'education', 'default', 'contact', 'month',
       'campaign', 'pdays', 'previous', 'poutcome', 'emp.var.rate',
       'cons.price.idx', 'euribor3m'],
      dtype='object')

In [61]:
# Scaling
df_test['pdays'] = df_test['pdays'].replace(999,-1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['pdays'] = df_test['pdays'].replace(999,-1)


In [62]:
# scaling - StandardScaler
numericals = ['age', 'campaign', 'pdays',	'previous',
              'emp.var.rate',	'cons.price.idx',	'euribor3m']

In [63]:
df_test[numericals] = mm.transform(df_test[numericals])
df_test.describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[numericals] = mm.transform(df_test[numericals])


Unnamed: 0,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,euribor3m
count,8238.0,8238.0,8238.0,8238.0,8238.0,8238.0,8238.0
mean,0.284265,0.028484,0.008978,0.024781,0.724357,0.534249,0.676126
std,0.129624,0.050586,0.053079,0.068848,0.326473,0.225768,0.39292
min,0.0,0.0,0.0,0.0,0.0,0.0,0.000227
25%,0.185185,0.0,0.0,0.0,0.333333,0.340608,0.158694
50%,0.259259,0.018182,0.0,0.0,0.9375,0.484412,0.957379
75%,0.37037,0.036364,0.0,0.0,1.0,0.698753,0.980957
max,0.950617,0.763636,0.928571,0.714286,1.0,1.0,1.0


In [64]:
obj = ['marital', 'job','education', 'default','contact','month','poutcome']

In [65]:
df_test = pd.get_dummies(df_test, columns=obj)
df_test = pd.DataFrame(df_test)


In [66]:
# column selection (2)
df_test.columns


Index(['age', 'campaign', 'pdays', 'previous', 'emp.var.rate',
       'cons.price.idx', 'euribor3m', 'marital_divorced', 'marital_married',
       'marital_single', 'marital_unknown', 'job_admin.', 'job_blue-collar',
       'job_entrepreneur', 'job_housemaid', 'job_management', 'job_retired',
       'job_self-employed', 'job_services', 'job_student', 'job_technician',
       'job_unemployed', 'job_unknown', 'education_basic.4y',
       'education_basic.6y', 'education_basic.9y', 'education_high.school',
       'education_illiterate', 'education_professional.course',
       'education_university.degree', 'education_unknown', 'default_no',
       'default_unknown', 'contact_cellular', 'contact_telephone', 'month_apr',
       'month_aug', 'month_dec', 'month_jul', 'month_jun', 'month_mar',
       'month_may', 'month_nov', 'month_oct', 'month_sep', 'poutcome_failure',
       'poutcome_nonexistent', 'poutcome_success'],
      dtype='object')

In [67]:
cols = ['age', 'campaign', 'pdays', 'previous', 'emp.var.rate',
       'cons.price.idx', 'euribor3m', 'marital_single','job_retired','job_student',
       'education_illiterate', 'education_university.degree',
       'default_no', 'contact_cellular',
       'month_dec', 'month_mar','month_oct', 'month_sep', 'poutcome_success']

In [68]:
df_test = df_test[cols]
df_test.columns


Index(['age', 'campaign', 'pdays', 'previous', 'emp.var.rate',
       'cons.price.idx', 'euribor3m', 'marital_single', 'job_retired',
       'job_student', 'education_illiterate', 'education_university.degree',
       'default_no', 'contact_cellular', 'month_dec', 'month_mar', 'month_oct',
       'month_sep', 'poutcome_success'],
      dtype='object')

In [69]:
df_test.describe()

Unnamed: 0,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,euribor3m
count,8238.0,8238.0,8238.0,8238.0,8238.0,8238.0,8238.0
mean,0.284265,0.028484,0.008978,0.024781,0.724357,0.534249,0.676126
std,0.129624,0.050586,0.053079,0.068848,0.326473,0.225768,0.39292
min,0.0,0.0,0.0,0.0,0.0,0.0,0.000227
25%,0.185185,0.0,0.0,0.0,0.333333,0.340608,0.158694
50%,0.259259,0.018182,0.0,0.0,0.9375,0.484412,0.957379
75%,0.37037,0.036364,0.0,0.0,1.0,0.698753,0.980957
max,0.950617,0.763636,0.928571,0.714286,1.0,1.0,1.0


# Training - test.csv

In [77]:
df_test.describe()

Unnamed: 0,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,euribor3m
count,8238.0,8238.0,8238.0,8238.0,8238.0,8238.0,8238.0
mean,0.284265,0.028484,0.008978,0.024781,0.724357,0.534249,0.676126
std,0.129624,0.050586,0.053079,0.068848,0.326473,0.225768,0.39292
min,0.0,0.0,0.0,0.0,0.0,0.0,0.000227
25%,0.185185,0.0,0.0,0.0,0.333333,0.340608,0.158694
50%,0.259259,0.018182,0.0,0.0,0.9375,0.484412,0.957379
75%,0.37037,0.036364,0.0,0.0,1.0,0.698753,0.980957
max,0.950617,0.763636,0.928571,0.714286,1.0,1.0,1.0


In [70]:
test_pred = voting_clf.predict(df_test)


In [71]:
values, counts = np.unique(test_pred, return_counts=True)

for value, count in zip(values, counts):
    print(f'Value: {value}, Count: {count}')

Value: 0, Count: 7083
Value: 1, Count: 1155


# Submit

In [72]:
df_submit = pd.read_csv('/content/drive/MyDrive/패턴인식/Pr_project/test.csv')
df_submit['y_predict'] = test_pred

In [74]:
df_submit.groupby('y_predict').size()

y_predict
0    7083
1    1155
dtype: int64

In [78]:
df_submit.to_csv('/content/drive/MyDrive/패턴인식/Pr_project/prediction.csv')