# Benchmark for a Kaggle competition

In [1]:
import numpy as np
import pandas as pd

Read train dataset:

In [2]:
# Load the labelled training set; comma is already read_csv's default delimiter.
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,target,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,feature11,feature12,feature13
0,1,0.0,3.51,0.04,0.0,2.39,203.0,7.95,23.71,124.0,143,148,1,2
1,0,0.27,8.18,3.27,6.0,4.17,108.0,10.24,14.84,83.42,135,203,1,1
2,0,0.34,5.32,0.07,2.0,4.06,77.0,24.2,15.8,66.29,115,35,3,2
3,1,,2.77,2.2,7.0,3.34,134.0,7.23,9.56,92.35,169,181,3,3
4,0,0.62,3.97,2.92,7.0,5.3,25.0,17.8,11.71,33.09,168,51,2,2


Note that some columns do have missing values:

In [3]:
# Per-column dtypes and non-null counts; a count below the number of rows marks missing values.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   target     1000 non-null   int64  
 1   feature1   981 non-null    float64
 2   feature2   1000 non-null   float64
 3   feature3   982 non-null    float64
 4   feature4   978 non-null    float64
 5   feature5   983 non-null    float64
 6   feature6   893 non-null    float64
 7   feature7   1000 non-null   float64
 8   feature8   1000 non-null   float64
 9   feature9   1000 non-null   float64
 10  feature10  1000 non-null   int64  
 11  feature11  1000 non-null   int64  
 12  feature12  1000 non-null   int64  
 13  feature13  1000 non-null   int64  
dtypes: float64(9), int64(5)
memory usage: 109.5 KB


Extract target column:

In [4]:
# Separate predictors from the label. `drop` returns a new frame, so `train` itself is left untouched.
X = train.drop('target', axis=1)
y = train['target']
X.head()

Unnamed: 0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,feature11,feature12,feature13
0,0.0,3.51,0.04,0.0,2.39,203.0,7.95,23.71,124.0,143,148,1,2
1,0.27,8.18,3.27,6.0,4.17,108.0,10.24,14.84,83.42,135,203,1,1
2,0.34,5.32,0.07,2.0,4.06,77.0,24.2,15.8,66.29,115,35,3,2
3,,2.77,2.2,7.0,3.34,134.0,7.23,9.56,92.35,169,181,3,3
4,0.62,3.97,2.92,7.0,5.3,25.0,17.8,11.71,33.09,168,51,2,2


Fill missing values by forward fill (each gap takes the value from the previous row):

In [5]:
# Forward fill: every missing entry takes the value from the row above it.
# Rebinding instead of `inplace=True` keeps the cell free of in-place mutation
# and matches how X_test is filled further down.
X = X.ffill()

# ffill cannot fill a gap in the very first row, so make sure nothing is left over.
assert not X.isna().any().any(), "missing values remain after forward fill"

X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   feature1   1000 non-null   float64
 1   feature2   1000 non-null   float64
 2   feature3   1000 non-null   float64
 3   feature4   1000 non-null   float64
 4   feature5   1000 non-null   float64
 5   feature6   1000 non-null   float64
 6   feature7   1000 non-null   float64
 7   feature8   1000 non-null   float64
 8   feature9   1000 non-null   float64
 9   feature10  1000 non-null   int64  
 10  feature11  1000 non-null   int64  
 11  feature12  1000 non-null   int64  
 12  feature13  1000 non-null   int64  
dtypes: float64(9), int64(4)
memory usage: 101.7 KB


All columns are numeric:

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for every feature column.
X.describe()

Unnamed: 0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,feature11,feature12,feature13
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.49763,5.08742,2.4737,3.492,4.97974,110.16,14.74218,14.86288,80.50638,112.374,109.954,1.703,2.105
std,0.290866,2.834132,1.466324,2.291984,1.479153,58.599456,5.720165,5.843719,29.346648,57.286612,58.673883,0.785753,0.711675
min,0.0,0.0,0.0,0.0,2.06,10.0,5.04,5.05,25.77,10.0,10.0,1.0,1.0
25%,0.24,2.665,1.1675,1.0,3.6875,59.75,9.5775,9.775,55.6025,65.0,60.0,1.0,2.0
50%,0.5,5.13,2.475,3.0,4.97,111.0,14.825,14.76,81.915,113.0,109.0,1.0,2.0
75%,0.74,7.51,3.6925,5.0,6.2325,161.0,19.5725,19.99,105.9875,162.0,161.0,2.0,3.0
max,1.0,9.99,5.0,7.0,7.94,207.0,24.99,24.97,133.44,209.0,209.0,3.0,3.0


Write a function to evaluate a model via cross-validation:

In [7]:
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold

def eval_classifier(clf):
    """Score `clf` with repeated stratified k-fold cross-validation.

    The classifier is evaluated on the notebook-level `X` and `y` using
    5 folds repeated 3 times with a fixed seed (43), scored by accuracy.

    Parameters
    ----------
    clf : estimator
        scikit-learn classifier handed to `cross_val_score`.

    Returns
    -------
    tuple
        (mean, standard deviation) of the accuracy scores over all folds.
    """
    splitter = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=43)
    fold_scores = cross_val_score(
        clf,
        X,
        y,
        scoring='accuracy',
        cv=splitter,
        n_jobs=-1,            # run folds on all available cores
        error_score='raise',  # fail loudly if a fit errors out
    )
    return fold_scores.mean(), fold_scores.std()

Try three models: logistic regression, decision tree, and support vector machine.

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

In [9]:
# NOTE(review): max_iter is raised to 2000, presumably so the solver converges
# on these unscaled features — confirm.
log_reg = LogisticRegression(max_iter=2000)
acc_mean, acc_std = eval_classifier(log_reg)

print(f"Logistic regression: mean accuracy = {acc_mean:.5f}, std = {acc_std:.3f}")

Logistic regression: mean accuracy = 0.72667, std = 0.027


In [10]:
# Seed the tree: scikit-learn permutes features at every split, so an unseeded
# tree can give a different score on each run even though the CV splitter is seeded.
acc_mean, acc_std = eval_classifier(DecisionTreeClassifier(random_state=43))

print(f"Decision tree: mean accuracy = {acc_mean:.5f}, std = {acc_std:.3f}")

Decision tree: mean accuracy = 0.84000, std = 0.030


In [11]:
# Accuracy scoring only needs `predict`, so `probability=True` is dropped: it
# triggers an extra internal cross-validated calibration fit without changing
# the predicted labels.
# NOTE(review): the features span very different ranges (see X.describe()), and
# the SVM is fitted on them unscaled — that may explain the low score; confirm.
acc_mean, acc_std = eval_classifier(SVC())

print(f"SVM: mean accuracy = {acc_mean:.5f}, std = {acc_std:.3f}")

SVM: mean accuracy = 0.54700, std = 0.013


# Test

As expected, the test set does not contain the target column:

In [12]:
# Load the unlabelled test set; comma is already read_csv's default delimiter.
test = pd.read_csv('test.csv')
test.head()

Unnamed: 0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,feature11,feature12,feature13,Id
0,0.42,8.27,0.35,1.0,4.57,22.0,20.74,5.25,39.23,25,39,2,1,0
1,0.65,4.13,2.84,6.0,5.71,149.0,15.86,9.32,102.48,84,150,2,1,1
2,0.44,1.95,1.08,3.0,4.78,95.0,14.55,19.81,74.24,206,64,1,2,2
3,0.71,1.31,2.51,7.0,5.58,158.0,23.06,8.44,103.27,209,162,2,2,3
4,0.38,5.66,3.85,,4.73,91.0,13.9,7.74,66.64,124,44,2,3,4


Fill missing values in test dataset as well:

In [13]:
# Drop the identifier column (not a feature) and forward-fill gaps, the same
# fill strategy used for the training features.
X_test = test.drop(columns=['Id']).ffill()
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   feature1   1000 non-null   float64
 1   feature2   1000 non-null   float64
 2   feature3   1000 non-null   float64
 3   feature4   1000 non-null   float64
 4   feature5   1000 non-null   float64
 5   feature6   1000 non-null   float64
 6   feature7   1000 non-null   float64
 7   feature8   1000 non-null   float64
 8   feature9   1000 non-null   float64
 9   feature10  1000 non-null   int64  
 10  feature11  1000 non-null   int64  
 11  feature12  1000 non-null   int64  
 12  feature13  1000 non-null   int64  
dtypes: float64(9), int64(4)
memory usage: 101.7 KB


Fit a decision tree to train data and make predictions on test:

In [18]:
# Seed the tree so the submitted predictions are reproducible: without a fixed
# random_state, refitting can produce a different tree on every run.
model = DecisionTreeClassifier(random_state=43)

# Fit on the full training set, then predict labels for the test features.
model.fit(X, y)

ans = model.predict(X_test)

ans

array([1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0,

Load the sample submission file and replace its target column with our predictions:

In [16]:
sample = pd.read_csv('sample.csv', sep=',')

# `ans` is in test-row order and is assigned positionally, so the sample file
# must list the same Ids in the same order; otherwise labels would silently
# attach to the wrong rows.
if not np.array_equal(sample['Id'].to_numpy(), test['Id'].to_numpy()):
    raise ValueError("sample.csv Ids do not match test.csv Ids row for row")

sample['target'] = ans
sample.head(10)

Unnamed: 0,target,Id
0,0,0
1,0,1
2,0,2
3,1,3
4,0,4
5,1,5
6,0,6
7,1,7
8,0,8
9,0,9


Save to a `csv` file that can be submitted to Kaggle:

In [17]:
# The predictions come from the decision tree fitted above, so name the file
# after that model (the previous name, 'SVM_benchmark.csv', mislabelled it).
# index=False keeps the DataFrame index out of the submission file.
sample.to_csv('decision_tree_benchmark.csv', index=False)