In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [12]:
df_train = pd.read_csv('./wine_train.csv', index_col=0)
df_test = pd.read_csv('./wine_test.csv', index_col=0)
df_train.head()

Unnamed: 0_level_0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,8.0,0.5,0.39,2.2,0.073,30.0,39.0,0.99572,3.33,0.77,12.1,6
1,9.3,0.3,0.73,2.3,0.092,30.0,67.0,0.99854,3.32,0.67,12.8,6
2,7.1,0.51,0.03,2.1,0.059,3.0,12.0,0.9966,3.52,0.73,11.3,7
3,8.1,0.87,0.22,2.6,0.084,11.0,65.0,0.9973,3.2,0.53,9.8,5
4,8.5,0.36,0.3,2.3,0.079,10.0,45.0,0.99444,3.2,1.36,9.5,6


In [4]:
df_train.isnull().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [5]:
df_train['quality'].unique()

array([6, 7, 5, 4, 8, 3])

In [6]:
X_train = df_train.drop('quality', axis=1)
y_train = df_train['quality']

In [7]:
X_test = df_test

In [8]:
pipe1 = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', SVC(probability=True, random_state=42))
])
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

### SVM with Standard Scaler and linear Kernel

In [None]:
params1 = {
    'scaler': [StandardScaler()],
    'clf__C': np.linspace(0.001, 5, 10),
    'clf__kernel': ['linear'],
    'clf__decision_function_shape': ['ovo', 'ovr']
}
gcv1 = GridSearchCV(pipe1, params1, cv=kfold, scoring='neg_log_loss')
gcv1.fit(X_train, y_train)

print(f"Best parameters for SVM in linear on Y: {gcv1.best_params_}")
print(f"Best score for SVM in linear on Y: {gcv1.best_score_}")

Best parameters for SVM without scaling in linear on Y: {'clf__C': 2.778222222222222, 'clf__decision_function_shape': 'ovo', 'clf__kernel': 'linear', 'scaler': StandardScaler()}
Best score for SVM without scaling in linear on Y: -1.0559353409491075


In [None]:
bm1 = gcv1.best_estimator_
y_pred1 = bm1.predict(X_test)
df1 = pd.DataFrame({
    'Id': X_test.index,
    'quality': y_pred1
})
df1.to_csv('submission.csv', index=False)

### SVM with MinMax Scaler and linear Kernel

In [14]:
params2 = {
    'scaler': [MinMaxScaler()],
    'clf__C': np.linspace(0.001, 5, 10),
    'clf__kernel': ['linear'],
    'clf__decision_function_shape': ['ovo', 'ovr']
}
gcv2 = GridSearchCV(pipe1, params2, cv=kfold, scoring='neg_log_loss')
gcv2.fit(X_train, y_train)

print(f"Best parameters for SVM in linear with MinMax scaler on Y: {gcv2.best_params_}")
print(f"Best score for SVM in linear with MinMax scaler on Y: {gcv2.best_score_}")

Best parameters for SVM in linear with MinMax scaler on Y: {'clf__C': 2.2227777777777775, 'clf__decision_function_shape': 'ovo', 'clf__kernel': 'linear', 'scaler': MinMaxScaler()}
Best score for SVM in linear with MinMax scaler on Y: -1.0549571476474262


In [15]:
bm2 = gcv2.best_estimator_
y_pred2 = bm2.predict(X_test)
df2 = pd.DataFrame({
    'Id': X_test.index,
    'quality': y_pred2
})
df2.to_csv('submission2.csv', index=False)

### SVM with Standard Scaler and RBF Kernel

In [9]:
params3 = {
    'scaler': [StandardScaler()],
    'clf__C': np.linspace(0.001, 5, 10),
    'clf__gamma': list(np.linspace(0.001, 5, 10)) + ['scale', 'auto'],
    'clf__decision_function_shape': ['ovo', 'ovr']
}
gcv3 = GridSearchCV(pipe1, params3, cv=kfold, scoring='neg_log_loss')
gcv3.fit(X_train, y_train)

print(f"Best parameters for SVM in RBF with Satndard scaler on Y: {gcv3.best_params_}")
print(f"Best score for SVM in RBF with Satndard scaler on Y: {gcv3.best_score_}")

Best parameters for SVM in RBF with Satndard scaler on Y: {'clf__C': 0.5564444444444444, 'clf__decision_function_shape': 'ovo', 'clf__gamma': 'scale', 'scaler': StandardScaler()}
Best score for SVM in RBF with Satndard scaler on Y: -1.0425974678103647


In [13]:
bm3 = gcv3.best_estimator_
y_pred3 = bm3.predict(X_test)
df3 = pd.DataFrame({
    'Id': X_test.index,
    'quality': y_pred3
})
df3.to_csv('submission3.csv', index=False)

### SVM with MinMax Scaler and RBF Kernel

In [16]:
params4 = {
    'scaler': [MinMaxScaler()],
    'clf__C': np.linspace(0.001, 5, 10),
    'clf__gamma': list(np.linspace(0.001, 5, 10)) + ['scale', 'auto'],
    'clf__decision_function_shape': ['ovo', 'ovr']
}
gcv4 = GridSearchCV(pipe1, params4, cv=kfold, scoring='neg_log_loss')
gcv4.fit(X_train, y_train)

print(f"Best parameters for SVM in RBF with MinMax scaler on Y: {gcv4.best_params_}")
print(f"Best score for SVM in RBF with MinMax scaler on Y: {gcv4.best_score_}")

Best parameters for SVM in RBF with MinMax scaler on Y: {'clf__C': 5.0, 'clf__decision_function_shape': 'ovo', 'clf__gamma': 0.5564444444444444, 'scaler': MinMaxScaler()}
Best score for SVM in RBF with MinMax scaler on Y: -1.035477761822175


In [17]:
bm4 = gcv4.best_estimator_
y_pred4 = bm4.predict(X_test)
df4 = pd.DataFrame({
    'Id': X_test.index,
    'quality': y_pred4
})
df4.to_csv('submission4.csv', index=False)

Leaderboard Score:-

1. Linear SVM with Standard Scaler - 0.43906
2. Linear SVM with MinMax Scaler - 0.43653
3. RBF SVM with Standard Scaler - 0.50007
4. RBF SVM with MinMax Scaler - 0.47177