In [1]:
import os
import sys

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

In [123]:
data_path = ''

#kaggle
if (os.environ.get('KAGGLE_KERNEL_RUN_TYPE','')) or (os.environ.get('KAGGLE_KERNEL_RUN_TYPE','') == 'Interactive') or (os.environ.get('KAGGLE_KERNEL_RUN_TYPE','') == 'Batch'):
    print("Kaggle")  
    data_path = '/kaggle/input'    
    
#google colab
elif 'google.colab' in sys.modules:
    print('Google Colab')

    from google.colab import drive
    
    drive = drive.mount('/content/drive/')
    data_path = f'/content/drive/My Drive/Colab_Data/kaggle/playground-series-s3e5'
    
#localhost
elif os.environ.get('KAGGLE_KERNEL_RUN_TYPE','') == '':
    print('Localhost')
    data_path = './data'
    

Localhost


In [124]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk(data_path):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

./data/y_baseline_output.csv
./data/simple_lr_output.csv
./data/test.csv
./data/simple_lr_scaled_output.csv
./data/linear_svc_scaling_output.csv
./data/simple_lr_no_scaling_output.csv
./data/train.csv
./data/sample_submission.csv


In [4]:
train_df = pd.read_csv(f'{data_path}/train.csv')#, index_col='Id')
test_df = pd.read_csv(f'{data_path}/test.csv')#, index_col='Id')

train_df.shape, test_df.shape

((2056, 13), (1372, 12))

In [5]:
train_df.dtypes

Id                        int64
fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                   int64
dtype: object

In [6]:
test_df

Unnamed: 0,Id,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,2056,7.2,0.510,0.01,2.0,0.077,31.0,54.0,0.99748,3.39,0.59,9.8
1,2057,7.2,0.755,0.15,2.0,0.102,14.0,35.0,0.99586,3.33,0.68,10.0
2,2058,8.4,0.460,0.40,2.0,0.065,21.0,50.0,0.99774,3.08,0.65,9.5
3,2059,8.0,0.470,0.40,1.8,0.056,14.0,25.0,0.99480,3.30,0.65,11.7
4,2060,6.5,0.340,0.32,2.1,0.044,8.0,94.0,0.99356,3.23,0.48,12.8
...,...,...,...,...,...,...,...,...,...,...,...,...
1367,3423,8.8,0.745,0.18,2.7,0.084,41.0,115.0,0.99823,3.38,0.70,9.8
1368,3424,15.6,0.240,0.55,2.9,0.062,11.0,25.0,0.99724,2.99,0.77,10.1
1369,3425,7.3,0.760,0.00,2.2,0.095,6.0,19.0,0.99880,3.67,0.60,9.4
1370,3426,7.6,0.780,0.26,2.6,0.118,17.0,104.0,0.99616,3.30,0.53,9.9


In [7]:
train_df.describe()

Unnamed: 0,Id,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,2056.0,2056.0,2056.0,2056.0,2056.0,2056.0,2056.0,2056.0,2056.0,2056.0,2056.0,2056.0,2056.0
mean,1027.5,8.365175,0.527601,0.265058,2.398881,0.081856,16.955982,49.236868,0.996748,3.310569,0.641308,10.414972,5.720817
std,593.660397,1.70539,0.173164,0.188267,0.858824,0.023729,10.00971,32.961141,0.001827,0.142321,0.137942,1.028825,0.853146
min,0.0,5.0,0.18,0.0,1.2,0.012,1.0,7.0,0.99007,2.74,0.39,8.7,3.0
25%,513.75,7.2,0.39,0.09,1.9,0.071,8.0,22.0,0.9956,3.2,0.55,9.5,5.0
50%,1027.5,7.95,0.52,0.25,2.2,0.079,16.0,44.0,0.9967,3.31,0.61,10.1,6.0
75%,1541.25,9.2,0.64,0.42,2.6,0.09,24.0,65.0,0.9978,3.39,0.72,11.0,6.0
max,2055.0,15.9,1.58,0.76,14.0,0.414,68.0,289.0,1.00369,3.78,1.95,14.0,8.0


In [8]:
from sklearn.metrics import cohen_kappa_score



In [106]:
# split into train and validation sets
X_train = train_df.copy().drop(columns=['Id', 'quality'])
y_train = train_df.copy()['quality']

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

X_train = X_train.reset_index(drop=True)
X_val = X_val.reset_index(drop=True)
X_test = test_df.copy().drop(columns=['Id'])

In [103]:
X_train

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,11.9,0.685,0.49,2.6,0.085,5.0,19.0,0.99820,3.02,0.49,10.1
1,7.6,0.560,0.28,2.4,0.077,16.0,86.0,0.99690,3.22,0.64,9.8
2,9.9,0.450,0.24,1.7,0.081,37.5,53.0,0.99692,3.19,0.44,9.5
3,6.8,0.640,0.21,2.1,0.093,9.0,32.0,0.99745,3.68,0.73,9.5
4,7.8,0.690,0.06,1.8,0.078,31.0,38.0,0.99504,3.39,0.93,11.9
...,...,...,...,...,...,...,...,...,...,...,...
1639,6.6,0.340,0.29,2.4,0.098,15.0,33.0,0.99408,3.44,0.62,11.9
1640,7.1,0.510,0.17,2.1,0.077,25.0,85.0,0.99524,3.46,0.59,10.1
1641,6.9,0.360,0.31,1.9,0.074,26.0,65.0,0.99387,3.35,0.82,11.9
1642,11.3,0.180,0.40,3.4,0.066,25.0,60.0,0.99860,3.28,0.71,11.8


In [104]:
X_val

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.7,0.60,0.00,1.7,0.089,16.0,30.0,0.99538,3.36,0.44,10.0
1,7.0,0.69,0.10,3.0,0.088,33.0,76.0,0.99842,3.44,0.60,9.9
2,7.1,0.58,0.02,2.1,0.063,17.0,53.0,0.99572,3.52,0.72,10.9
3,7.2,0.48,0.30,1.9,0.068,20.0,110.0,0.99900,3.30,1.15,9.0
4,7.5,0.66,0.00,2.6,0.123,6.0,15.0,0.99631,3.38,0.53,10.9
...,...,...,...,...,...,...,...,...,...,...,...
407,9.0,0.42,0.30,2.1,0.076,26.0,42.0,0.99840,3.38,0.69,10.8
408,11.5,0.38,0.56,2.6,0.087,6.0,24.0,0.99940,3.14,0.62,10.7
409,7.6,0.36,0.33,2.1,0.078,27.0,66.0,0.99745,3.40,0.61,9.5
410,8.3,0.70,0.24,2.2,0.100,5.0,15.0,0.99760,3.12,0.49,9.3


In [107]:
X_test

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.2,0.510,0.01,2.0,0.077,31.0,54.0,0.99748,3.39,0.59,9.8
1,7.2,0.755,0.15,2.0,0.102,14.0,35.0,0.99586,3.33,0.68,10.0
2,8.4,0.460,0.40,2.0,0.065,21.0,50.0,0.99774,3.08,0.65,9.5
3,8.0,0.470,0.40,1.8,0.056,14.0,25.0,0.99480,3.30,0.65,11.7
4,6.5,0.340,0.32,2.1,0.044,8.0,94.0,0.99356,3.23,0.48,12.8
...,...,...,...,...,...,...,...,...,...,...,...
1367,8.8,0.745,0.18,2.7,0.084,41.0,115.0,0.99823,3.38,0.70,9.8
1368,15.6,0.240,0.55,2.9,0.062,11.0,25.0,0.99724,2.99,0.77,10.1
1369,7.3,0.760,0.00,2.2,0.095,6.0,19.0,0.99880,3.67,0.60,9.4
1370,7.6,0.780,0.26,2.6,0.118,17.0,104.0,0.99616,3.30,0.53,9.9


## Baseline Model

In [108]:
#assume mean train price for all MedHouseVal
train_mean_quality = train_df['quality'].mean()
train_mean_quality

y_pred_train_baseline = np.full((X_train.shape[0], 1), train_mean_quality, dtype=int)
y_pred_val_baseline = np.full((X_val.shape[0], 1), train_mean_quality, dtype=int)
y_pred_test_baseline = np.full((test_df.shape[0], 1), train_mean_quality, dtype=int)

y_pred_test_baseline.shape, y_pred_val_baseline.shape

((1372, 1), (412, 1))

In [109]:
print(f'y_val:{type(y_val)}::{y_val.shape}, y_pred_val_baseline:{type(y_pred_val_baseline)}::{y_pred_val_baseline.shape}')
val_baseline_score = cohen_kappa_score(y_val, y_pred_val_baseline)

print(f'val_baseline_score: {val_baseline_score}')

y_val:<class 'pandas.core.series.Series'>::(412,), y_pred_val_baseline:<class 'numpy.ndarray'>::(412, 1)
val_baseline_score: 0.0


In [110]:
#build the baseline submission for test set to get score
y_baseline_output = pd.DataFrame({'Id':test_df['Id'], 'quality':y_pred_test_baseline[:,0]})
y_baseline_output
y_baseline_output.to_csv(f'{data_path}/y_baseline_output.csv', index=False)
# Score: 0

## EDA

In [112]:
# train_df.columns

In [34]:
# plt.boxplot(train_df['fixed acidity']);

In [35]:
# plt.boxplot(train_df['free sulfur dioxide']);

In [36]:
# plt.boxplot(train_df['chlorides']);

In [37]:
# train_df.hist(bins=50, figsize=(15,15));

In [113]:
# scaler = RobustScaler()
scaler = StandardScaler()
# scaler = MinMaxScaler()

X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_val_scaled = pd.DataFrame(scaler.transform(X_val), columns=X_val.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

# X_test_scaled.hist(bins=50, figsize=(15,15));


In [114]:
X_train.shape, X_test.shape

((1644, 11), (1372, 11))

## Basic LogisticRegression

In [115]:
# y_val, val_lr_pred_scaled

In [116]:
simple_lr = LogisticRegression(max_iter=10000)
simple_lr.fit(X_train, y_train)

val_lr_pred = simple_lr.predict(X_val)
val_lr_score = cohen_kappa_score(y_val, val_lr_pred)
print(f'val_lr_score (not scaled): {val_lr_score}')

#------------

simple_lr = LogisticRegression(max_iter=10000)
simple_lr.fit(X_train_scaled, y_train)

val_lr_pred_scaled = simple_lr.predict(X_val_scaled)
val_lr_score_scaled = cohen_kappa_score(y_val, val_lr_pred_scaled)
print(f'val_lr_score_scaled (scaled): {val_lr_score_scaled}')


val_lr_score (not scaled): 0.3049476555104642
val_lr_score_scaled (scaled): 0.30220458883856705


In [117]:
#build the simple logistic regression submission for test set to get score

y_test_simple_lr_pred = simple_lr.predict(X_test)
y_test_simple_lr_output = pd.DataFrame({'Id':test_df['Id'], 'quality':y_test_simple_lr_pred})

y_test_simple_lr_output.to_csv(f'{data_path}/simple_lr_no_scaling_output.csv', index=False)
# y_test_simple_lr_output
# Score: 0.53066

In [118]:
simple_lr = LogisticRegression(max_iter=10000)
simple_lr.fit(X_train_scaled, y_train)

y_test_simple_lr_pred = simple_lr.predict(X_test_scaled)
y_test_simple_lr_output = pd.DataFrame({'Id':test_df['Id'], 'quality':y_test_simple_lr_pred})

y_test_simple_lr_output.to_csv(f'{data_path}/simple_lr_scaled_output.csv', index=False)
# y_test_simple_lr_output

#RobustScaler Score: 0.5712
#StandardScaler Score: 0.57352
#MinMaxScaler Score: ??

## LinearSVC

In [122]:
from sklearn.svm import LinearSVC

linearsvc = LinearSVC(max_iter=10000)
linearsvc.fit(X_train, y_train)

linearsvc_pred = simple_lr.predict(X_val)
linearsvc_score = cohen_kappa_score(y_val, linearsvc_pred)
print(f'linearsvc_score (not scaled): {linearsvc_score}')

#------------

linearsvc = LinearSVC(max_iter=10000)
linearsvc.fit(X_train_scaled, y_train)

linearsvc_pred = simple_lr.predict(X_val_scaled)
linearsvc_score = cohen_kappa_score(y_val, linearsvc_pred)
print(f'linearsvc_score (scaled): {linearsvc_score}')



linearsvc_score (not scaled): 0.101348958501014
linearsvc_score (scaled): 0.30220458883856705


In [120]:
#build the simple logistic regression submission for test set to get score

linearsvc = LinearSVC(max_iter=10000)
linearsvc.fit(X_train_scaled, y_train)

y_test_linearsvc_pred = linearsvc.predict(X_test_scaled)
y_test_linearsvc_output = pd.DataFrame({'Id':test_df['Id'], 'quality':y_test_linearsvc_pred})

y_test_linearsvc_output.to_csv(f'{data_path}/linear_svc_scaling_output.csv', index=False)
# Score: Score: 

## KNN

In [121]:
from sklearn.neighbors import KNeighborsClassifier

n_neighbors = 60
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)

knn_pred = knn.predict(X_val)
knn_score = cohen_kappa_score(y_val, knn_pred)
print(f'knn_score (not scaled): {knn_score}')

#------------

knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train_scaled, y_train)

knn_pred = knn.predict(X_val_scaled)
knn_score = cohen_kappa_score(y_val, knn_pred)
print(f'knn_score (scaled): {knn_score}')

knn_score (not scaled): 0.17282586768364738
knn_score (scaled): 0.31042746421948564
