In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.svm import SVC
from sklearn.svm import SVR
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,mean_squared_error
import warnings
warnings.filterwarnings('ignore')

In [38]:
BASE = 'D:\AI1606\Ky_5\AIL302m\Kaggle\Data\\'
DIR_RESULT = 'D:\AI1606\Ky_5\AIL302m\Kaggle\extraTree\\'

## Read data

In [39]:
# read data
train = pd.read_csv(BASE + 'train.csv',delimiter=';')
test = pd.read_csv(BASE + 'test.csv',delimiter=';')
# # get list id in test  
# id_list = test['id']
# test.drop('id',axis=1,inplace=True)




In [40]:
train.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
0,6.6,0.3,0.36,1.2,0.035,43,126,0.9909,3.01,0.63,11.4,6,white
1,7.7,0.5,0.26,1.9,0.062,9,31,0.9966,3.39,0.64,9.6,5,red
2,8.4,0.5,0.35,2.9,0.076,21,127,0.9976,3.23,0.63,9.2,5,red
3,7.5,0.4,0.33,5.0,0.045,30,131,0.9942,3.32,0.44,10.9,6,white
4,6.4,0.2,0.25,20.2,0.083,35,157,0.9998,3.17,0.5,9.1,5,white


In [41]:
test.head()

Unnamed: 0,id,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,type
0,1257,7.2,0.25,0.37,2.5,0.063,11.0,41.0,0.99439,3.52,0.8,12.4,red
1,6409,8.2,0.27,0.39,7.8,0.039,49.0,208.0,0.9976,3.31,0.51,9.5,white
2,136,8.9,0.32,0.49,1.6,0.05,17.0,131.0,0.9956,3.13,0.34,9.4,white
3,1631,7.4,0.16,0.3,13.7,0.056,33.0,168.0,0.99825,2.9,0.44,8.7,white
4,6084,6.4,0.28,0.56,1.7,0.156,49.0,106.0,0.99354,3.1,0.37,9.2,white


In [42]:
def create_new_feature(data):
    # combine fixed and volatile acidity to create total acidity
    # and mean acidity
    acidity_features = ['fixed acidity', 'volatile acidity']
    data['total_acidity'] = data[acidity_features].sum(axis=1)
    data['average_acidity'] = data[acidity_features].mean(axis=1)
    
    #combine salts into total minerals and average minerals
    salt_features = ['chlorides','sulphates']
    data['total_minerals'] = data[salt_features].sum(axis=1)
    data['average_minerals'] = data[salt_features].mean(axis=1)
    
    #the sulfur that is not free
    sulfur_features = ['total sulfur dioxide','free sulfur dioxide']
    data['non_free_sulfur_dioxide'] = data[sulfur_features[0]] - data[sulfur_features[1]]
    
    #percentage of free sulfur
    data['percentage_free_sulfur'] = data[sulfur_features[1]] / data[sulfur_features[0]]
    
    #determine from all free sulfur how much is as salt
    data['percentage_salt_sulfur'] = data['sulphates'] / data['free sulfur dioxide']
    return data

In [43]:
train = create_new_feature(train)
test = create_new_feature(test)

In [44]:
train.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type,total_acidity,average_acidity,total_minerals,average_minerals,non_free_sulfur_dioxide,percentage_free_sulfur,percentage_salt_sulfur
0,6.6,0.3,0.36,1.2,0.035,43,126,0.9909,3.01,0.63,11.4,6,white,6.9,3.45,0.665,0.3325,83,0.34127,0.014651
1,7.7,0.5,0.26,1.9,0.062,9,31,0.9966,3.39,0.64,9.6,5,red,8.2,4.1,0.702,0.351,22,0.290323,0.071111
2,8.4,0.5,0.35,2.9,0.076,21,127,0.9976,3.23,0.63,9.2,5,red,8.9,4.45,0.706,0.353,106,0.165354,0.03
3,7.5,0.4,0.33,5.0,0.045,30,131,0.9942,3.32,0.44,10.9,6,white,7.9,3.95,0.485,0.2425,101,0.229008,0.014667
4,6.4,0.2,0.25,20.2,0.083,35,157,0.9998,3.17,0.5,9.1,5,white,6.6,3.3,0.583,0.2915,122,0.22293,0.014286


In [45]:
train.corr()['quality']

fixed acidity             -0.059680
volatile acidity          -0.260352
citric acid                0.090887
residual sugar            -0.027913
chlorides                 -0.194367
free sulfur dioxide        0.067592
total sulfur dioxide      -0.028990
density                   -0.298328
pH                         0.003706
sulphates                  0.023269
alcohol                    0.443145
quality                    1.000000
total_acidity             -0.089441
average_acidity           -0.089441
total_minerals            -0.021039
average_minerals          -0.021039
non_free_sulfur_dioxide   -0.062551
percentage_free_sulfur     0.121648
percentage_salt_sulfur    -0.059451
Name: quality, dtype: float64

In [46]:
train['type'].value_counts()
test['type'].value_counts()

white    587
red      233
Name: type, dtype: int64

In [47]:
# label encoding
label = LabelEncoder()
train['type'] = label.fit_transform(train[['type']])
train['type'].value_counts()
test['type'] = label.transform(test[['type']])
test['type'].value_counts()


1    587
0    233
Name: type, dtype: int64

In [48]:
# # extract target, y
# target = 'quality'
# y = train[target]
# train.drop(target,axis=1,inplace=True)
# y.shape

In [49]:
train

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type,total_acidity,average_acidity,total_minerals,average_minerals,non_free_sulfur_dioxide,percentage_free_sulfur,percentage_salt_sulfur
0,6.6,0.3,0.36,1.2,0.035,43,126,0.9909,3.01,0.63,11.4,6,1,6.9,3.45,0.665,0.3325,83,0.341270,0.014651
1,7.7,0.5,0.26,1.9,0.062,9,31,0.9966,3.39,0.64,9.6,5,0,8.2,4.10,0.702,0.3510,22,0.290323,0.071111
2,8.4,0.5,0.35,2.9,0.076,21,127,0.9976,3.23,0.63,9.2,5,0,8.9,4.45,0.706,0.3530,106,0.165354,0.030000
3,7.5,0.4,0.33,5.0,0.045,30,131,0.9942,3.32,0.44,10.9,6,1,7.9,3.95,0.485,0.2425,101,0.229008,0.014667
4,6.4,0.2,0.25,20.2,0.083,35,157,0.9998,3.17,0.50,9.1,5,1,6.6,3.30,0.583,0.2915,122,0.222930,0.014286
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6709,7.2,0.2,0.19,7.7,0.045,53,176,0.9958,3.17,0.38,9.5,5,1,7.4,3.70,0.425,0.2125,123,0.301136,0.007170
6710,6.7,0.3,0.34,7.5,0.036,39,124,0.9912,2.99,0.32,12.4,8,1,7.0,3.50,0.356,0.1780,85,0.314516,0.008205
6711,6.6,0.3,0.24,3.3,0.034,29,99,0.9903,3.10,0.40,12.3,7,1,6.9,3.45,0.434,0.2170,70,0.292929,0.013793
6712,8.0,0.2,0.31,5.6,0.049,24,97,0.9930,3.10,0.42,10.9,5,1,8.2,4.10,0.469,0.2345,73,0.247423,0.017500


In [50]:
# train_drop_duplicate = train.drop_duplicates().reset_index(drop=True)
# test_drop_duplicate = test.drop_duplicates().reset_index(drop=True)

In [51]:
train

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type,total_acidity,average_acidity,total_minerals,average_minerals,non_free_sulfur_dioxide,percentage_free_sulfur,percentage_salt_sulfur
0,6.6,0.3,0.36,1.2,0.035,43,126,0.9909,3.01,0.63,11.4,6,1,6.9,3.45,0.665,0.3325,83,0.341270,0.014651
1,7.7,0.5,0.26,1.9,0.062,9,31,0.9966,3.39,0.64,9.6,5,0,8.2,4.10,0.702,0.3510,22,0.290323,0.071111
2,8.4,0.5,0.35,2.9,0.076,21,127,0.9976,3.23,0.63,9.2,5,0,8.9,4.45,0.706,0.3530,106,0.165354,0.030000
3,7.5,0.4,0.33,5.0,0.045,30,131,0.9942,3.32,0.44,10.9,6,1,7.9,3.95,0.485,0.2425,101,0.229008,0.014667
4,6.4,0.2,0.25,20.2,0.083,35,157,0.9998,3.17,0.50,9.1,5,1,6.6,3.30,0.583,0.2915,122,0.222930,0.014286
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6709,7.2,0.2,0.19,7.7,0.045,53,176,0.9958,3.17,0.38,9.5,5,1,7.4,3.70,0.425,0.2125,123,0.301136,0.007170
6710,6.7,0.3,0.34,7.5,0.036,39,124,0.9912,2.99,0.32,12.4,8,1,7.0,3.50,0.356,0.1780,85,0.314516,0.008205
6711,6.6,0.3,0.24,3.3,0.034,29,99,0.9903,3.10,0.40,12.3,7,1,6.9,3.45,0.434,0.2170,70,0.292929,0.013793
6712,8.0,0.2,0.31,5.6,0.049,24,97,0.9930,3.10,0.42,10.9,5,1,8.2,4.10,0.469,0.2345,73,0.247423,0.017500


In [52]:
# # merge train and test
# merge_data = pd.concat([train_drop_duplicate,test_drop_duplicate],axis=0)
# merge_data.shape

In [53]:
# # Normalization data
# from sklearn.preprocessing import StandardScaler
# normal = StandardScaler()
# col_normal_merge = [col for col in merge_data.columns if col != 'quality' and col != 'type' and col != 'id']
# col_normal_train = [col for col in train.columns if col != 'quality' and col != 'type']
# col_normal_test = [col for col in test.columns if col != 'id' and col != 'type']
# normal.fit(merge_data[col_normal_merge])
# train[col_normal_train] = normal.transform(train[col_normal_train])
# test[col_normal_test] = normal.transform(test[col_normal_test])


# Processing Imblaced

In [54]:
# #remove data in train with quality = 9
# train = train[train['quality'] != 9].reset_index(drop=True)

In [206]:
# drop train dup
train = train.drop_duplicates().reset_index(drop=True)

In [56]:
# processing imbalanced data using smogn
from smogn import smoter
train = smoter(data=train, y='quality', k=5, samp_method = "balance")

dist_matrix: 100%|##########| 1054/1054 [06:29<00:00,  2.71it/s]
r_index: 100%|##########| 818/818 [00:01<00:00, 698.40it/s]


In [57]:
AFTER_PROCESS = 'D:\AI1606\Ky_5\AIL302m\Kaggle\Data processed\\'

In [58]:
# train.to_csv(AFTER_PROCESS+'train_processed_drop_duplicate_train_smogn.csv',index=False)
# test.to_csv(AFTER_PROCESS+'test_processed_drop_duplicate_train_smogn.csv',index=False)

In [229]:
#read train and test
train = pd.read_csv(AFTER_PROCESS+'train_processed.csv')
test = pd.read_csv(AFTER_PROCESS+'test_processed.csv')
train.shape,test.shape

((9903, 20), (820, 19))

In [276]:
# train.shape, y.shape

## Build Model

In [230]:
train.isna().sum()

fixed acidity              0
volatile acidity           0
citric acid                0
residual sugar             0
chlorides                  0
free sulfur dioxide        0
total sulfur dioxide       0
density                    0
pH                         0
sulphates                  0
alcohol                    0
quality                    0
type                       0
total_acidity              0
average_acidity            0
total_minerals             0
average_minerals           0
non_free_sulfur_dioxide    0
percentage_free_sulfur     0
percentage_salt_sulfur     0
dtype: int64

In [231]:
#drop id from test
id_list = test['id']
test.drop('id',axis=1,inplace=True)

KeyError: 'id'

### Extratree regressor

In [232]:
#convert quality to int
train['quality'] = train['quality'].astype(int)

In [284]:
# extract target, y
target = 'quality'
y = train[target]
train.drop(target,axis=1,inplace=True)

In [285]:
# Get data for train and val
X_train,X_val,y_train,y_val = train_test_split(train,y,test_size=0.2,random_state=198)

In [286]:
X_train.shape,X_val.shape,y_train.shape,y_val.shape

((9777, 19), (2445, 19), (9777,), (2445,))

In [275]:
train

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,type,total_acidity,average_acidity,total_minerals,average_minerals,non_free_sulfur_dioxide,percentage_free_sulfur,percentage_salt_sulfur
0,0.000000,0.000000,0.064818,0.000000,0.000000,0.000000,0.437043,0.000000,0.126027,0.000000,0.000000,0.604032,0.000000,0.000000,0.000000,0.000000,0.722394,0.000000,0.000000
1,0.000000,0.000000,0.000000,0.456539,0.000000,0.260610,0.055943,0.000000,0.000000,0.000000,1.356830,0.604032,0.000000,0.000000,0.000000,0.000000,0.000000,0.084122,0.000000
2,0.000000,0.000000,0.264896,0.000000,0.000000,0.968397,0.000000,0.000000,0.000000,0.000000,0.665936,0.604032,0.000000,0.000000,0.000000,0.000000,0.000000,1.100650,0.000000
3,0.000000,0.000000,0.147878,1.941932,0.000000,0.940648,0.611547,0.076943,0.998741,0.000000,1.746705,0.609792,0.000000,0.000000,0.000000,0.000000,0.402492,0.167292,0.000000
4,0.000000,0.000000,0.332393,0.000000,0.000000,0.441223,0.472506,0.000000,0.109238,0.000000,0.695920,0.604032,0.000000,0.000000,0.000000,0.000000,0.418869,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11637,-0.114772,-0.701938,0.215989,2.359097,-0.462576,1.688154,0.997445,1.403306,-0.380519,0.342029,-1.145665,0.604032,-0.197266,-0.197266,0.204877,0.204877,0.590593,0.521710,-0.541621
11638,2.216950,1.896075,0.484936,-0.734417,-0.092411,-1.367673,-1.710279,0.937229,-0.318173,-0.422632,-0.389209,-1.655540,2.375114,2.375114,-0.395366,-0.395366,-1.605132,0.538469,1.199663
11639,1.013481,1.984643,-1.464929,-0.797983,0.912321,-1.537441,-1.798193,0.887293,0.554662,0.724359,-0.767437,-1.655540,1.223357,1.223357,0.839096,0.839096,-1.649046,-0.297378,4.716125
11640,-0.189989,-0.642893,0.215989,-0.819171,-0.145292,1.178849,0.241392,-0.227962,1.178116,0.342029,-0.053007,0.604032,-0.262665,-0.262665,0.272829,0.272829,-0.155954,0.907883,-0.497510


In [287]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error

model = ExtraTreesClassifier(n_estimators=550,random_state=251, max_depth=20, class_weight='balanced')
# et = ExtraTreesClassifier(class_weight='balanced', random_state=3)
# param = {
#     'n_estimators': [300,400,500,600]
# }
# model = RandomizedSearchCV(et, param, cv=5, n_iter=10, scoring='neg_mean_squared_error')
model.fit(X_train,y_train)
# print('Best parameters ', model.best_params_)
print("RMSE in train: ",np.sqrt(mean_squared_error(y_train,model.predict(X_train))))
print("RMSE in val: ",np.sqrt(mean_squared_error(y_val,model.predict(X_val))))

RMSE in train:  0.2514151417253946
RMSE in val:  0.5096211351891646


In [288]:
y_train,model.predict(X_train)

(5436      6
 3716      5
 12168     6
 8005      6
 866      10
          ..
 5126      5
 722       6
 5450      8
 8297      6
 1095      5
 Name: quality, Length: 9777, dtype: int32,
 array([6, 5, 6, ..., 8, 6, 5]))

In [253]:
test

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,type,total_acidity,average_acidity,total_minerals,average_minerals,non_free_sulfur_dioxide,percentage_free_sulfur,percentage_salt_sulfur
0,-0.039555,-0.583847,0.350463,-0.607287,0.145552,-1.084726,-1.288296,-0.131418,1.863915,1.680186,1.628006,-1.655540,-0.110066,-0.110066,1.524278,1.524278,-1.187944,-0.148848,1.057156
1,0.712613,-0.465755,0.484936,0.515702,-0.489016,1.065670,1.648003,0.937229,0.554662,-0.167745,-0.809463,0.604032,0.631128,0.631128,-0.253799,-0.253799,1.644540,-0.414489,-0.527009
2,1.239131,-0.170527,1.157304,-0.797983,-0.198173,-0.745190,0.294140,0.271405,-0.567555,-1.251015,-0.893513,0.604032,1.176124,1.176124,-1.154164,-1.154164,0.656464,-1.273597,-0.283183
3,0.110878,-1.115259,-0.120194,1.765821,-0.039531,0.160240,0.944698,1.153622,-2.001499,-0.613798,-1.481868,0.604032,-0.030134,-0.030134,-0.553921,-0.553921,1.117567,-0.732360,-0.452651
4,-0.641290,-0.406710,1.627961,-0.776794,2.604502,1.065670,-0.145425,-0.414393,-0.754591,-1.059850,-1.061615,0.604032,-0.669595,-0.669595,-0.384041,-0.384041,-0.595098,1.426134,-0.599638
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
815,-0.866941,-0.288618,1.627961,-0.564910,-0.303934,0.895902,1.226020,-0.793913,0.554662,0.214585,0.367247,0.604032,-0.873060,-0.873060,0.125599,0.125599,1.183438,-0.297378,-0.476597
816,-0.415640,-0.111481,0.148753,0.261440,0.251313,0.273419,0.733706,0.211481,-0.692246,-0.358911,-0.977564,0.604032,-0.415264,-0.415264,-0.265125,-0.265125,0.810165,-0.505575,-0.442967
817,0.787830,-0.524801,0.350463,-0.840360,0.489276,-1.254495,-1.604784,0.870647,0.242935,1.042968,-0.725412,-1.655540,0.696527,0.696527,1.031626,1.031626,-1.517303,0.496936,1.432681
818,-0.716507,-0.347664,-0.187431,-0.437779,-0.541897,0.103651,0.452384,-1.759357,-0.318173,-1.123572,1.964209,0.604032,-0.734995,-0.734995,-1.114525,-1.114525,0.524721,-0.471371,-0.505609


In [289]:
y_pred = model.predict(test)


In [290]:
temp = test.copy()
# add y_pred to temp
temp['quality'] = y_pred
temp

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,type,total_acidity,average_acidity,total_minerals,average_minerals,non_free_sulfur_dioxide,percentage_free_sulfur,percentage_salt_sulfur,quality
0,-0.039555,-0.583847,0.350463,-0.607287,0.145552,-1.084726,-1.288296,-0.131418,1.863915,1.680186,1.628006,-1.655540,-0.110066,-0.110066,1.524278,1.524278,-1.187944,-0.148848,1.057156,7
1,0.712613,-0.465755,0.484936,0.515702,-0.489016,1.065670,1.648003,0.937229,0.554662,-0.167745,-0.809463,0.604032,0.631128,0.631128,-0.253799,-0.253799,1.644540,-0.414489,-0.527009,6
2,1.239131,-0.170527,1.157304,-0.797983,-0.198173,-0.745190,0.294140,0.271405,-0.567555,-1.251015,-0.893513,0.604032,1.176124,1.176124,-1.154164,-1.154164,0.656464,-1.273597,-0.283183,5
3,0.110878,-1.115259,-0.120194,1.765821,-0.039531,0.160240,0.944698,1.153622,-2.001499,-0.613798,-1.481868,0.604032,-0.030134,-0.030134,-0.553921,-0.553921,1.117567,-0.732360,-0.452651,6
4,-0.641290,-0.406710,1.627961,-0.776794,2.604502,1.065670,-0.145425,-0.414393,-0.754591,-1.059850,-1.061615,0.604032,-0.669595,-0.669595,-0.384041,-0.384041,-0.595098,1.426134,-0.599638,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
815,-0.866941,-0.288618,1.627961,-0.564910,-0.303934,0.895902,1.226020,-0.793913,0.554662,0.214585,0.367247,0.604032,-0.873060,-0.873060,0.125599,0.125599,1.183438,-0.297378,-0.476597,7
816,-0.415640,-0.111481,0.148753,0.261440,0.251313,0.273419,0.733706,0.211481,-0.692246,-0.358911,-0.977564,0.604032,-0.415264,-0.415264,-0.265125,-0.265125,0.810165,-0.505575,-0.442967,6
817,0.787830,-0.524801,0.350463,-0.840360,0.489276,-1.254495,-1.604784,0.870647,0.242935,1.042968,-0.725412,-1.655540,0.696527,0.696527,1.031626,1.031626,-1.517303,0.496936,1.432681,6
818,-0.716507,-0.347664,-0.187431,-0.437779,-0.541897,0.103651,0.452384,-1.759357,-0.318173,-1.123572,1.964209,0.604032,-0.734995,-0.734995,-1.114525,-1.114525,0.524721,-0.471371,-0.505609,6


In [291]:
temp1 = temp[((temp['quality'] < 4.8)  & (temp['quality'] > 2.1)) | ((temp['quality'] < 8.9)  & (temp['quality'] > 7.1))].reset_index(drop=True)
temp2 = temp.sample(frac=0.7)
temp

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,type,total_acidity,average_acidity,total_minerals,average_minerals,non_free_sulfur_dioxide,percentage_free_sulfur,percentage_salt_sulfur,quality
0,-0.039555,-0.583847,0.350463,-0.607287,0.145552,-1.084726,-1.288296,-0.131418,1.863915,1.680186,1.628006,-1.655540,-0.110066,-0.110066,1.524278,1.524278,-1.187944,-0.148848,1.057156,7
1,0.712613,-0.465755,0.484936,0.515702,-0.489016,1.065670,1.648003,0.937229,0.554662,-0.167745,-0.809463,0.604032,0.631128,0.631128,-0.253799,-0.253799,1.644540,-0.414489,-0.527009,6
2,1.239131,-0.170527,1.157304,-0.797983,-0.198173,-0.745190,0.294140,0.271405,-0.567555,-1.251015,-0.893513,0.604032,1.176124,1.176124,-1.154164,-1.154164,0.656464,-1.273597,-0.283183,5
3,0.110878,-1.115259,-0.120194,1.765821,-0.039531,0.160240,0.944698,1.153622,-2.001499,-0.613798,-1.481868,0.604032,-0.030134,-0.030134,-0.553921,-0.553921,1.117567,-0.732360,-0.452651,6
4,-0.641290,-0.406710,1.627961,-0.776794,2.604502,1.065670,-0.145425,-0.414393,-0.754591,-1.059850,-1.061615,0.604032,-0.669595,-0.669595,-0.384041,-0.384041,-0.595098,1.426134,-0.599638,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
815,-0.866941,-0.288618,1.627961,-0.564910,-0.303934,0.895902,1.226020,-0.793913,0.554662,0.214585,0.367247,0.604032,-0.873060,-0.873060,0.125599,0.125599,1.183438,-0.297378,-0.476597,7
816,-0.415640,-0.111481,0.148753,0.261440,0.251313,0.273419,0.733706,0.211481,-0.692246,-0.358911,-0.977564,0.604032,-0.415264,-0.415264,-0.265125,-0.265125,0.810165,-0.505575,-0.442967,6
817,0.787830,-0.524801,0.350463,-0.840360,0.489276,-1.254495,-1.604784,0.870647,0.242935,1.042968,-0.725412,-1.655540,0.696527,0.696527,1.031626,1.031626,-1.517303,0.496936,1.432681,6
818,-0.716507,-0.347664,-0.187431,-0.437779,-0.541897,0.103651,0.452384,-1.759357,-0.318173,-1.123572,1.964209,0.604032,-0.734995,-0.734995,-1.114525,-1.114525,0.524721,-0.471371,-0.505609,6


In [292]:
temp1

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,type,total_acidity,average_acidity,total_minerals,average_minerals,non_free_sulfur_dioxide,percentage_free_sulfur,percentage_salt_sulfur,quality
0,-0.942157,0.272317,-2.137297,-0.84036,-0.409695,-1.311084,-1.04214,-1.14347,0.928734,-0.996128,0.619399,0.604032,-0.876694,-0.876694,-0.972958,-0.972958,-0.792714,-1.293882,0.588367,4
1,-0.490856,-0.40671,-0.254668,0.664021,-0.145292,1.405207,1.700751,0.471153,-0.816936,0.087142,-1.313767,0.604032,-0.524263,-0.524263,0.046322,0.046322,1.578669,-0.210794,-0.537385,4
2,-1.167808,-0.40671,0.283226,-0.755606,-0.436136,0.443187,-0.426748,-1.586243,0.305281,2.827178,1.207753,0.604032,-1.178258,-1.178258,2.41898,2.41898,-0.704885,1.101007,-0.136013,8
3,3.269986,0.71516,0.82112,-0.797983,2.366539,0.330008,-1.11247,1.190242,0.99108,2.062516,-0.47326,-1.65554,3.247107,3.247107,2.339703,2.339703,-1.517303,3.40423,-0.184326,3
4,0.56218,0.597068,0.283226,0.727586,-0.83274,1.12226,0.645793,-0.694039,-0.941627,-1.824511,1.880158,0.604032,0.616595,0.616595,-1.799708,-1.799708,0.37102,0.361336,-0.664486,8
5,-1.468675,1.482755,0.148753,-0.84036,-0.700539,-1.537441,-0.954226,-2.078953,0.305281,-0.996128,2.13231,0.604032,-1.236391,-1.236391,-1.035248,-1.035248,-0.595098,-1.92131,2.428306,4


In [293]:
# add target to train
train['quality'] = y
train

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,type,total_acidity,average_acidity,total_minerals,average_minerals,non_free_sulfur_dioxide,percentage_free_sulfur,percentage_salt_sulfur,quality
0,0.000000,0.000000,0.064818,0.000000,0.000000,0.000000,0.437043,0.000000,0.126027,0.000000,0.000000,0.604032,0.000000,0.000000,0.000000,0.000000,0.722394,0.000000,0.000000,10
1,0.000000,0.000000,0.000000,0.456539,0.000000,0.260610,0.055943,0.000000,0.000000,0.000000,1.356830,0.604032,0.000000,0.000000,0.000000,0.000000,0.000000,0.084122,0.000000,12
2,0.000000,0.000000,0.264896,0.000000,0.000000,0.968397,0.000000,0.000000,0.000000,0.000000,0.665936,0.604032,0.000000,0.000000,0.000000,0.000000,0.000000,1.100650,0.000000,10
3,0.000000,0.000000,0.147878,1.941932,0.000000,0.940648,0.611547,0.076943,0.998741,0.000000,1.746705,0.609792,0.000000,0.000000,0.000000,0.000000,0.402492,0.167292,0.000000,7
4,0.000000,0.000000,0.332393,0.000000,0.000000,0.441223,0.472506,0.000000,0.109238,0.000000,0.695920,0.604032,0.000000,0.000000,0.000000,0.000000,0.418869,0.000000,0.000000,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12217,0.035662,0.006611,-0.523615,-0.713229,0.251313,-0.122707,-1.165218,0.324671,1.302807,0.023420,-0.389209,-1.655540,0.035266,0.035266,0.074635,0.074635,-1.407516,2.409174,-0.301340,5
12218,0.562180,-0.642893,0.283226,-0.819171,-0.277494,-0.745190,0.258975,0.005076,-0.131137,0.023420,-0.389209,0.604032,0.463996,0.463996,-0.038618,-0.038618,0.612550,-1.257260,0.015879,5
12219,0.486963,-0.524801,0.619410,2.073053,-0.462576,0.443187,1.788664,0.937229,-1.627426,0.405750,-0.389209,0.604032,0.405863,0.405863,0.261503,0.261503,2.061728,-0.898834,-0.390215,6
12220,-0.641290,-0.997167,0.215989,-0.925114,-0.330374,0.499777,-0.110260,-1.226698,0.554662,-1.187294,0.367247,0.604032,-0.742261,-0.742261,-1.125850,-1.125850,-0.331612,0.604806,-0.563457,6


In [294]:
#concat temp1 and temp2 and train
train = pd.concat([train,temp1,temp2],axis=0).reset_index(drop=True)
train

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,type,total_acidity,average_acidity,total_minerals,average_minerals,non_free_sulfur_dioxide,percentage_free_sulfur,percentage_salt_sulfur,quality
0,0.000000,0.000000,0.064818,0.000000,0.000000,0.000000,0.437043,0.000000,0.126027,0.000000,0.000000,0.604032,0.000000,0.000000,0.000000,0.000000,0.722394,0.000000,0.000000,10
1,0.000000,0.000000,0.000000,0.456539,0.000000,0.260610,0.055943,0.000000,0.000000,0.000000,1.356830,0.604032,0.000000,0.000000,0.000000,0.000000,0.000000,0.084122,0.000000,12
2,0.000000,0.000000,0.264896,0.000000,0.000000,0.968397,0.000000,0.000000,0.000000,0.000000,0.665936,0.604032,0.000000,0.000000,0.000000,0.000000,0.000000,1.100650,0.000000,10
3,0.000000,0.000000,0.147878,1.941932,0.000000,0.940648,0.611547,0.076943,0.998741,0.000000,1.746705,0.609792,0.000000,0.000000,0.000000,0.000000,0.402492,0.167292,0.000000,7
4,0.000000,0.000000,0.332393,0.000000,0.000000,0.441223,0.472506,0.000000,0.109238,0.000000,0.695920,0.604032,0.000000,0.000000,0.000000,0.000000,0.418869,0.000000,0.000000,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12797,-1.092591,-0.524801,-0.523615,0.812340,-0.356815,1.405207,0.663376,0.437861,0.554662,-0.996128,-0.893513,0.604032,-1.120125,-1.120125,-0.961633,-0.961633,0.283191,0.610741,-0.615956,5
12798,0.186095,1.305617,-2.002823,-0.586098,0.515716,-1.084726,-1.393792,0.261418,0.866389,0.533194,0.283196,-1.655540,0.340463,0.340463,0.584276,0.584276,-1.319687,0.224600,0.641189,5
12799,-0.641290,-1.056213,0.014279,-0.628475,-0.251053,0.612955,1.507342,-0.327836,1.739224,-0.231467,-0.641361,0.604032,-0.749528,-0.749528,-0.259462,-0.259462,1.644540,-0.662763,-0.481584,6
12800,-0.490856,-0.465755,0.619410,-0.861548,-0.145292,-0.801779,0.487549,0.104949,1.240461,-0.422632,-0.389209,0.604032,-0.531530,-0.531530,-0.406691,-0.406691,0.919951,-1.412402,-0.044868,6


In [283]:
result = pd.DataFrame({'id':id_list,'quality':y_pred})
result.to_csv(DIR_RESULT + 'last_day_submission.csv',index=False)

## Predict each type

In [33]:
train_red = train[train['type']== 0]
train_white = train[train['type']== 1]
test_red = test[test['type']== 0]
test_white = test[test['type']== 1]

In [34]:
#extract target, y
target = 'quality'
y_red = train_red[target]
y_white = train_white[target]
train_red.drop(target,axis=1,inplace=True)
train_white.drop(target,axis=1,inplace=True)

In [35]:
#drop id red wine
id_list_red = test_red['id']
test_red.drop('id',axis=1,inplace=True)
#drop id white wine
id_list_white = test_white['id']
test_white.drop('id',axis=1,inplace=True)

In [36]:
#get data for red wine train and val
X_train_red, X_val_red, y_train_red, y_val_red = train_test_split(train_red, y_red, test_size=0.2, random_state=198)
#get data for white wine train and val
X_train_white, X_val_white, y_train_white, y_val_white = train_test_split(train_white, y_white, test_size=0.2, random_state=251)

In [37]:
from sklearn.ensemble import ExtraTreesRegressor
model_red = ExtraTreesRegressor(n_estimators=550,random_state=251,max_depth=20)
model_red.fit(X_train_red,y_train_red)
print("RMSE in train: ",np.sqrt(mean_squared_error(y_train_red,model.predict(X_train_red))))
print("RMSE in val: ",np.sqrt(mean_squared_error(y_val_red,model.predict(X_val_red))))

RMSE in train:  0.22361751555000192
RMSE in val:  0.23171846696333204


In [38]:
y_pred_red = model.predict(test_red)
result_red = pd.DataFrame({'id':id_list_red,'quality':y_pred_red})

In [39]:
model_white = ExtraTreesRegressor(n_estimators=550,random_state=251,max_depth=20)
model_white.fit(X_train_white,y_train_white)
print("RMSE in train: ",np.sqrt(mean_squared_error(y_train_white,model.predict(X_train_white))))
print("RMSE in val: ",np.sqrt(mean_squared_error(y_val_white,model.predict(X_val_white))))

RMSE in train:  0.3176214216826995
RMSE in val:  0.3194680300482412


In [40]:
y_pred_white = model.predict(test_white)
result_white = pd.DataFrame({'id':id_list_white,'quality':y_pred_white})

In [41]:
submission = pd.concat([result_red, result_white])
submission.to_csv(DIR_RESULT + 'submission_drop-duplicate_merge-norm_predict-each.csv',index=False)

## XGB

In [186]:
# using xgboost to train
from xgboost import XGBRegressor
model = XGBRegressor(n_estimators=550,random_state=251,max_depth=20,learning_rate=0.1,alpha=0.1,subsample=0.8,colsample_bytree=0.8,reg_lambda=0.1,reg_alpha=0.1,objective='reg:squarederror')
model.fit(X_train,y_train)
print("RMSE in train: ",np.sqrt(mean_squared_error(y_train,model.predict(X_train))))
print("RMSE in val: ",np.sqrt(mean_squared_error(y_val,model.predict(X_val))))


RMSE in train:  0.00997654287949098
RMSE in val:  0.5666385023829809


In [182]:
y_train,model.predict(X_train)

(5230    6.0
 2765    6.0
 3372    7.0
 3146    6.0
 3818    6.0
        ... 
 371     7.0
 4559    6.0
 2985    7.0
 1785    5.0
 4644    6.0
 Name: quality, Length: 6932, dtype: float64,
 array([6.0021434, 5.99932  , 6.9994483, ..., 6.9990606, 4.9993   ,
        5.9972205], dtype=float32))

In [187]:
y_pred = model.predict(test)
result = pd.DataFrame({'id':id_list,'quality':y_pred})
result.to_csv(DIR_RESULT + 'submission_xgb_fix3.csv',index=False)