## MODEL TRAINING


In [26]:
import pandas as pd


In [27]:
# Load the dataset from ./data/test.csv and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='./data/test.csv')
df.head(n=5)

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [28]:
# Map the space-separated column names to snake_case names.
# NOTE: 'free_sulphor_dioxide' is spelled differently from 'total_sulfur_dioxide';
# the spelling is kept as-is because the preprocessed feature names are derived from it.
column_renames = {
    'fixed acidity': 'fixed_acidity',
    'volatile acidity': 'volatile_acidity',
    'citric acid': 'citric_acid',
    "residual sugar": "residual_sugar",
    "free sulfur dioxide": "free_sulphor_dioxide",
    "total sulfur dioxide": "total_sulfur_dioxide",
}
df.rename(columns=column_renames, inplace=True)

In [29]:
## Independent and dependent features
# X holds every column except the target; Y keeps 'quality' as a one-column DataFrame.
X = df.drop(columns=['quality'])
Y = df.loc[:, ['quality']]

In [30]:
# Display the target frame Y (the 'quality' column).
Y

Unnamed: 0,quality
0,6
1,6
2,6
3,6
4,6
...,...
6492,5
6493,6
6494,6
6495,5


In [31]:
# Split the feature columns by dtype: object columns are ordinal-encoded,
# every remaining column goes through the numerical (impute + scale) pipeline.
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(exclude=['object']).columns

In [32]:
# List the distinct values of the 'type' column.
df['type'].unique()

array(['white', 'red'], dtype=object)

In [33]:
# Explicit category order for the 'type' column, passed to OrdinalEncoder below;
# the position in this list is the encoded value ('white' -> 0, 'red' -> 1).
type_categories = ['white', 'red']

In [34]:
from sklearn.impute import SimpleImputer ## HAndling Missing Values
from sklearn.preprocessing import StandardScaler # HAndling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder # Ordinal Encoding
## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [35]:
## Numerical pipeline: fill missing values with the column median, then standardize.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical pipeline: fill missing values with the most frequent value,
# encode using the explicit order in type_categories, then standardize.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinalencoder', OrdinalEncoder(categories=[type_categories])),
    ('scaler', StandardScaler()),
])

# Route each column group through its own pipeline; the transformer names
# become the prefixes of the output feature names.
preprocessor = ColumnTransformer(transformers=[
    ('num_pipeline', num_pipeline, numerical_cols),
    ('cat_pipeline', cat_pipeline, categorical_cols),
])


In [36]:
## Train test split

from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.30,
    random_state=30,
)

In [37]:
# Fit the preprocessor on the training split only, then apply the fitted
# transformation to the test split.
# NOTE: this rebinds X_train / X_test to the transformed frames, so re-running
# this cell without re-running the split cell will likely fail.
train_array = preprocessor.fit_transform(X_train)
test_array = preprocessor.transform(X_test)
feature_names = preprocessor.get_feature_names_out()
X_train = pd.DataFrame(train_array, columns=feature_names)
X_test = pd.DataFrame(test_array, columns=feature_names)

In [38]:
# Preview the preprocessed training features.
X_train.head()

Unnamed: 0,num_pipeline__fixed_acidity,num_pipeline__volatile_acidity,num_pipeline__citric_acid,num_pipeline__residual_sugar,num_pipeline__chlorides,num_pipeline__free_sulphor_dioxide,num_pipeline__total_sulfur_dioxide,num_pipeline__density,num_pipeline__pH,num_pipeline__sulphates,num_pipeline__alcohol,cat_pipeline__type
0,0.541708,2.135508,-0.744281,-0.713091,0.709209,0.126376,0.429656,0.496332,0.199761,-0.135588,-0.498398,1.788441
1,0.464337,3.804681,1.918438,12.485528,0.535295,-1.285561,0.766692,14.612507,1.065879,1.111163,1.011022,-0.559146
2,-0.928342,-0.595866,-0.266357,-0.329772,-0.50819,1.707745,0.305485,-1.067733,0.509089,0.072204,0.675595,-0.559146
3,-1.54731,-1.020746,-0.334632,-1.003171,-0.740075,-1.116129,-0.865271,-0.830153,0.509089,-0.828227,-0.582255,-0.559146
4,0.773821,-0.110288,0.68949,-0.671651,1.230951,-1.567949,-1.911856,0.120164,0.323492,0.141468,1.514162,1.788441


In [39]:
## Model Training

from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [40]:
# Fit a baseline linear regression on the preprocessed training data.
regression = LinearRegression()
regression.fit(X=X_train, y=y_train)

In [41]:
# Fitted coefficients of the baseline model, one per preprocessed feature column.
regression.coef_

array([[ 0.11855424, -0.24880347, -0.02193751,  0.31968515, -0.03611843,
         0.09226003, -0.09602087, -0.33547072,  0.07412869,  0.11728246,
         0.24839772,  0.16035109]])

In [42]:
# Fitted intercept of the baseline model.
regression.intercept_

array([5.81130416])

In [43]:
import numpy as np
def evaluate_model(true, predicted):
    """Compute regression error metrics for a set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    predicted : array-like
        Model predictions aligned with ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2_square)``: mean absolute error, root mean squared
        error, and the R^2 score, in that order.
    """
    mae = mean_absolute_error(true, predicted)
    # Compute the MSE once and derive the RMSE from it instead of
    # calling mean_squared_error a second time.
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [44]:
## Train multiple models

# Every estimator is created with its default hyperparameters.
models={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'Elasticnet':ElasticNet()
}
trained_model_list=[]  # fitted estimators, in the same order as model_list
model_list=[]          # model names, in training order
r2_list=[]             # R2 score of each model on the test split

# Iterate over name/estimator pairs directly instead of indexing
# list(models.keys()) / list(models.values()) on every pass.
for model_name, model in models.items():
    model.fit(X_train,y_train)
    # Keep the fitted estimator so it can be looked up after the loop.
    trained_model_list.append(model)

    # Make predictions on the held-out test split
    y_pred=model.predict(X_test)

    mae, rmse, r2_square=evaluate_model(y_test,y_pred)

    print(model_name)
    model_list.append(model_name)

    # NOTE: despite the heading, the metrics below are computed on the test split.
    print('Model Training Performance')
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2 score",r2_square*100)

    r2_list.append(r2_square)

    print('='*35)
    print('\n')

LinearRegression
Model Training Performance
RMSE: 0.7565186500020442
MAE: 0.580945778407853
R2 score 28.862539329913904


Lasso
Model Training Performance
RMSE: 0.8972641646919505
MAE: 0.705006964298805
R2 score -0.06903847804164975


Ridge
Model Training Performance
RMSE: 0.7565049097139476
MAE: 0.5809512858604243
R2 score 28.86512337811309


Elasticnet
Model Training Performance
RMSE: 0.8972641646919505
MAE: 0.705006964298805
R2 score -0.06903847804164975




In [45]:
# Names of the models evaluated above, in training order.
model_list

['LinearRegression', 'Lasso', 'Ridge', 'Elasticnet']

In [48]:
# (rows, columns) of the full dataset, before the train/test split.
df.shape

(6497, 13)