In [3]:
import numpy as np
import pandas as pd
import joblib

In [4]:
from sklearn import set_config
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNet, Ridge, Lasso
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.utils import estimator_html_repr
## Ask scikit-learn to render estimators as HTML diagrams when they are displayed in a cell
set_config(display='diagram')

In [5]:
## Reading the csv file into a DataFrame (the path is relative to the working directory)
df = pd.read_csv('Data/Fish.csv')

In [6]:
## Understanding the basic info about the data: column names, non-null counts and dtypes

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Species  159 non-null    object 
 1   Weight   159 non-null    float64
 2   Length1  159 non-null    float64
 3   Length2  159 non-null    float64
 4   Length3  159 non-null    float64
 5   Height   159 non-null    float64
 6   Width    159 non-null    float64
dtypes: float64(6), object(1)
memory usage: 8.8+ KB


In [7]:
## Previewing the first rows of the data
df.head()

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
0,Bream,242.0,23.2,25.4,30.0,11.52,4.02
1,Bream,290.0,24.0,26.3,31.2,12.48,4.3056
2,Bream,340.0,23.9,26.5,31.1,12.3778,4.6961
3,Bream,363.0,26.3,29.0,33.5,12.73,4.4555
4,Bream,430.0,26.5,29.0,34.0,12.444,5.134


In [8]:

## Number of (rows, columns) in the data
df.shape

(159, 7)

In [9]:
## Number of distinct values in each column
df.nunique()

Species      7
Weight     101
Length1    116
Length2     93
Length3    124
Height     154
Width      152
dtype: int64

In [10]:
## Counting the missing values in every column of the data

df.isna().sum()

Species    0
Weight     0
Length1    0
Length2    0
Length3    0
Height     0
Width      0
dtype: int64

Even though there are no missing values in this data, we are still going to include imputers in the pipeline. This is because new or future data that we might use for retraining the model (or for prediction) can contain missing values.

In [11]:
## Separating the dependent feature (target) from the independent features

# The target is kept as a single-column DataFrame rather than a Series
Y = df[['Weight']]

# Every column except the target is used as a predictor
X = df.drop('Weight', axis='columns')


In [12]:
## Getting the names of the numerical and categorical features.
## Every non-numeric column is treated as categorical. Checking only for the 'O' (object)
## dtype would miss text columns stored with a category or dedicated string dtype and
## would wrongly send them to the numerical (median imputer + scaler) branch

cat_feat = X.select_dtypes(exclude='number').columns.tolist()

# The numerical features are whatever is left, so every column of X lands in exactly one list
num_feat = [feature for feature in X.columns if feature not in cat_feat]

In [13]:
## Holding out 20% of the rows as the test set; the fixed random_state keeps the split reproducible

x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=1232,
)

Creating a pipeline for preprocessing the data

## VERY IMPORTANT LINE OF CODE

In [14]:
## Some scikit-learn releases ship SimpleImputer without a get_feature_names_out method,
## which is needed when feature names are requested from a pipeline that contains it.
## The fallback is installed only when the method is missing, so a native implementation
## provided by a newer scikit-learn release is never overridden
if not hasattr(SimpleImputer, 'get_feature_names_out'):
    # The imputer keeps the columns it was fitted on, so the input names are returned as-is
    SimpleImputer.get_feature_names_out = (lambda self, names=None:
                                           self.feature_names_in_)

In [15]:
## Categorical features: fill missing values with the most frequent category, then one-hot encode.
## handle_unknown="ignore" makes the encoder output all zeros for a category that was not
## present during fit instead of raising an error, so new data (or a validation fold)
## containing an unseen category can still be transformed
cat_pipe = Pipeline([
    ('cat_imputer', SimpleImputer(strategy="most_frequent")),
    ('one_hot_encoder', OneHotEncoder(handle_unknown="ignore")),
])

## Numerical features: fill missing values with the column median, then standardize
num_pipe = Pipeline([
    ('num_imputer', SimpleImputer(strategy="median")),
    ('std_scalar', StandardScaler()),
])


In [16]:
## Sending each group of features through its own preprocessing pipeline.
## Columns that belong to neither group are passed through unchanged
preprocessing_pipe = ColumnTransformer(
    transformers=[
        ('cat', cat_pipe, cat_feat),
        ('num', num_pipe, num_feat),
    ],
    remainder="passthrough",
    verbose_feature_names_out=False,
)

In [17]:
## Creating one single-step pipeline per candidate regression model

# Ridge regression
model_pipe1 = Pipeline(steps=[('model1', Ridge(random_state=123))])

# Lasso regression
model_pipe2 = Pipeline(steps=[('model2', Lasso(random_state=124))])

# ElasticNet regression
model_pipe3 = Pipeline(steps=[("model3", ElasticNet(random_state=243))])

In [18]:
## Combining the pipelines for preprocessing and model training.
## Pipeline stores the step objects it is given without copying them, so each full
## pipeline receives its own clone of the (unfitted) preprocessing step. Without the
## clone, fitting one full pipeline would also change the preprocessing object that
## the other two pipelines hold
full_pipeline1 = Pipeline([
    ('preprocessing', clone(preprocessing_pipe)),
    ('model_training1', model_pipe1),
])

full_pipeline2 = Pipeline([
    ('preprocessing', clone(preprocessing_pipe)),
    ('model_training2', model_pipe2),
])

full_pipeline3 = Pipeline([
    ('preprocessing', clone(preprocessing_pipe)),
    ('model_training3', model_pipe3),
])

In [19]:
## Displaying the ridge regression pipeline
full_pipeline1

In [20]:
## Writing the HTML diagram of full_pipeline1 to a file

with open(
    'pipeline_vizualization/Ridge_regression_pipeline.html',
    mode='w',
    encoding="utf-8",
) as html_file:
    html_file.write(estimator_html_repr(full_pipeline1))

In [21]:
## Displaying the lasso regression pipeline
full_pipeline2

In [22]:
## Writing the HTML diagram of full_pipeline2 to a file

with open(
    'pipeline_vizualization/Lasso_regression_pipeline.html',
    mode='w',
    encoding="utf-8",
) as html_file:
    html_file.write(estimator_html_repr(full_pipeline2))

In [23]:
## Displaying the elasticnet regression pipeline
full_pipeline3

In [24]:
## Writing the HTML diagram of full_pipeline3 to a file

with open(
    'pipeline_vizualization/ElasticNet_regression_pipeline.html',
    mode='w',
    encoding="utf-8",
) as html_file:
    html_file.write(estimator_html_repr(full_pipeline3))

In [25]:
## Comparing the three candidate pipelines with 5-fold cross validation on the training split.
## The same shuffled KFold splitter is used for every candidate, and each score is the
## mean R squared over the folds

kf = KFold(n_splits=5, shuffle=True, random_state=987)

score1, score2, score3 = (
    cross_val_score(candidate, x_train, y_train, cv=kf, scoring='r2').mean()
    for candidate in (full_pipeline1, full_pipeline2, full_pipeline3)
)

print(f"R squared score for the ridge regression on training set is {np.round(score1,3)}")
print(f"R squared score for the lasso regression on training set is {np.round(score2,3)}")
print(f"R squared score for the elasticnet regression on training set is {np.round(score3,3)}")

R squared score for the ridge regression on training set is 0.908
R squared score for the lasso regression on training set is 0.91
R squared score for the elasticnet regression on training set is 0.851


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Since the lasso regression gives the highest cross-validated R squared score, we will tune it and use it to train our final model.

In [26]:
## Getting the names of the parameters that can be tuned for full_pipeline2.
## Nested parameters are addressed as <step>__<parameter>

full_pipeline2.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'preprocessing', 'model_training2', 'preprocessing__n_jobs', 'preprocessing__remainder', 'preprocessing__sparse_threshold', 'preprocessing__transformer_weights', 'preprocessing__transformers', 'preprocessing__verbose', 'preprocessing__verbose_feature_names_out', 'preprocessing__cat', 'preprocessing__num', 'preprocessing__cat__memory', 'preprocessing__cat__steps', 'preprocessing__cat__verbose', 'preprocessing__cat__cat_imputer', 'preprocessing__cat__one_hot_encoder', 'preprocessing__cat__cat_imputer__add_indicator', 'preprocessing__cat__cat_imputer__copy', 'preprocessing__cat__cat_imputer__fill_value', 'preprocessing__cat__cat_imputer__missing_values', 'preprocessing__cat__cat_imputer__strategy', 'preprocessing__cat__cat_imputer__verbose', 'preprocessing__cat__one_hot_encoder__categories', 'preprocessing__cat__one_hot_encoder__drop', 'preprocessing__cat__one_hot_encoder__dtype', 'preprocessing__cat__one_hot_encoder__handle_unknown', 'preprocessin

In [27]:
## Search space for the lasso step: alpha from 1 to 9.5 in steps of 0.5, and both
## coordinate-selection strategies
tuning_parameters = {
    'model_training2__model2__alpha': np.arange(1, 10, 0.5),
    'model_training2__model2__selection': ['cyclic', 'random'],
}

In [28]:
## Setting up a randomized search over tuning_parameters for the lasso pipeline, scored by R squared.
## The number of sampled candidates and the cross-validation scheme are not passed,
## so the library defaults apply
hyperparameter_tuning_results = RandomizedSearchCV(
    estimator=full_pipeline2,
    param_distributions=tuning_parameters,
    scoring='r2',
    random_state=764,
)

In [29]:
## Running the randomized search on the training split
tuning_results = hyperparameter_tuning_results.fit(x_train,y_train)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [30]:
## Collecting the per-candidate search results into a DataFrame
tuning_results_df = pd.DataFrame(tuning_results.cv_results_)

In [31]:
## Showing only the sampled parameter values together with their mean score and rank
tuning_results_df.loc[:, [
    'param_model_training2__model2__alpha',
    'param_model_training2__model2__selection',
    'mean_test_score',
    'rank_test_score',
]]

Unnamed: 0,param_model_training2__model2__alpha,param_model_training2__model2__selection,mean_test_score,rank_test_score
0,9.5,random,0.898843,10
1,5.0,cyclic,0.914548,5
2,3.0,cyclic,0.922627,3
3,1.5,random,0.928475,1
4,8.5,random,0.902982,8
5,8.0,cyclic,0.904501,7
6,2.5,random,0.924599,2
7,3.5,random,0.920138,4
8,6.5,cyclic,0.90965,6
9,9.0,random,0.900962,9


In [32]:
## Parameter combination with the best mean score in the search
tuning_results.best_params_

{'model_training2__model2__selection': 'random',
 'model_training2__model2__alpha': 1.5}

In [33]:
## Applying the best parameters found by the search to the lasso pipeline.
## The values are read from tuning_results.best_params_ instead of being typed in by hand,
## so the pipeline stays in sync with the search whenever the data or the search space changes.
## The keys of best_params_ are the same <step>__<parameter> names used in tuning_parameters,
## so they can be passed to the full pipeline directly
full_pipeline2.set_params(**tuning_results.best_params_)

In [34]:
## Training the full lasso pipeline (preprocessing + model) on the training split
## using the new parameters
lasso_model = full_pipeline2.fit(x_train,y_train)

  model = cd_fast.enet_coordinate_descent(


In [35]:
## Predicting the target for the held-out test split
y_predict = lasso_model.predict(x_test)

In [36]:
## Inspecting the predicted values
y_predict

array([  41.94209151,  390.01083479,   56.68105437,  859.54386599,
        581.59964963,  606.78473379, -398.45169809,  308.58194558,
        716.67090564,  521.52277276,  346.7390952 ,  131.49434729,
        841.42386677,   97.34893858,  -90.85031701,  607.1945575 ,
        550.17497155,   20.36773618,  270.16370112,  420.77591585,
        827.45561873,  225.81536572,  739.44581792,  868.96094254,
       -112.40927693,   90.64202669,  532.79014785,  245.72002643,
        739.48800996,   29.32950097,  367.06195524,  878.34655737])

In [37]:
### Calculating the R squared score of the model on the test set
### (a goodness-of-fit measure for regression, not a classification accuracy)

r2_score(y_test,y_predict)

0.8911295520294069

Saving the trained model to a pickle file with joblib

In [38]:
## Persisting the fitted pipeline (preprocessing + lasso model) to disk.
## NOTE(review): nothing here creates the Model_pickle_file directory, so it presumably
## has to exist before this cell runs - confirm
joblib.dump(lasso_model,"Model_pickle_file/lasso_model.pkl")

['Model_pickle_file/lasso_model.pkl']