## import libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.datasets import load_diabetes
#from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import matplotlib.pyplot as plt
#import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

#### mlflow

In [2]:
import mlflow
mlflow.sklearn.autolog

#mlflow ui
#enable autologging
mlflow.sklearn.autolog()
mlflow.start_run()

# with mlflow.start_run(run_name= model_name, nested=True) as run:
    # df = preprocess_df(target_csv)
    # train_model(df,model_name)
    # test_model(df,model_name)



The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh()

All git commands will error until this is rectified.

$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - error|e|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet



<ActiveRun: >

In [3]:
mlflow.end_run()

## 1. Problem Statement

In [4]:
To predict the car price using various variables

Dependent Variable: Price
    
Independent Variables:
    
        ['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
       'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',
       'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type',
       'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke',
       'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg',
       'highway-mpg']

SyntaxError: invalid syntax (948401580.py, line 1)

### 2. Data Gathering

In [None]:
df = pd.read_csv('autos_dataset.csv')
df.head().T

Unnamed: 0,0,1,2,3,4
symboling,3,3,1,2,2
normalized-losses,?,?,?,164,164
make,alfa-romero,alfa-romero,alfa-romero,audi,audi
fuel-type,gas,gas,gas,gas,gas
aspiration,std,std,std,std,std
num-of-doors,two,two,two,four,four
body-style,convertible,convertible,hatchback,sedan,sedan
drive-wheels,rwd,rwd,rwd,fwd,4wd
engine-location,front,front,front,front,front
wheel-base,88.6,88.6,94.5,99.8,99.4


In [None]:
df.columns

Index(['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
       'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',
       'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type',
       'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke',
       'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg',
       'highway-mpg', 'price'],
      dtype='object')

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized-losses  205 non-null    object 
 2   make               205 non-null    object 
 3   fuel-type          205 non-null    object 
 4   aspiration         205 non-null    object 
 5   num-of-doors       205 non-null    object 
 6   body-style         205 non-null    object 
 7   drive-wheels       205 non-null    object 
 8   engine-location    205 non-null    object 
 9   wheel-base         205 non-null    float64
 10  length             205 non-null    float64
 11  width              205 non-null    float64
 12  height             205 non-null    float64
 13  curb-weight        205 non-null    int64  
 14  engine-type        205 non-null    object 
 15  num-of-cylinders   205 non-null    object 
 16  engine-size        205 non

## 3.Exploratory Data Analysis

In [None]:
df.isna().sum()

symboling            0
normalized-losses    0
make                 0
fuel-type            0
aspiration           0
num-of-doors         0
body-style           0
drive-wheels         0
engine-location      0
wheel-base           0
length               0
width                0
height               0
curb-weight          0
engine-type          0
num-of-cylinders     0
engine-size          0
fuel-system          0
bore                 0
stroke               0
compression-ratio    0
horsepower           0
peak-rpm             0
city-mpg             0
highway-mpg          0
price                0
dtype: int64

In [None]:
df

In [None]:
df.replace({"?":np.nan},inplace = True)

In [None]:
df.isna().sum()

In [None]:
df.isna().mean() * 100

symboling             0.00000
normalized-losses    20.00000
make                  0.00000
fuel-type             0.00000
aspiration            0.00000
num-of-doors          0.97561
body-style            0.00000
drive-wheels          0.00000
engine-location       0.00000
wheel-base            0.00000
length                0.00000
width                 0.00000
height                0.00000
curb-weight           0.00000
engine-type           0.00000
num-of-cylinders      0.00000
engine-size           0.00000
fuel-system           0.00000
bore                  1.95122
stroke                1.95122
compression-ratio     0.00000
horsepower            0.97561
peak-rpm              0.97561
city-mpg              0.00000
highway-mpg           0.00000
price                 1.95122
dtype: float64

### 3.1. symboling             

In [None]:
df['symboling']

In [None]:
df['symboling'].value_counts()

### 3.2. Normalized-losses

In [None]:
df['normalized-losses'].head(30)

0     NaN
1     NaN
2     NaN
3     164
4     164
5     NaN
6     158
7     NaN
8     158
9     NaN
10    192
11    192
12    188
13    188
14    NaN
15    NaN
16    NaN
17    NaN
18    121
19     98
20     81
21    118
22    118
23    118
24    148
25    148
26    148
27    148
28    110
29    145
Name: normalized-losses, dtype: object

In [None]:
df.describe()

In [None]:
df['normalized-losses'] = df['normalized-losses'].astype(float)

In [None]:
df['normalized-losses'].mean()

In [None]:
df['normalized-losses'].median()

In [None]:
df['normalized-losses']=df['normalized-losses'].fillna(df['normalized-losses'].median())

In [None]:
# df.info()

### 3.3 make               

In [None]:
df['make']

In [None]:
df['make'].value_counts()

### 3.4 fuel-Type 

In [None]:
df['fuel-type']

In [None]:
df['fuel-type'].value_counts()

In [None]:
df['fuel-type'].replace({'gas':1,'diesel': 0},inplace = True)

In [None]:
df['fuel-type']

### 3.5 aspiration 

In [None]:
df['aspiration'].value_counts()

In [None]:
df['aspiration'].replace({'std':0,'turbo':1},inplace = True)

In [None]:
df['aspiration']

In [None]:
# df.info()

### 3.6 num-of-doors

In [None]:
df['num-of-doors'].value_counts()

In [None]:
df['num-of-doors'].value_counts().to_dict()

In [None]:
df['num-of-doors'].replace({'four': 4, 'two': 2},inplace = True)

In [None]:
df['num-of-doors'].fillna(df['num-of-doors'].mode()[0],inplace= True)

In [None]:
df['num-of-doors'] = df['num-of-doors'].astype(int)
df['num-of-doors']

0      2
1      2
2      2
3      4
4      4
      ..
200    4
201    4
202    4
203    4
204    4
Name: num-of-doors, Length: 205, dtype: int32

In [None]:
# df.isna().sum()

### 3.7 body-style

In [None]:
df['body-style'].value_counts()

In [None]:
df = pd.get_dummies(df,columns=['body-style'])

In [None]:
# df.info()

### 3.8 drive-wheels`

In [None]:
df['drive-wheels'].value_counts()

In [None]:
df['drive-wheels'].value_counts().to_dict()

In [None]:
df['drive-wheels'].replace({'fwd': 1, 'rwd': 0, '4wd': 2},inplace = True)

In [None]:
df['drive-wheels']

In [None]:
# df.info()

### 3.9 engine-location

In [None]:
df['engine-location'].value_counts()

In [None]:
df['engine-location'].replace({'front':1,'rear':0},inplace = True)

In [None]:
df['engine-location']

### 3.10 wheel-base

In [None]:
df['wheel-base']

In [None]:
df.info()

### 3.13 engine-type

In [None]:
df['engine-type'].value_counts()

In [None]:
df['engine-type'].value_counts()

In [None]:
# df.info()

In [None]:
df= pd.get_dummies(df, columns=['engine-type'])

In [None]:
# df.info()

### 3.15 num-of-cylinders

In [None]:
df['num-of-cylinders']

In [None]:
df['num-of-cylinders'].value_counts()

In [None]:
df['num-of-cylinders'].value_counts().to_dict()

In [None]:
df['num-of-cylinders'].replace({'four': 4,
                                 'six': 6,
                                 'five': 5,
                                 'eight': 8,
                                 'two': 2,
                                 'three': 3,
                                 'twelve': 12},inplace = True)

In [None]:
df['num-of-cylinders']

In [None]:
df.info()

### 3.15 fuel-system

In [None]:
df['fuel-system'].value_counts()

In [None]:
df = pd.get_dummies(df, columns=['fuel-system'])


In [None]:
# df.info()

### 3.15 bore                    

In [None]:
df['bore']

In [None]:
df['bore'].median()

In [None]:
# df['bore'].mean()

In [None]:
df['bore'] = df['bore'].fillna(df['bore'].median())


In [None]:
df['bore'] = df['bore'].astype(float)

In [None]:
df[['bore']].describe()

### 3.16 stroke

In [None]:
df['stroke'] = df['stroke'].astype(float)
df[['stroke']].describe()

Unnamed: 0,stroke
count,201.0
mean,3.255423
std,0.316717
min,2.07
25%,3.11
50%,3.29
75%,3.41
max,4.17


In [None]:
df['stroke'] = df['stroke'].fillna(df['stroke'].median())


In [None]:
3.

In [None]:
df.info()

### 3.18 horsepower

In [None]:
df['horsepower'] = df['horsepower'].astype(float)
df['horsepower'] = df['horsepower'].fillna(df['horsepower'].mean())
df['horsepower'] = df['horsepower'].astype(int)
df['horsepower']

0      111
1      111
2      154
3      102
4      115
      ... 
200    114
201    160
202    134
203    106
204    114
Name: horsepower, Length: 205, dtype: int32

### 3.19 peak-rpm

In [None]:
df['peak-rpm'] = df['peak-rpm'].astype(float)
df['peak-rpm'] = df['peak-rpm'].fillna(df['peak-rpm'].mean())
df['peak-rpm'] = df['peak-rpm'].astype(int)
df['peak-rpm']

0      5000
1      5000
2      5000
3      5500
4      5500
       ... 
200    5400
201    5300
202    5500
203    4800
204    5400
Name: peak-rpm, Length: 205, dtype: int32

In [None]:
sns.kdeplot(df['peak-rpm'])

In [None]:
df['peak-rpm'].skew()

0.07361410577798844

### 3.20 price 

In [None]:
df['price'] = df['price'].astype(float)
df['price'] = df['price'].fillna(df['price'].mean())
df['price'] = df['price'].astype(int)
df['price']

0      13495
1      16500
2      16500
3      13950
4      17450
       ...  
200    16845
201    19045
202    21485
203    22470
204    22625
Name: price, Length: 205, dtype: int32

In [None]:
df.info()

## 5. Feature Selection

In [None]:
1. Correlation:
    Cont to Cont >> Pearson Coor
    Cat Vs Cat >> Chi-Square Test
    Cont Vs Cat >> ANOVA 
    Cat Vs Cont >> ANOVA
    
2. VIF:
    1 to 10

In [None]:
df.corr().loc['price'].head(22).sort_values().head(4).index

Index(['highway-mpg', 'city-mpg', 'drive-wheels', 'engine-location'], dtype='object')

In [None]:
df.corr().loc['price'].head(22).sort_values().tail(9).index

Index(['bore', 'wheel-base', 'length', 'num-of-cylinders', 'width',
       'horsepower', 'curb-weight', 'engine-size', 'price'],
      dtype='object')

#### 5.2 VIF

In [None]:
variance_inflation_factor()

### 6. Model Training

#### 6.1 Train Test Data

In [None]:
x = df.drop(['price','make'],axis = 1)
y = df['price']
x_train, x_test, y_train, y_test = train_test_split(x,y,random_state=32, test_size=0.2)

In [None]:
x = df[['bore', 'wheel-base', 'length', 'num-of-cylinders', 'width',
       'horsepower', 'curb-weight', 'engine-size','highway-mpg', 'city-mpg',
             'drive-wheels', 'engine-location']]
y = df['price']

x_train, x_test, y_train, y_test = train_test_split(x,y,random_state=32, test_size=0.2)

In [None]:
linear_reg_model = LinearRegression()
linear_reg_model.fit(x_train, y_train)

### 7. Evaluation

In [None]:
# Training Data Evaluation

y_pred_train = linear_reg_model.predict(x_train)
y_pred_train[10:15]

array([ 7432.53672208, 17284.35935489, 10242.33540148, 20774.91127793,
       34599.23591301])

In [None]:
y_train[10:15]

33      6529
200    16845
188     9995
14     24565
128    37028
Name: price, dtype: int32

In [None]:
# Training Data Evaluation

y_pred_train = linear_reg_model.predict(x_train)
mse = mean_squared_error(y_train,y_pred_train )
print("MSE :",mse)

rmse = np.sqrt(mse)
print("RMSE :",rmse)

mae = mean_absolute_error(y_train,y_pred_train)
print("MAE :",mae)

r_squared = r2_score(y_train,y_pred_train)
print("R2 Score :",r_squared)

MSE : 9813965.506582592
RMSE : 3132.724933118545
MAE : 2228.617594735046
R2 Score : 0.8546116874977614


In [None]:
# Testing Data Evaluation

y_pred = linear_reg_model.predict(x_test)
mse = mean_squared_error(y_test,y_pred )
print("MSE :",mse)

rmse = np.sqrt(mse)
print("RMSE :",rmse)

mae = mean_absolute_error(y_test,y_pred)
print("MAE :",mae)

r_squared = r2_score(y_test,y_pred)
print("R2 Score :",r_squared)

MSE : 11265103.078033755
RMSE : 3356.352645064841
MAE : 2415.3363077218132
R2 Score : 0.6556765531870743


### Ridge 

In [None]:
ridge_model = Ridge(alpha= 0.15)
ridge_model.fit(x_train, y_train)

In [None]:
# Training Data Evaluation

y_pred_train = ridge_model.predict(x_train)
mse = mean_squared_error(y_train,y_pred_train )
print("MSE :",mse)

rmse = np.sqrt(mse)
print("RMSE :",rmse)

mae = mean_absolute_error(y_train,y_pred_train)
print("MAE :",mae)

r_squared = r2_score(y_train,y_pred_train)
print("R2 Score :",r_squared)

MSE : 9830571.66892586
RMSE : 3135.3742470279144
MAE : 2224.1461589348505
R2 Score : 0.8543656766554972


In [None]:
# Testing Data Evaluation

y_pred = ridge_model.predict(x_test)
mse = mean_squared_error(y_test,y_pred )
print("MSE :",mse)

rmse = np.sqrt(mse)
print("RMSE :",rmse)

mae = mean_absolute_error(y_test,y_pred)
print("MAE :",mae)

r_squared = r2_score(y_test,y_pred)
print("R2 Score :",r_squared)

MSE : 11256798.839214236
RMSE : 3355.115324279366
MAE : 2415.4411489029662
R2 Score : 0.6559303763535103


### Lasso

In [None]:
lasso_model = Lasso(alpha= 0.05)
lasso_model.fit(x_train, y_train)



In [None]:
# Training Data Evaluation

y_pred_train = lasso_model.predict(x_train)
mse = mean_squared_error(y_train,y_pred_train )
print("MSE :",mse)

rmse = np.sqrt(mse)
print("RMSE :",rmse)

mae = mean_absolute_error(y_train,y_pred_train)
print("MAE :",mae)

r_squared = r2_score(y_train,y_pred_train)
print("R2 Score :",r_squared)

MSE : 9813965.91078089
RMSE : 3132.724997630799
MAE : 2228.5460193346357
R2 Score : 0.8546116815097937


In [None]:
# Testing Data Evaluation

y_pred = lasso_model.predict(x_test)
mse = mean_squared_error(y_test,y_pred )
print("MSE :",mse)

rmse = np.sqrt(mse)
print("RMSE :",rmse)

mae = mean_absolute_error(y_test,y_pred)
print("MAE :",mae)

r_squared = r2_score(y_test,y_pred)
print("R2 Score :",r_squared)

MSE : 11264679.288558297
RMSE : 3356.2895120293624
MAE : 2415.2202487485524
R2 Score : 0.6556895065219797


In [None]:
0.5 M to 2M

In [None]:
POC >> 50k to 150k

class1  >> 40000
class0  >> 10000

    Male Female
M     1    0 
F     0    1 
M     1    0
M     1    0
F     0    1

drop_first = True,

2 >> two
6 >> six
8 >> eight
12 >> twelve


In [None]:
# with mlflow.start_run(run_name= lasso_model, nested=True) as run:
#     df = preprocess_df(target_csv)
#     train_model(df,lasso_model)
#     test_model(df,lasso_model)

In [None]:
ridge_reg_model = Ridge()

hyperparameters = {"alpha" : np.arange(0.01, 5, 0.01)}
gscv_ridge_model = GridSearchCV(ridge_reg_model, hyperparameters, cv = 5)
gscv_ridge_model.fit(x_train, y_train)
print(gscv_ridge_model.best_estimator_)

ridge_model = Ridge(alpha=1.1)
ridge_model.fit(x_train, y_train)

# Testing Data Evaluation
y_pred = ridge_model.predict(x_test)

mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error :",mse)

rmse = np.sqrt(mse)
print("Root Mean Squared Error :",rmse)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error :",mae)

r_squared = r2_score(y_test, y_pred)
print("R Squared Value is :",r_squared)

       0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21, 0.22,
       0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32, 0.33,
       0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43, 0.44,
       0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52, 0.53, 0.54, 0.55,
       0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65, 0.66,
       0.67, 0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, ...`
2022/11/16 15:38:06 INFO mlflow.sklearn.utils: Logging the 5 best runs, 494 runs will be omitted.


Ridge(alpha=0.03)
Mean Squared Error : 11313151.639019659
Root Mean Squared Error : 3363.5028822671848
Mean Absolute Error : 2421.25573549015
R Squared Value is : 0.6542079251577998
