## Import all required libraries 

In [29]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 

from sklearn.model_selection import train_test_split,GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import pickle
import json


# Step 1: Problem Statement 

# Step 2: Data Gathering 

In [30]:
# Read the raw automobile dataset that sits next to this notebook.
df = pd.read_csv('autos_dataset.csv')
# Preview the first five records transposed, so every column gets its own row.
df.head().transpose()

Unnamed: 0,0,1,2,3,4
symboling,3,3,1,2,2
normalized-losses,?,?,?,164,164
make,alfa-romero,alfa-romero,alfa-romero,audi,audi
fuel-type,gas,gas,gas,gas,gas
aspiration,std,std,std,std,std
num-of-doors,two,two,two,four,four
body-style,convertible,convertible,hatchback,sedan,sedan
drive-wheels,rwd,rwd,rwd,fwd,4wd
engine-location,front,front,front,front,front
wheel-base,88.6,88.6,94.5,99.8,99.4


# Step 3: EDA [Exploratory Data Analysis]

In [31]:
# Column names, non-null counts and dtypes of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized-losses  205 non-null    object 
 2   make               205 non-null    object 
 3   fuel-type          205 non-null    object 
 4   aspiration         205 non-null    object 
 5   num-of-doors       205 non-null    object 
 6   body-style         205 non-null    object 
 7   drive-wheels       205 non-null    object 
 8   engine-location    205 non-null    object 
 9   wheel-base         205 non-null    float64
 10  length             205 non-null    float64
 11  width              205 non-null    float64
 12  height             205 non-null    float64
 13  curb-weight        205 non-null    int64  
 14  engine-type        205 non-null    object 
 15  num-of-cylinders   205 non-null    object 
 16  engine-size        205 non

In [32]:
## Re-check column dtypes and non-null counts.
## NOTE(review): the "?" placeholders are only replaced with NaN further down
## (Step 4), so on a top-to-bottom run this output matches the previous df.info().
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized-losses  205 non-null    object 
 2   make               205 non-null    object 
 3   fuel-type          205 non-null    object 
 4   aspiration         205 non-null    object 
 5   num-of-doors       205 non-null    object 
 6   body-style         205 non-null    object 
 7   drive-wheels       205 non-null    object 
 8   engine-location    205 non-null    object 
 9   wheel-base         205 non-null    float64
 10  length             205 non-null    float64
 11  width              205 non-null    float64
 12  height             205 non-null    float64
 13  curb-weight        205 non-null    int64  
 14  engine-type        205 non-null    object 
 15  num-of-cylinders   205 non-null    object 
 16  engine-size        205 non

###  3.1 symboling

In [33]:
df['symboling']

In [34]:
df['symboling'].unique()

In [35]:
### Need confirmation discussion on this columns 

### 3.2 normalized-losses

In [36]:
df['normalized-losses']

In [37]:
df['normalized-losses'].nunique()
df['normalized-losses'].value_counts()


In [38]:
## this need to convert into int 
## Missing Values found >> Need to handle 

### 3.3 make

In [39]:
df['make'].dtype

In [40]:
df['make'].nunique()

In [41]:
df['make'].unique() ### Nominal Categorical data found >> Solution :: One hot encoding 

### 3.4 fuel-type

In [42]:
df['fuel-type'].dtype

In [43]:
df['fuel-type'].nunique()

In [44]:
df['fuel-type'].value_counts()  ### Categorical [Nominal data]

In [45]:
df['fuel-type'].value_counts().to_dict()

### 3.5 aspiration

In [46]:
df.aspiration.dtype

In [47]:
df.aspiration.nunique()

In [48]:
df.aspiration.value_counts()  ### Categorical [Nominal]

In [49]:
df.aspiration.value_counts().to_dict()

# Step 4: Feature Engineering 

##### Replace question marks (?) with null values

In [50]:
df = df.replace({"?":np.nan})

In [51]:
df['normalized-losses'] = df['normalized-losses'].astype(float)

In [52]:
## Handling the missing values for normalized-losses.
## A cell only displays its last expression, so the three candidate fill values
## are collected into one Series to compare them side by side.
pd.Series({
    'mean': df['normalized-losses'].mean(),
    'median': df['normalized-losses'].median(),
    'mode': df['normalized-losses'].mode()[0],
})


In [53]:
sns.kdeplot(df['normalized-losses'])

In [54]:
df['normalized-losses'].fillna(df['normalized-losses'].median(), inplace=True)

In [55]:
df['normalized-losses'].isna().sum()

In [56]:
df['normalized-losses'].dtype

In [57]:
df['normalized-losses'] = df['normalized-losses'].astype(int)

In [58]:
df['normalized-losses'].dtype

### 4.3 make 

In [59]:
df = pd.get_dummies(df,columns=['make'])

In [60]:
df.info()

### 4.4 fuel-type

In [61]:
df['fuel-type'].replace({'gas': 1, 'diesel': 0}, inplace=True)

In [62]:
df['fuel-type'].dtype

In [63]:
df['fuel-type'].isna().sum()

### 4.5 aspiration

In [64]:
df.aspiration.replace({'std': 0, 'turbo': 1}, inplace=True)

In [65]:
df.aspiration.isna().sum()

In [66]:
df.aspiration.dtype

In [67]:
df.columns

### 3.6 num-of-doors

In [68]:
df['num-of-doors'].dtype

In [69]:
df['num-of-doors'].nunique()

In [70]:
df['num-of-doors'].value_counts() ### Categorical [Ordinal ]

In [71]:
df['num-of-doors'].value_counts().to_dict()

In [72]:
df['num-of-doors'].replace({'four': 4, 'two': 2}, inplace=True)

In [73]:
df['num-of-doors'].isna().sum()

In [74]:
df['num-of-doors'].fillna(df['num-of-doors'].mode()[0], inplace=True)

In [75]:
df['num-of-doors'].isna().sum()

In [76]:
df['num-of-doors'].dtype

In [77]:
df['num-of-doors'] = df['num-of-doors'].astype(int)

In [78]:
df['num-of-doors'].dtype

### 3.7 body-style

In [79]:
df['body-style'].dtype

In [80]:
df['body-style'].value_counts() ### Categorical [Nominal Data] >> one hot 

In [81]:
df = pd.get_dummies(df, columns=['body-style'])

In [82]:
# df.info()

### 3.8 drive-wheels

In [83]:
df['drive-wheels'].value_counts() ## >> Categorical nominal data  

In [84]:
df = pd.get_dummies(df, columns=['drive-wheels'])

In [85]:
df.info()

### 3.9 engine-location

In [86]:
df['engine-location'].value_counts().to_dict()

In [87]:
df['engine-location'].replace({'front': 0, 'rear': 1}, inplace=True)

In [88]:
df['engine-location'].dtype

In [89]:
df['engine-location'].isna().sum()

### 3.10  wheel-base

In [90]:
# Show the wheel-base column.
df['wheel-base']

In [91]:
# Kernel density estimate of wheel-base.
sns.kdeplot(df['wheel-base'])

In [92]:
# Box plot with the Series passed positionally.
# NOTE(review): the next cell plots the same column with an explicit x=;
# presumably only one of the two cells is needed - confirm and drop the other.
sns.boxplot(df['wheel-base'])

In [93]:
# Box plot with the column explicitly mapped to the x axis.
sns.boxplot(x = df['wheel-base'])

### 3.11 engine-type

In [94]:
df['engine-type'].value_counts()

In [95]:
df = pd.get_dummies(df,columns=['engine-type'])

In [96]:
# df.info()

### 3.12 num-of-cylinders

In [97]:
df['num-of-cylinders'].value_counts() ### Categorcial Oridnal ## label encoding 

In [98]:
df['num-of-cylinders'].value_counts().to_dict()

In [99]:
df['num-of-cylinders'].replace({'four': 4,
 'six': 6,
 'five': 5,
 'eight': 8,
 'two': 2,
 'three': 3,
 'twelve': 12}, inplace=True)

In [100]:
df.info()

### 3.13 fuel-system

In [101]:
df['fuel-system'].value_counts()

In [102]:
df = pd.get_dummies(df, columns = ['fuel-system'])

### 3.14 bore 

In [103]:
# df['bore'].value_counts()

In [104]:
df['bore'] = df['bore'].astype(float)

In [105]:
df['bore'].isna().sum()

In [106]:
sns.kdeplot(df['bore'])

In [107]:
df['bore'].fillna(df['bore'].median(), inplace=True)

In [108]:
# df.info()

### 3.15 stroke

In [109]:
# df.stroke.value_counts()

In [110]:
df.stroke.isna().sum()

In [111]:
df.stroke.fillna(df.stroke.median(), inplace=True)

In [112]:
df.stroke.isna().sum()

In [113]:
df.stroke = df.stroke.astype(float)

### 3.16 horsepower

In [114]:
df['horsepower']

In [115]:
df['horsepower'].isna().sum()

In [116]:
df['horsepower'].median()

In [117]:
df['horsepower'].fillna(df['horsepower'].median(), inplace=True)

In [118]:
df['horsepower'] = df['horsepower'].astype(int)

In [119]:
sns.kdeplot(df['horsepower'])

In [120]:
df.info()

### 3.17 peak-rpm

In [121]:
df['peak-rpm'].isna().sum()

In [122]:
df['peak-rpm'].fillna(df['peak-rpm'].median(),inplace=True)

In [123]:
df['peak-rpm'] = df['peak-rpm'].astype(int)

In [124]:
df['peak-rpm'].isna().sum()

In [125]:
df['peak-rpm'].dtype

### 3.18 price 

In [126]:
df['price']

In [127]:
df['price'].fillna(df['price'].median(), inplace=True)

In [128]:
df['price'] = df['price'].astype(int)

In [129]:
df.info()

In [130]:
df.corr()

In [131]:
df.iloc[:,0:10].corr()

In [132]:
df.columns
col_list = ['symboling', 'normalized-losses', 'fuel-type', 'aspiration',
       'num-of-doors', 'engine-location', 'wheel-base', 'length', 'width',
       'height', 'curb-weight', 'num-of-cylinders', 'engine-size', 'bore','price']
df.loc[:,col_list].corr().tail(1)

# Step 5. Feature Selection

In [133]:
# (rows, columns) of the frame after all encoding steps above.
df.shape

# Step 6: Model Training

In [134]:
x = df.drop('price', axis=1)
y = df['price']

x_train,x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=9)

In [135]:
x_train.shape,y_train.shape, x_test.shape, y_test.shape

In [136]:
model = LinearRegression()
model.fit(x_train,y_train)

# Step 7: Model Evaluation

In [137]:
## Testing data model Evalaution 

y_pred = model.predict(x_test)

mse = mean_squared_error(y_test,y_pred)
print(f"MSE = {mse}")
print(f"RMSE = {np.sqrt(mse)}")

mae = mean_absolute_error(y_test,y_pred)
print(f"MAE = {mae}")

r2 = r2_score(y_test,y_pred)
print(f"R-Squared Value = {r2}")



In [138]:
## Training data model Evalaution 

y_pred = model.predict(x_train)

mse = mean_squared_error(y_train,y_pred)
print(f"MSE = {mse}")
print(f"RMSE = {np.sqrt(mse)}")

mae = mean_absolute_error(y_train,y_pred)
print(f"MAE = {mae}")

r2 = r2_score(y_train,y_pred)
print(f"R-Squared Value = {r2}")



In [139]:
## Bias and variance trade-off
#
# Written as comments: free text in a code cell is a SyntaxError and stops
# "Run All" at this cell.
#
# Bias     >> low bias
# Variance >> low variance
#
# Generalised / best model

In [None]:
model

In [None]:
### Save this model to pickle file

In [None]:
with open('linear_model.pkl','wb') as file: 
    pickle.dump(model,file)

### User Defined Function 

In [None]:
# Raw (pre-encoding) input columns of the dataset, kept for reference. They
# were a bare expression before; naming them keeps the list usable.
raw_input_columns = [
    'symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
    'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',
    'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type',
    'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke',
    'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg',
    'highway-mpg',
]

# The model expects one row with exactly the encoded columns it was trained on.
# `[[]]` is a row with zero features, which makes predict() raise, so a real
# row from the held-out set serves as the sample input. The double brackets
# keep it a one-row DataFrame.
data = x_test.iloc[[0]]

price = model.predict(data)
print(price)