In [2]:
#import streamlit as st
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly_express as px
import os
import joblib 

from sklearn.linear_model import RidgeCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [3]:
# read csv into a dataframe
df = pd.read_csv("uploaded_data.csv")

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [5]:
df.isnull().values.any()


False

In [6]:
df.shape

(398, 10)

In [7]:
model_df = df.drop(['Unnamed: 0','car name'],axis = 1)

In [8]:
model_df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
0,18.0,8,307.0,130,3504,12.0,70,1
1,15.0,8,350.0,165,3693,11.5,70,1
2,18.0,8,318.0,150,3436,11.0,70,1
3,16.0,8,304.0,150,3433,12.0,70,1
4,17.0,8,302.0,140,3449,10.5,70,1


In [9]:
# note horsepower is an object
model_df.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower       object
weight            int64
acceleration    float64
model year        int64
origin            int64
dtype: object

In [10]:
model_df.isnull().values.any()

False

In [11]:
model_df['horsepower'] = pd.to_numeric(model_df['horsepower'],errors='coerce')

In [12]:
# this created null values
model_df.isnull().values.any()

True

In [13]:
model_df.isnull().values.sum()

6

In [14]:
model_df['horsepower'] = model_df['horsepower'].fillna(np.mean(model_df['horsepower']))

In [15]:
# this created null values
model_df.isnull().values.any()

False

In [16]:
model_df['origin'].value_counts()

1    249
3     79
2     70
Name: origin, dtype: int64

In [17]:
model_df['cylinders'].value_counts()

4    204
8    103
6     84
3      4
5      3
Name: cylinders, dtype: int64

In [18]:
# one hot encode origin and cylinders

encoder = OneHotEncoder(sparse=False)

cyln_encode = pd.DataFrame(encoder.fit_transform(model_df[['cylinders']]))
cyln_encode.columns = ['4 cyln','8 cyln','6 cyln','3 cyln','5 cyln']

In [19]:
cyln_encode.head()

Unnamed: 0,4 cyln,8 cyln,6 cyln,3 cyln,5 cyln
0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,1.0


In [20]:
cyln_encode.columns

Index(['4 cyln', '8 cyln', '6 cyln', '3 cyln', '5 cyln'], dtype='object')

In [21]:
# one hot encode origin and cylinders

# origin
origin_encode = pd.DataFrame(encoder.fit_transform(model_df[['origin']]))
origin_encode.columns = ['origin one','origin two','origin three']

In [22]:
origin_encode.head()

Unnamed: 0,origin one,origin two,origin three
0,1.0,0.0,0.0
1,1.0,0.0,0.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,1.0,0.0,0.0


In [23]:
origin_encode.columns

Index(['origin one', 'origin two', 'origin three'], dtype='object')

In [24]:
temp_df = model_df.join(cyln_encode)
model_df = temp_df.join(origin_encode)

In [25]:
model_df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,4 cyln,8 cyln,6 cyln,3 cyln,5 cyln,origin one,origin two,origin three
0,18.0,8,307.0,130.0,3504,12.0,70,1,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1,15.0,8,350.0,165.0,3693,11.5,70,1,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2,18.0,8,318.0,150.0,3436,11.0,70,1,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
3,16.0,8,304.0,150.0,3433,12.0,70,1,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4,17.0,8,302.0,140.0,3449,10.5,70,1,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0


In [26]:
# drop cylinders and origin

model_df = model_df.drop(['cylinders','origin'], axis = 1)

In [27]:
model_df.head()

Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,model year,4 cyln,8 cyln,6 cyln,3 cyln,5 cyln,origin one,origin two,origin three
0,18.0,307.0,130.0,3504,12.0,70,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1,15.0,350.0,165.0,3693,11.5,70,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2,18.0,318.0,150.0,3436,11.0,70,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
3,16.0,304.0,150.0,3433,12.0,70,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4,17.0,302.0,140.0,3449,10.5,70,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0


In [28]:
model_df.columns

Index(['mpg', 'displacement', 'horsepower', 'weight', 'acceleration',
       'model year', '4 cyln', '8 cyln', '6 cyln', '3 cyln', '5 cyln',
       'origin one', 'origin two', 'origin three'],
      dtype='object')

In [29]:
model_df.dtypes

mpg             float64
displacement    float64
horsepower      float64
weight            int64
acceleration    float64
model year        int64
4 cyln          float64
8 cyln          float64
6 cyln          float64
3 cyln          float64
5 cyln          float64
origin one      float64
origin two      float64
origin three    float64
dtype: object

In [37]:
# modeling

# seperate target from the features
#clau comment: added .values to get the data values only and not the column names
##if column names in model, you will get the warning in the predict page
##because you are sending and array of data but the given array has no columns names, which triggers the warning
##as long as the order of the variables remain the same from the model to the new input to predict, it should not cause any issues

y = model_df['mpg'].values
X = model_df.drop(['mpg'], axis = 1).values

In [38]:
X

array([[3.070e+02, 1.300e+02, 3.504e+03, ..., 1.000e+00, 0.000e+00,
        0.000e+00],
       [3.500e+02, 1.650e+02, 3.693e+03, ..., 1.000e+00, 0.000e+00,
        0.000e+00],
       [3.180e+02, 1.500e+02, 3.436e+03, ..., 1.000e+00, 0.000e+00,
        0.000e+00],
       ...,
       [1.350e+02, 8.400e+01, 2.295e+03, ..., 1.000e+00, 0.000e+00,
        0.000e+00],
       [1.200e+02, 7.900e+01, 2.625e+03, ..., 1.000e+00, 0.000e+00,
        0.000e+00],
       [1.190e+02, 8.200e+01, 2.720e+03, ..., 1.000e+00, 0.000e+00,
        0.000e+00]])

In [39]:
# split data into training and test sets

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [40]:
# check
X_train.shape,X_test.shape,model_df.shape

((318, 13), (80, 13), (398, 14))

In [54]:
#X_train.isnull().values.any()

In [42]:
# ridge regression

# scale the data

model = RidgeCV()
scaler = StandardScaler().fit(X_train)

In [43]:
# save the model to disk
joblib.dump(scaler,"ridge_scalar.sav")

['ridge_scalar.sav']

In [44]:
X_scaled = scaler.transform(X_train)

In [45]:
len(X_scaled)

318

In [46]:
model.fit(X_scaled,y_train)

In [47]:
model.score(X_scaled, y_train)

0.8491124997817235

In [48]:
X_scaled_test = scaler.transform(X_test)

In [49]:
model.score(X_scaled_test, y_test)

0.8263823761161354

In [50]:
model.predict(X_scaled_test),y_test

(array([29.74605908, 27.21086557, 29.28903059, 28.09282072, 28.04686292,
        25.87117207, 17.59758195, 33.99751187, 11.90856411, 13.14579877,
        21.68534898, 29.66431378, 30.00664279, 23.92839143, 22.86563821,
        15.54246173, 20.76858288, 31.08487531, 26.52230135, 36.34751463,
        27.73384137, 34.55843926, 29.67014316, 26.36140775, 28.77162083,
        30.05835176, 15.16682286, 30.23668688, 11.05261217, 28.45953657,
        14.34600276, 30.70395442, 25.2341929 , 15.4763921 , 29.3073522 ,
        17.97281034,  9.73382072, 29.64038098, 29.69363907, 20.46530762,
        25.17166379, 30.1047521 , 27.4121892 , 34.65776335, 12.60280831,
        24.06533818, 17.78650619, 18.26852246, 17.39626378, 28.57453848,
        27.2667389 , 23.431746  , 19.17818499,  9.64110598, 29.16031934,
        24.94932984, 22.26459309, 31.84519178, 28.64942177, 35.79187765,
        15.86717626, 34.88012673, 19.07070815, 33.50978723, 19.04190354,
        18.09882401, 31.71378219, 22.19756359, 32.5

In [51]:
len(model.coef_)

13

In [52]:
print("MSE:", mean_squared_error(y_test, model.predict(X_scaled_test)))
print("R2:", r2_score(y_test, model.predict(X_scaled_test)))

MSE: 10.689155313623269
R2: 0.8263823761161354


In [53]:
# save the model to disk
joblib.dump(model,"ridge_model.sav")

['ridge_model.sav']