# <center>Automobile: Miles Per Gallon</center>
---

# Regression

## Data Understanding

In [1]:
import numpy as np
import pandas as pd
import sklearn.ensemble as ensemble
import sklearn.metrics as metrics
import sklearn.preprocessing as pre_pro
import sklearn.feature_selection as feat_sel
from sklearn.model_selection import train_test_split

In [2]:
col_names = ["mpg","cylinders","displacement",\
                "horsepower","weight","acceleration","model year","origin","car name"]

data = pd.read_csv("data/auto-mpg.csv",names=col_names,header=None,delim_whitespace=True)

In [3]:
data.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8.0,307.0,130.0,3504.0,12.0,70.0,1.0,chevrolet chevelle malibu
1,15.0,8.0,350.0,165.0,3693.0,11.5,70.0,1.0,buick skylark 320
2,18.0,8.0,318.0,150.0,3436.0,11.0,70.0,1.0,plymouth satellite
3,16.0,8.0,304.0,150.0,3433.0,12.0,70.0,1.0,amc rebel sst
4,17.0,8.0,302.0,140.0,3449.0,10.5,70.0,1.0,ford torino


In [4]:
data.origin.unique()

array([1., 2., 3.])

### [Data Description](https://archive.ics.uci.edu/ml/datasets/auto+mpg)

This dataset is a slightly modified version of the dataset provided in the StatLib library. In line with the use by Ross Quinlan (1993) in predicting the attribute "mpg", 8 of the original instances were removed because they had unknown values for the "mpg" attribute. The original dataset is available in the file "auto-mpg.data-original". 

"The data concerns city-cycle fuel consumption in miles per gallon, to be predicted in terms of 3 multivalued discrete and 5 continuous attributes." (Quinlan, 1993).

### Attribute Information

1. mpg: continuous `target`
2. cylinders: multi-valued discrete 
3. displacement: continuous 
4. horsepower: continuous 
5. weight: continuous 
6. acceleration: continuous 
7. model year: multi-valued discrete 
8. origin: multi-valued discrete: 1-> USA, 2-> Europe, 3->Japan
9. car name: string (unique for each instance):

## Data Preparation

### Data Shape

In [5]:
data.shape

(406, 9)

In [6]:
data.isna().sum()

mpg             8
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model year      0
origin          0
car name        0
dtype: int64

### Handling Missing Data

In [7]:
data["mpg"] = data["mpg"].fillna(data["mpg"].mean())
data["horsepower"] = data["horsepower"].fillna(data["mpg"].mean())

In [8]:
data.drop("car name",inplace=True,axis=1)

In [9]:
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
mpg,406.0,23.514573,7.738404,9.0,17.5,23.0,29.0,46.6
cylinders,406.0,5.475369,1.71216,3.0,4.0,4.0,8.0,8.0
displacement,406.0,194.779557,104.922458,68.0,105.0,151.0,302.0,455.0
horsepower,406.0,103.877063,39.722317,23.514573,75.0,93.5,129.0,230.0
weight,406.0,2979.413793,847.004328,1613.0,2226.5,2822.5,3618.25,5140.0
acceleration,406.0,15.519704,2.803359,8.0,13.7,15.5,17.175,24.8
model year,406.0,75.921182,3.748737,70.0,73.0,76.0,79.0,82.0
origin,406.0,1.568966,0.797479,1.0,1.0,1.0,2.0,3.0


### Feature Scaling

In [10]:
min_max_scaler = pre_pro.MinMaxScaler(feature_range=(0,3))

In [11]:
data[data.columns] = min_max_scaler.fit_transform(data)

In [12]:
data.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
0,0.718085,3.0,1.852713,1.547113,1.608449,0.714286,0.0,0.0
1,0.478723,3.0,2.186047,2.055623,1.769209,0.625,0.0,0.0
2,0.718085,3.0,1.937984,1.83769,1.55061,0.535714,0.0,0.0
3,0.558511,3.0,1.829457,1.83769,1.548058,0.714286,0.0,0.0
4,0.638298,3.0,1.813953,1.692402,1.561667,0.446429,0.0,0.0


In [13]:
data.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model year      0
origin          0
dtype: int64

In [14]:
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
mpg,406.0,1.158078,0.617426,0.0,0.678191,1.117021,1.595745,3.0
cylinders,406.0,1.485222,1.027296,0.0,0.6,0.6,3.0,3.0
displacement,406.0,0.982787,0.813352,0.0,0.286822,0.643411,1.813953,3.0
horsepower,406.0,1.167576,0.57712,0.0,0.748025,1.016809,1.532584,3.0
weight,406.0,1.162246,0.720446,0.0,0.521832,1.028778,1.705628,3.0
acceleration,406.0,1.342804,0.5006,0.0,1.017857,1.339286,1.638393,3.0
model year,406.0,1.480296,0.937184,0.0,0.75,1.5,2.25,3.0
origin,406.0,0.853448,1.196218,0.0,0.0,0.0,1.5,3.0


### Splitting data

In [15]:
X = data.drop("mpg",axis=1)
y = data["mpg"]

In [16]:
X.head()

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model year,origin
0,3.0,1.852713,1.547113,1.608449,0.714286,0.0,0.0
1,3.0,2.186047,2.055623,1.769209,0.625,0.0,0.0
2,3.0,1.937984,1.83769,1.55061,0.535714,0.0,0.0
3,3.0,1.829457,1.83769,1.548058,0.714286,0.0,0.0
4,3.0,1.813953,1.692402,1.561667,0.446429,0.0,0.0


In [17]:
y.head()

0    0.718085
1    0.478723
2    0.718085
3    0.558511
4    0.638298
Name: mpg, dtype: float64

In [18]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=11)

## Modelling

In [19]:
model = ensemble.RandomForestRegressor(n_estimators=300,max_depth=5,random_state=11)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)

## Evaluation

In [20]:
model.score(X_test,y_test)

0.8534995798684861

In [21]:
# mse
mse = metrics.mean_squared_error(y_test,y_pred)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 0.05376361804129654


In [22]:
# rmse
rmse = np.sqrt(mse)
print(f"Mean Squared Error: {rmse}")

Mean Squared Error: 0.23186982995054906


In [23]:
# mae
metrics.mean_absolute_error(y_test,y_pred)

0.17926234605132338

## Feature Selection | Mutual Information for Regression

In [24]:
X1 = X.copy(deep=True)
y1 = y.copy(deep=True)

In [25]:
X1.dtypes

cylinders       float64
displacement    float64
horsepower      float64
weight          float64
acceleration    float64
model year      float64
origin          float64
dtype: object

In [26]:
# creating mask for discrete values
discrete = [True, False, False, False, False, True, True]

In [27]:
mutual_info = feat_sel.mutual_info_regression(X1,y1,discrete_features=discrete,random_state=11)
mutual_info

array([0.6114123 , 0.75444606, 0.66874759, 0.73806993, 0.17910271,
       0.37617608, 0.23572661])

**Selecting features with mutual information > 0.3**

In [28]:
X_train = X_train[X_train.columns[mutual_info>0.3]]
X_test = X_test[X_test.columns[mutual_info>0.3]]

In [29]:
X_train.head(2)

Unnamed: 0,cylinders,displacement,horsepower,weight,model year
49,3.0,2.44186,2.273556,2.842642,0.25
235,1.8,1.410853,1.082189,1.626311,1.75


In [30]:
X_test.head(2)

Unnamed: 0,cylinders,displacement,horsepower,weight,model year
60,0.6,0.023256,0.602736,0.136093,0.25
207,1.8,1.410853,0.791612,1.66799,1.5


### Modelling

In [31]:
model = ensemble.RandomForestRegressor(n_estimators=300,max_depth=5,random_state=11)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)

### Evaluation

In [32]:
model.score(X_test,y_test)

0.8428974942156

In [33]:
# mse
mse = metrics.mean_squared_error(y_test,y_pred)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 0.057654436122030944


In [34]:
# rmse
rmse = np.sqrt(mse)
print(f"Mean Squared Error: {rmse}")

Mean Squared Error: 0.24011338180541072


In [35]:
# mae
metrics.mean_absolute_error(y_test,y_pred)

0.185592119612564

### Features before & after

In [36]:
X.shape,X_train.shape

((406, 7), (324, 5))