<h2 align="center"> Train-test split </h2>

In [56]:
import pandas as pd  

## Auto MPG dataset URL
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"

## The dataset uses various delimiters and contains missing values denoted as '?'
df = pd.read_csv(url, delim_whitespace=True, names=column_names, na_values='?', comment='\t')
df.shape

  df = pd.read_csv(url, delim_whitespace=True, names=column_names, na_values='?', comment='\t')


(398, 9)

In [57]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        398 non-null    float64
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car_name      398 non-null    object 
dtypes: float64(5), int64(3), object(1)
memory usage: 28.1+ KB


In [58]:
df.describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
count,398.0,398.0,398.0,392.0,398.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,104.469388,2970.424623,15.56809,76.01005,1.572864
std,7.815984,1.701004,104.269838,38.49116,846.841774,2.757689,3.697627,0.802055
min,9.0,3.0,68.0,46.0,1613.0,8.0,70.0,1.0
25%,17.5,4.0,104.25,75.0,2223.75,13.825,73.0,1.0
50%,23.0,4.0,148.5,93.5,2803.5,15.5,76.0,1.0
75%,29.0,8.0,262.0,126.0,3608.0,17.175,79.0,2.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,82.0,3.0


In [59]:
## Column names based on the dataset description
column_names = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model_year', 'origin', 'car_name']

## Displaying few rows of the dataframe
df.sample(5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
84,27.0,4,97.0,88.0,2100.0,16.5,72,3,toyota corolla 1600 (sw)
146,28.0,4,90.0,75.0,2125.0,14.5,74,1,dodge colt
384,32.0,4,91.0,67.0,1965.0,15.7,82,3,honda civic (auto)
168,23.0,4,140.0,83.0,2639.0,17.0,75,1,ford pinto
29,27.0,4,97.0,88.0,2130.0,14.5,71,3,datsun pl510


In [60]:
df.isna().sum() ## Checking Missing Values

mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model_year      0
origin          0
car_name        0
dtype: int64

In [61]:
df.horsepower.describe()  ## Descriptive Statistics

count    392.000000
mean     104.469388
std       38.491160
min       46.000000
25%       75.000000
50%       93.500000
75%      126.000000
max      230.000000
Name: horsepower, dtype: float64

In [62]:
value = df.horsepower.mean()
df.horsepower.fillna(value, inplace = True) ### 4. Handling Missing Data : Filling missing values in the 'horsepower' column with its mean value.

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.horsepower.fillna(value, inplace = True) ### 4. Handling Missing Data : Filling missing values in the 'horsepower' column with its mean value.


In [63]:
df.isna().sum() ## Verifying Missing Value Handling

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model_year      0
origin          0
car_name        0
dtype: int64

In [64]:
features = ['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model_year', 'origin']

X = df[features] # standard convention -> X is capital as it has multiple columns
y = df['mpg'] # y is small as it has only one column

X.head()

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,8,307.0,130.0,3504.0,12.0,70,1
1,8,350.0,165.0,3693.0,11.5,70,1
2,8,318.0,150.0,3436.0,11.0,70,1
3,8,304.0,150.0,3433.0,12.0,70,1
4,8,302.0,140.0,3449.0,10.5,70,1


In [65]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [66]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42) ## 20% test and 80% in training

In [67]:
df.shape,X_train.shape, X_test.shape, y_train.shape, y_test.shape

((398, 9), (318, 7), (80, 7), (318,), (80,))

In [68]:
model = LinearRegression()
model.fit(X_train, y_train)

In [69]:
y_pred = model.predict(X_test)
y_pred[:5]

array([32.86345735, 29.58130242, 21.35126058, 16.80568721, 12.50136229])

In [70]:
y_test[:5] ## when value was 33 it predicted 32.86 and so on...it's doing a close match

198    33.0
396    28.0
33     19.0
208    13.0
93     14.0
Name: mpg, dtype: float64

In [71]:
## to actually measure it we can use

In [72]:
## 1. MSE

In [73]:
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(mse, mae)

8.19545210407378 2.25338840593176


In [74]:
## 2. R2 score -> easy to interpret

In [75]:
r2_score(y_test, y_pred)

0.8475731044779435

In [76]:
model.score(X_test, y_test) ## internally its the same thing [It takes X_test and generates y_pred]

0.8475731044779435