In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn import set_config
# Ask scikit-learn to render estimators with its 'diagram' display mode when a
# cell ends with an estimator or pipeline.
set_config(display='diagram')

In [2]:
# Load the advertising data from a CSV file expected in the notebook's working
# directory, then preview the resulting frame.
df = pd.read_csv('advertising.csv')
df

Unnamed: 0,TV,Radio,Newspaper,Sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,12.0
3,151.5,41.3,58.5,16.5
4,180.8,10.8,58.4,17.9
...,...,...,...,...
195,38.2,3.7,13.8,7.6
196,94.2,4.9,8.1,14.0
197,177.0,9.3,6.4,14.8
198,283.6,42.0,66.2,25.5


In [3]:
# Summarise the frame: row count, column dtypes and non-null counts per column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   TV         200 non-null    float64
 1   Radio      200 non-null    float64
 2   Newspaper  200 non-null    float64
 3   Sales      200 non-null    float64
dtypes: float64(4)
memory usage: 6.4 KB


In [4]:
# Count missing values per column (`isnull` is pandas' alias of `isna`).
df.isnull().sum()

TV           0
Radio        0
Newspaper    0
Sales        0
dtype: int64

In [5]:
# Feature matrix: every column of `df` except the 'Sales' target.
X = df.drop(columns='Sales')
X

Unnamed: 0,TV,Radio,Newspaper
0,230.1,37.8,69.2
1,44.5,39.3,45.1
2,17.2,45.9,69.3
3,151.5,41.3,58.5
4,180.8,10.8,58.4
...,...,...,...
195,38.2,3.7,13.8
196,94.2,4.9,8.1
197,177.0,9.3,6.4
198,283.6,42.0,66.2


In [6]:
# Target vector: the 'Sales' column, selected by label for all rows.
y = df.loc[:, 'Sales']
y

0      22.1
1      10.4
2      12.0
3      16.5
4      17.9
       ... 
195     7.6
196    14.0
197    14.8
198    25.5
199    18.4
Name: Sales, Length: 200, dtype: float64

In [7]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split reproducible across kernel restarts.
# `train_test_split` is imported in the notebook's import cell at the top, so
# the import is no longer repeated mid-notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [8]:
# Names of the numeric columns in the training features; these are the columns
# routed through the numeric preprocessing pipeline.
numeric_feature = X_train.select_dtypes(include='number').columns

In [9]:
# Numeric preprocessing: impute missing values with the column median, then
# standardise each column.
numeric_transformer = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler(),
)

In [10]:
# Apply the numeric preprocessing to the numeric columns; any column not listed
# is passed through unchanged.
col_transformer = make_column_transformer(
    (numeric_transformer, numeric_feature),
    remainder='passthrough',
)

In [11]:
# Fit the column transformer on the training features by itself and render the
# fitted estimator as the cell output.
# NOTE(review): the pipeline defined further down contains this same
# transformer and is fitted on X_train as well, so this standalone fit appears
# to serve only as a sanity check.
col_transformer.fit(X=X_train)

In [12]:
# Full model: column preprocessing -> degree-2 polynomial expansion (no bias
# column) -> ordinary linear regression.
pipe = make_pipeline(
    col_transformer,
    PolynomialFeatures(degree=2, include_bias=False),
    LinearRegression(),
)

In [13]:
# Fit every pipeline step on the training split only; the test split stays
# untouched until the final evaluation.
pipe.fit(X=X_train, y=y_train)

In [14]:
# Cross-validate the full pipeline on the training split only. `cv=5` is spelled
# out instead of relying on the library default, and train scores are kept so
# each fold's training fit can be compared with its validation score.
# `cross_validate` is imported in the notebook's import cell at the top, so the
# import is no longer repeated mid-notebook.
# NOTE(review): the name `with_categorical_score` looks like a leftover (X holds
# only numeric columns); it is kept because later cells read it.
with_categorical_score = cross_validate(
    pipe, X_train, y_train, cv=5, return_train_score=True
)

In [15]:
# Tabulate the cross-validation results (one row per fold) and show them.
categorical_score = pd.DataFrame(data=with_categorical_score)
categorical_score

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.0,0.058497,0.926767,0.921452
1,0.009043,0.0,0.868297,0.937134
2,0.0,0.015625,0.924791,0.928593
3,0.015643,0.0,0.953388,0.921944
4,0.015619,0.0,0.891602,0.936188


In [16]:
# Average each cross-validation metric over the folds (column-wise mean).
categorical_score.mean(axis=0)

fit_time       0.008061
score_time     0.014824
test_score     0.912969
train_score    0.929062
dtype: float64

In [17]:
# Score of the fitted pipeline on the data it was trained on.
pipe.score(X=X_train, y=y_train)

0.9288133512730626

In [18]:
# Score of the fitted pipeline on the held-out test split.
pipe.score(X=X_test, y=y_test)

0.9533174341074796