In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import set_config
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
# Render estimators as an HTML diagram when they are displayed as a cell result.
set_config(display = 'diagram')

In [2]:
# Load the diamonds data; the path is relative, so the CSV must sit in the working directory.
df = pd.read_csv('M6_T2_V1_Diamonds.csv')

In [3]:
# Peek at the first rows to confirm the file parsed into the expected columns.
df.head()

Unnamed: 0,carat,cut,color,clarity,price,x,y,z
0,0.23,Ideal,E,SI2,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,335,4.34,4.35,2.75


In [4]:
# Separate the regression target (price) from the predictor columns.
y = df['price']
X = df.drop(columns=['price'])

In [5]:
# Inspect column dtypes and non-null counts before choosing preprocessing steps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float64
 1   cut      53940 non-null  object 
 2   color    53940 non-null  object 
 3   clarity  53940 non-null  object 
 4   price    53940 non-null  int64  
 5   x        53940 non-null  float64
 6   y        53940 non-null  float64
 7   z        53940 non-null  float64
dtypes: float64(4), int64(1), object(3)
memory usage: 3.3+ MB


In [6]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split reproducible across runs. (train_test_split is imported in the top
# import cell together with the other scikit-learn names.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [7]:
# Spot-check the training predictors (X was built without the price column).
X_train.head()

Unnamed: 0,carat,cut,color,clarity,x,y,z
26546,2.01,Good,F,SI2,8.23,8.19,4.77
9159,1.01,Very Good,E,SI2,6.57,6.49,3.92
14131,1.1,Premium,H,VS2,6.59,6.54,4.1
15757,1.5,Good,E,SI2,7.21,7.17,4.42
24632,1.52,Very Good,G,VS1,7.27,7.32,4.53


In [8]:
# Split the predictor columns by dtype so each group can get its own preprocessing.
categorical_features = X_train.select_dtypes(include='object').columns
numeric_feature = X_train.select_dtypes(include='number').columns

In [9]:
# Numeric branch: fill missing values with the column median, then standardise.
numeric_transformer = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler(),
)

In [10]:
# Categorical branch: replace missing values with the literal label "missing",
# then one-hot encode.
# handle_unknown='ignore' makes the encoder emit an all-zero row for a category
# it did not see during fit instead of raising, so a rare level that is absent
# from one cross-validation training fold (or from the training split) cannot
# crash scoring or prediction.
categorical_transformer = make_pipeline(
    SimpleImputer(strategy='constant', fill_value="missing"),
    OneHotEncoder(handle_unknown='ignore'),
)

In [11]:
# Send each column group through its own preprocessing pipeline; a column that
# belongs to neither group is passed through unchanged.
col_transformer = make_column_transformer(
    (numeric_transformer, numeric_feature),
    (categorical_transformer, categorical_features),
    remainder='passthrough',
)

In [12]:
# Show the preprocessing diagram without fitting. A standalone
# col_transformer.fit(X_train) is redundant: the pipeline built from this same
# object fits it again as part of pipe.fit(X_train, y_train).
col_transformer

In [13]:
# Full model: column-wise preprocessing followed by ordinary least squares.
pipe = make_pipeline(
    col_transformer,
    LinearRegression(),
)


In [14]:
# Fit the preprocessing steps and the linear model together on the training split.
pipe.fit(X_train, y_train)

In [15]:
# Cross-validate the whole pipeline on the training split only, so the test
# split stays untouched. return_train_score=True adds a train_score entry next
# to test_score, which makes over- or under-fitting visible per fold.
# (cross_validate is imported in the top import cell.)
with_categorical_score = cross_validate(
    pipe, X_train, y_train, return_train_score=True
)

In [16]:
# Tabulate the cross-validation results: one row per fold, one column per metric.
categorical_score = pd.DataFrame(data=with_categorical_score)
categorical_score

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.123035,0.015623,0.921709,0.919143
1,0.109371,0.015623,0.91709,0.920282
2,0.109344,0.015625,0.913574,0.921002
3,0.10937,0.015622,0.922482,0.918904
4,0.124986,0.015624,0.921727,0.919118


In [17]:
# Average every column (timings and scores) across the folds.
categorical_score.mean()

fit_time       0.115221
score_time     0.015623
test_score     0.919316
train_score    0.919690
dtype: float64

In [18]:
# Predict the target for the held-out test rows (displayed only, not stored).
pipe.predict(X_test)

array([ 722.06692488, 3088.69247646, 1953.96118851, ...,  722.22990483,
       8708.19560809, 3105.20448876])

In [19]:
# Score of the fitted pipeline on the training split (R^2 for LinearRegression).
pipe.score(X_train, y_train)

0.9196548453858651

In [20]:
# Score of the fitted pipeline on the held-out test split, for comparison with the training score.
pipe.score(X_test, y_test)

0.9186672428019742