In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.svm import SVR
from sklearn import set_config
set_config(display = 'diagram')

In [2]:
df = pd.read_csv('Social_Network_Ads.csv')
df.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [3]:
df.isna().sum()

User ID            0
Gender             0
Age                0
EstimatedSalary    0
Purchased          0
dtype: int64

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User ID          400 non-null    int64 
 1   Gender           400 non-null    object
 2   Age              400 non-null    int64 
 3   EstimatedSalary  400 non-null    int64 
 4   Purchased        400 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 15.8+ KB


In [5]:
X = df.drop(columns= ['Purchased'])
y = df['Purchased']

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42 )

In [7]:
X_train.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary
3,15603246,Female,27,57000
18,15704583,Male,46,28000
202,15735549,Female,39,134000
250,15810075,Female,44,39000
274,15692819,Female,57,26000


In [8]:
numeric_feature = X_train.select_dtypes('number').columns
categorical_features = X_train.select_dtypes('object').columns

In [9]:
numeric_transformer = make_pipeline(SimpleImputer(strategy = 'median'), StandardScaler())

In [10]:
categorical_transformer = make_pipeline(SimpleImputer(strategy = 'constant', fill_value = "missing"), OneHotEncoder())

In [11]:
col_transformer = make_column_transformer(
  (numeric_transformer, numeric_feature), 
    (categorical_transformer, categorical_features),
remainder = 'passthrough')

In [12]:
col_transformer.fit(X_train)

In [13]:
pipe = make_pipeline(col_transformer, SVR())

In [14]:
pipe.fit(X_train, y_train)

In [15]:
from sklearn.model_selection import cross_validate
with_categorical_score = cross_validate(pipe, X_train, y_train, return_train_score = True)

In [16]:
categorical_score = pd.DataFrame(with_categorical_score)
categorical_score

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.073368,0.015621,0.795934,0.66789
1,0.031243,0.01562,0.650135,0.705756
2,0.03124,0.015623,0.690988,0.690631
3,0.015621,0.015619,0.339165,0.770832
4,0.031242,0.0,0.570125,0.714782


In [17]:
pipe.predict(X_test)

array([ 0.87625631,  0.86736248, -0.07931743,  0.69217123, -0.05313089,
       -0.13901417,  0.69190223,  0.02817457,  0.16325828,  0.19807422,
       -0.10076782,  0.89433916,  0.01065316,  0.07582655,  0.15985929,
        0.70858049,  1.00049024,  0.3911424 ,  0.10012762,  0.99641551,
        0.10678977,  0.05147133,  0.74980205,  0.84336177,  0.12759535,
        1.20580316,  0.31666013, -0.06163329,  0.92748707,  0.05981917,
        0.63502034, -0.09711366,  1.10004291,  0.00401637,  0.75138167,
        0.06231008, -0.11532233, -0.03335393,  0.00189706, -0.00192054,
        0.84640603, -0.05932713, -0.01590288,  0.90495651, -0.03506879,
        1.00725144, -0.13829486, -0.09997341,  0.75073696, -0.0057964 ,
       -0.01054077,  0.74613218, -0.14669686, -0.03868968,  0.03029141,
        0.00192769,  1.18289547,  0.51794472, -0.0032324 ,  0.01259342,
        0.17137968, -0.03461878,  0.77194433,  0.14285665, -0.1122674 ,
        0.9451779 ,  0.3240872 ,  1.01618762, -0.05060527,  0.68

In [18]:
pipe.score(X_train, y_train)

0.7037493762398916

In [19]:
pipe.score(X_test, y_test)

0.7330628059965907