In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

In [2]:
df = pd.read_csv('./data/50_Startups.csv')

In [3]:
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [4]:
df.isna().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

In [5]:
df.describe()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
count,50.0,50.0,50.0,50.0
mean,73721.6156,121344.6396,211025.0978,112012.6392
std,45902.256482,28017.802755,122290.310726,40306.180338
min,0.0,51283.14,0.0,14681.4
25%,39936.37,103730.875,129300.1325,90138.9025
50%,73051.08,122699.795,212716.24,107978.19
75%,101602.8,144842.18,299469.085,139765.9775
max,165349.2,182645.56,471784.1,192261.83


In [6]:
df.sort_values('Marketing Spend', ascending=True)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
19,86419.7,153514.11,0.0,New York,122776.86
47,0.0,135426.92,0.0,California,42559.73
48,542.05,51743.15,0.0,New York,35673.41
45,1000.23,124153.04,1903.93,New York,64926.08
44,22177.74,154806.14,28334.72,California,65200.33
43,15505.73,127382.3,35534.17,New York,69758.98
49,0.0,116983.8,45173.06,California,14681.4
32,63408.86,129219.61,46085.25,California,97427.84
31,61136.38,152701.92,88218.23,New York,97483.56
30,61994.48,115641.28,91131.24,Florida,99937.59


In [7]:
y = df['Profit']

In [8]:
X = df.drop('Profit', axis=1)

In [9]:
X.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State
0,165349.2,136897.8,471784.1,New York
1,162597.7,151377.59,443898.53,California
2,153441.51,101145.55,407934.54,Florida
3,144372.41,118671.85,383199.62,New York
4,142107.34,91391.77,366168.42,Florida


In [10]:
X.describe()

Unnamed: 0,R&D Spend,Administration,Marketing Spend
count,50.0,50.0,50.0
mean,73721.6156,121344.6396,211025.0978
std,45902.256482,28017.802755,122290.310726
min,0.0,51283.14,0.0
25%,39936.37,103730.875,129300.1325
50%,73051.08,122699.795,212716.24
75%,101602.8,144842.18,299469.085
max,165349.2,182645.56,471784.1


In [11]:
y.head()

0    192261.83
1    191792.06
2    191050.39
3    182901.99
4    166187.94
Name: Profit, dtype: float64

In [12]:
sorted(X['State'].unique())

['California', 'Florida', 'New York']

In [13]:
from sklearn.preprocessing import OneHotEncoder

In [14]:
from sklearn.compose import ColumnTransformer

In [15]:
ct = ColumnTransformer([('onehot', OneHotEncoder(), [3])], remainder='passthrough')

In [16]:
# 데이터 프리프로세싱 단계에서
# 문자열을 숫자로 바꾸는 작업, 피처 스케일링 파이프라인으로 처리하는게 좋다

In [17]:
from sklearn.linear_model import LinearRegression

In [18]:
regressor = LinearRegression()

In [19]:
from sklearn.pipeline import Pipeline

In [25]:
X_train

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State
40,28754.33,118546.05,172795.67,California
17,94657.16,145077.58,282574.31,New York
18,91749.16,114175.79,294919.57,Florida
0,165349.2,136897.8,471784.1,New York
27,72107.6,127864.55,353183.81,New York
33,55493.95,103057.49,214634.81,Florida
36,28663.76,127056.21,201126.82,Florida
42,23640.93,96189.63,148001.11,California
10,101913.08,110594.11,229160.95,Florida
3,144372.41,118671.85,383199.62,New York


In [21]:
pipe = Pipeline(steps=[('preprocessing', ct), 
                       ('modeling', regressor)])

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=20)

In [24]:
pipe.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessing', ...), ('modeling', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('onehot', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [26]:
y_pred = pipe.predict(X_test)

In [27]:
from sklearn.metrics import mean_squared_error, r2_score

In [28]:
mean_squared_error(y_test, y_pred)

103140891.12164843

In [29]:
r2_score(y_test, y_pred)

0.9438509847889336

In [30]:
from sklearn.ensemble import RandomForestRegressor

In [31]:
regressor2 = RandomForestRegressor()

In [32]:
pipe2 = Pipeline(steps=[('preprocessing', ct), 
                       ('modeling', regressor2)])

In [33]:
pipe2.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessing', ...), ('modeling', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('onehot', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [34]:
y_pred2 = pipe2.predict(X_test)

In [35]:
mean_squared_error(y_test, y_pred2)

147077619.3436918

In [36]:
r2_score(y_test, y_pred2)

0.919932207333789

In [37]:
from xgboost import XGBRegressor

In [38]:
regressor3 = XGBRegressor()

In [39]:
pipe3 = Pipeline(steps=[('preprocessing', ct), 
                       ('modeling', regressor3)])

In [40]:
pipe3.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessing', ...), ('modeling', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('onehot', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [41]:
y_pred3 = pipe3.predict(X_test)

In [42]:
mean_squared_error(y_test, y_pred3)

300487368.955356

In [47]:
r2_score(y_test, y_pred3)

0.836417257338721

In [44]:
import os
os.mkdir('./model')

FileExistsError: [WinError 183] 파일이 이미 있으므로 만들 수 없습니다: './model'

In [48]:
import joblib

In [49]:
joblib.dump(pipe, './model/pipe.pkl')

['./model/pipe.pkl']