### 제2유형 연습하기. 팁 예측하기 (회귀)

In [324]:
import pandas as pd
import numpy as np

# 표준화, 정규화
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# 데이터 분리
from sklearn.model_selection import train_test_split

# 지도 학습 모델 선정 (by 랜덤 포레스트)
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

# 분류 관련 성능 평가
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

# 회귀 관련 성능 평가
from sklearn.metrics import mean_squared_error, r2_score

In [325]:
import seaborn as sns

df = sns.load_dataset('tips')

# df.head()

x = df.drop(columns='tip')
y = df['tip']

# print(x.columns)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2023)

# print(x_train.shape)
# print(x_test.shape)
# print(y_train.shape)

# x_train.dtypes
# y_train.dtypes

# print(x_train.head())

x_train = pd.DataFrame(x_train.reset_index())
x_test = pd.DataFrame(x_test.reset_index())
y_train = pd.DataFrame(y_train.reset_index())

x_train.rename(columns={'index':'cust_id'}, inplace=True)
x_test.rename(columns={'index':'cust_id'}, inplace=True)
y_train.columns = ['cust_id', 'tip']

# print(x_test.head())
cust_id = x_test['cust_id'].copy()

# print(x_train.head())
# print(y_train.head())

### 레스토랑의 tip 예측 문제

#### - 데이터의 결측치, 이상치, 변수에 대해 처리하고
#### - 회귀모델을 사용하여 Rsq, MSE 값을 산출하시오.

#### 데이터셋 설명
* total_bill(총 청구액) : 손님의 식사 총 청구액
* tip(팁) : 팁의 양
* sex(성별) : 손님의 성별
* smoker(흡연자) : 손님의 흡연 여부 ("Yes" 또는 "No")
* day(요일) : 식사가 이루어진 요일
* time(시간) : 점심 또는 저녁 중 언제 식사가 이루어졌는지
* size(인원 수) : 식사에 참석한 인원수

##### 데이터 탐색(EDA)

In [326]:
#결측치 확인

print(x_train.info())
print(x_test.info())
print(y_train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195 entries, 0 to 194
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   cust_id     195 non-null    int64   
 1   total_bill  195 non-null    float64 
 2   sex         195 non-null    category
 3   smoker      195 non-null    category
 4   day         195 non-null    category
 5   time        195 non-null    category
 6   size        195 non-null    int64   
dtypes: category(4), float64(1), int64(2)
memory usage: 6.0 KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49 entries, 0 to 48
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   cust_id     49 non-null     int64   
 1   total_bill  49 non-null     float64 
 2   sex         49 non-null     category
 3   smoker      49 non-null     category
 4   day         49 non-null     category
 5   time        49 non-null     category
 6   size   

In [327]:
print(x_train.isnull().sum())
print(x_test.isnull().sum())
print(y_train.isnull().sum())

cust_id       0
total_bill    0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64
cust_id       0
total_bill    0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64
cust_id    0
tip        0
dtype: int64


In [328]:
# 데이터 형태 확인
print(x_train.dtypes)

cust_id          int64
total_bill     float64
sex           category
smoker        category
day           category
time          category
size             int64
dtype: object


In [329]:
# 각 카테고리별 종류 확인 (x_train)

print(x_train['sex'].value_counts())
print(x_train['smoker'].value_counts())
print(x_train['day'].value_counts())
print(x_train['time'].value_counts())

Male      125
Female     70
Name: sex, dtype: int64
No     120
Yes     75
Name: smoker, dtype: int64
Sat     71
Sun     62
Thur    47
Fri     15
Name: day, dtype: int64
Dinner    142
Lunch      53
Name: time, dtype: int64


In [330]:
# 각 카테고리별 종류 확인 (x_test)

print(x_test['sex'].value_counts())
print(x_test['smoker'].value_counts())
print(x_test['day'].value_counts())
print(x_test['time'].value_counts())

Male      32
Female    17
Name: sex, dtype: int64
No     31
Yes    18
Name: smoker, dtype: int64
Sat     16
Thur    15
Sun     14
Fri      4
Name: day, dtype: int64
Dinner    34
Lunch     15
Name: time, dtype: int64


In [331]:
# 연속값 - 이상치 확인
# 범주형 데이터의 경우 include = 'category' | 'object'

# print(x_train.describe(include='category').T)
# print(x_test.describe(include='category').T)
print(x_train.describe().T)
print(x_test.describe().T)
print(y_train.describe().T)

            count        mean        std   min    25%     50%      75%     max
cust_id     195.0  122.056410  70.668034  0.00  59.50  121.00  182.500  243.00
total_bill  195.0   20.054667   8.961645  3.07  13.42   17.92   24.395   50.81
size        195.0    2.543590   0.942631  1.00   2.00    2.00    3.000    6.00
            count        mean        std   min    25%     50%     75%    max
cust_id      49.0  119.285714  70.918674  2.00  62.00  123.00  180.00  239.0
total_bill   49.0   18.716531   8.669864  5.75  12.74   16.66   21.01   44.3
size         49.0    2.673469   0.987162  2.00   2.00    2.00    3.00    6.0
         count        mean        std  min   25%     50%     75%    max
cust_id  195.0  122.056410  70.668034  0.0  59.5  121.00  182.50  243.0
tip      195.0    3.021692   1.402690  1.0   2.0    2.92    3.53   10.0


In [332]:
# y 데이터 분석

print(y_train.head())

   cust_id   tip
0      158  2.61
1      186  3.50
2       21  2.75
3       74  2.20
4       43  1.32


#### 원핫 인코딩 및 모델링

In [333]:
x_train = pd.get_dummies(x_train)
x_test = pd.get_dummies(x_test)

x_train.drop(columns={'cust_id'}, axis=1, inplace=True)
x_test.drop(columns={'cust_id'}, axis=1, inplace=True)

print(x_train.shape)
print(x_test.shape)

(195, 12)
(49, 12)


In [334]:
print(x_train.head(3))
print(x_test.head(3))

   total_bill  size  sex_Male  sex_Female  smoker_Yes  smoker_No  day_Thur  \
0       13.39     2         0           1           0          1         0   
1       20.90     3         0           1           1          0         0   
2       20.29     2         0           1           0          1         0   

   day_Fri  day_Sat  day_Sun  time_Lunch  time_Dinner  
0        0        0        1           0            1  
1        0        0        1           0            1  
2        0        1        0           0            1  
   total_bill  size  sex_Male  sex_Female  smoker_Yes  smoker_No  day_Thur  \
0       19.77     4         1           0           0          1         0   
1       24.59     4         0           1           0          1         0   
2        9.55     2         1           0           0          1         0   

   day_Fri  day_Sat  day_Sun  time_Lunch  time_Dinner  
0        0        0        1           0            1  
1        0        0        1          

In [335]:
# 모델링 적용

model = RandomForestRegressor()

model.fit(x_train, y_train['tip'])

In [336]:
# 검증 데이터 분리 및 성능 측정

X_train, X_val, Y_train, Y_val = train_test_split(x_train, y_train['tip'], test_size=0.2, random_state=23)

model_val = RandomForestRegressor()
model_val.fit(X_train, Y_train)

y_pred = model_val.predict(X_val)

print(y_pred.shape)
print(Y_val.shape)

(39,)
(39,)


In [337]:
mse = mean_squared_error(Y_val, y_pred)
rsq = r2_score(Y_val, y_pred)

print("MSE : ", mse)
print("R2 Score : ", rsq)
print("RMSE : ", mse**2)

MSE :  1.010733282564102
R2 Score :  0.41146924198196044
RMSE :  1.0215817684828048


In [338]:
# 실제 예측값 적용



y_result = model.predict(x_test)

print(y_result.shape)
# print(y_test.shape)

(49,)


In [339]:
# 결과값 가공 및 저장

final_result = pd.DataFrame({'cust_id':cust_id, 'result':y_result})

print(final_result)

final_result.to_csv('result.csv', index=False)

    cust_id  result
0       154  3.2726
1         4  2.9059
2        30  1.6660
3        75  1.6438
4        33  3.1181
5       149  1.4469
6       164  2.9235
7        61  2.2072
8       136  1.8987
9       211  3.5402
10      102  4.4278
11      132  1.9858
12      205  2.8505
13      185  3.2585
14       99  2.1420
15       92  1.5273
16       91  3.7163
17      117  1.9291
18      207  4.3078
19      239  3.3520
20      147  2.0672
21      174  2.7447
22        2  3.2302
23      141  4.8880
24       81  2.9451
25      180  3.9564
26      151  2.4156
27      206  3.1855
28      238  4.9943
29       24  3.1584
30       42  2.4493
31       96  2.7677
32       84  2.2739
33      123  2.2739
34        8  2.6288
35       62  1.7001
36      201  2.1357
37      172  1.7421
38       68  3.1116
39      191  3.0421
40      215  2.2695
41       79  3.0245
42       16  2.3242
43       13  3.0356
44       32  2.4278
45      204  3.6830
46      202  2.1024
47      167  5.2858
48       40  2.6315


In [340]:
# 제출한 결과 확인

print(pd.read_csv('result.csv'))

    cust_id  result
0       154  3.2726
1         4  2.9059
2        30  1.6660
3        75  1.6438
4        33  3.1181
5       149  1.4469
6       164  2.9235
7        61  2.2072
8       136  1.8987
9       211  3.5402
10      102  4.4278
11      132  1.9858
12      205  2.8505
13      185  3.2585
14       99  2.1420
15       92  1.5273
16       91  3.7163
17      117  1.9291
18      207  4.3078
19      239  3.3520
20      147  2.0672
21      174  2.7447
22        2  3.2302
23      141  4.8880
24       81  2.9451
25      180  3.9564
26      151  2.4156
27      206  3.1855
28      238  4.9943
29       24  3.1584
30       42  2.4493
31       96  2.7677
32       84  2.2739
33      123  2.2739
34        8  2.6288
35       62  1.7001
36      201  2.1357
37      172  1.7421
38       68  3.1116
39      191  3.0421
40      215  2.2695
41       79  3.0245
42       16  2.3242
43       13  3.0356
44       32  2.4278
45      204  3.6830
46      202  2.1024
47      167  5.2858
48       40  2.6315


In [341]:
# 성능 평가

mse = mean_squared_error(y_test, y_result)
rsq = r2_score(y_test, y_result)

print("MSE : ", mse)
print("R2 Score : ", rsq)
print("RMSE : ", mse**2)

MSE :  0.972653893265306
R2 Score :  0.4256291839049009
RMSE :  0.9460555960841573
