In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [2]:
# Load the data
data = pd.read_csv('carnegie_mellon_output_summary_final.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 647614 entries, 0 to 647613
Data columns (total 26 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   style_name            647614 non-null  object 
 1   state                 647614 non-null  object 
 2   city_district         647614 non-null  object 
 3   city                  647614 non-null  object 
 4   size                  647614 non-null  int64  
 5   color                 647614 non-null  int64  
 6   temple_length         647614 non-null  int64  
 7   color_description     647614 non-null  object 
 8   color_family          647614 non-null  object 
 9   hex_color             647614 non-null  object 
 10  lens size (a)         647614 non-null  float64
 11  lens height (b)       647614 non-null  float64
 12  Lens diameter (ed)    647614 non-null  float64
 13  bridge size (dbl)     647614 non-null  float64
 14  circumference         647614 non-null  float64
 15  

In [4]:
data.columns

Index(['style_name', 'state', 'city_district', 'city', 'size', 'color',
       'temple_length', 'color_description', 'color_family', 'hex_color',
       'lens size (a)', 'lens height (b)', 'Lens diameter (ed)',
       'bridge size (dbl)', 'circumference', 'division_name',
       'frame_construction', 'frame_shape_code', 'frame_shape', 'gender_code',
       'gender', 'gross_weight', 'brand_name', 'front_material_name',
       'temple_material_name', 'order_qty'],
      dtype='object')

In [5]:
data.head()

Unnamed: 0,style_name,state,city_district,city,size,color,temple_length,color_description,color_family,hex_color,...,frame_construction,frame_shape_code,frame_shape,gender_code,gender,gross_weight,brand_name,front_material_name,temple_material_name,order_qty
0,style555,NC,GUILFORD,GREENSBORO,5116,1,135,ONYX MATTE W/STAR PHOSPHO T,BLACK,#000000,...,FULL RIM,M,MODIFIED RECTANGLE,C,CHILD,141.75,brand10,BIO INJ-G820,BIO INJ-G850,2610
1,style555,NC,GUILFORD,GREENSBORO,5116,4,135,BLACK MATTE W/STAR PHOSPHO,BLACK,#000000,...,FULL RIM,M,MODIFIED RECTANGLE,C,CHILD,141.75,brand10,BIO INJ-G820,BIO INJ-G850,2428
2,style100,GA,GWINNETT,LAWRENCEVILLE,5316,259,140,HONEY TORTOISE,BROWN,#BAA38A,...,FULL RIM,M,MODIFIED RECTANGLE,F,FEMALE,226.8,brand2,ACETATE,ACETATE,2300
3,style928,GA,GWINNETT,LAWRENCEVILLE,5517,220,145,TORTOISE/GREEN,BROWN,#704628,...,FULL RIM,S,SQUARE,U,UNISEX,141.75,brand15,BIO INJECTED,BIO INJECTED,2146
4,style301,GA,GWINNETT,LAWRENCEVILLE,5516,237,135,DARK TORTOISE/RED,BROWN,#B68963,...,FULL RIM,C,CAT EYE,F,FEMALE,226.8,brand5,ACETATE,ACETATE,2101


In [6]:
unused_columns = ['hex_color', 'color_description', 'city_district', 'gender_code', 'frame_shape_code', 'size', 'color'] # dropped color as well for simplicity (color_family is used)
data_model = data.drop(columns=unused_columns)

In [7]:
# Encoding for categorical variables
categorical_cols = ['style_name', 'state', 'city', 'color', 'color_family', 'division_name',
                    'frame_construction', 'frame_shape', 'gender', 'brand_name', 'front_material_name',
                    'temple_material_name']
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    data_model[col] = le.fit_transform(data[col])
    label_encoders[col] = le


In [8]:
# 데이터 분할
X = data_model.drop(columns=['order_qty'])
y = data_model['order_qty']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pandas as pd

# 숫자형 데이터만 선택
numeric_features = X.select_dtypes(include=['number'])

# VIF 계산
vif_data = pd.DataFrame()
vif_data["Feature"] = numeric_features.columns
vif_data["VIF"] = [variance_inflation_factor(numeric_features.values, i) for i in range(len(numeric_features.columns))]

# 결과 출력
print(vif_data)

                 Feature          VIF
0             style_name     7.049072
1                  state     3.954910
2                   city     4.008665
3          temple_length  1116.779594
4           color_family     2.259375
5          lens size (a)   736.918443
6        lens height (b)    16.643462
7     Lens diameter (ed)    33.882383
8      bridge size (dbl)   175.151190
9          circumference    33.683259
10         division_name     4.731710
11    frame_construction    72.302661
12           frame_shape     5.230322
13                gender     8.553733
14          gross_weight     7.142243
15            brand_name     4.815309
16   front_material_name     3.429725
17  temple_material_name     3.483172
18                 color     2.691403


Eliminating high VIF : 'temple_length', 'lens size (a)', 'Lens diameter (ed)', 'frame_construction', 'bridge size (dbl)', 'lens height (b)'

In [10]:
unused_columns = ['temple_length', 'lens size (a)', 'Lens diameter (ed)', 'frame_construction', 'bridge size (dbl)', 'lens height (b)', 'hex_color', 'color_description', 'city_district', 'gender_code', 'frame_shape_code', 'size']
data_model = data.drop(columns=unused_columns)

In [11]:
# Encoding for categorical variables
categorical_cols = ['style_name', 'state', 'city', 'color', 'color_family', 'division_name',
                    'frame_shape', 'gender', 'brand_name', 'front_material_name',
                    'temple_material_name']
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    data_model[col] = le.fit_transform(data[col])
    label_encoders[col] = le


In [12]:
# 데이터 분할
X = data_model.drop(columns=['order_qty'])
y = data_model['order_qty']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pandas as pd

# 숫자형 데이터만 선택
numeric_features = X.select_dtypes(include=['number'])

# VIF 계산
vif_data = pd.DataFrame()
vif_data["Feature"] = numeric_features.columns
vif_data["VIF"] = [variance_inflation_factor(numeric_features.values, i) for i in range(len(numeric_features.columns))]

# 결과 출력
print(vif_data)

                 Feature       VIF
0             style_name  5.463703
1                  state  3.603240
2                   city  3.618840
3                  color  2.484440
4           color_family  2.228118
5          circumference  2.705253
6          division_name  4.594369
7            frame_shape  4.618901
8                 gender  6.911124
9           gross_weight  5.895913
10            brand_name  4.273564
11   front_material_name  3.333340
12  temple_material_name  3.400072


In [14]:
# Encoding for categorical variables
categorical_cols = ['style_name', 'state', 'city', 'color', 'color_family', 'division_name',
                    'frame_shape', 'gender', 'brand_name', 'front_material_name',
                    'temple_material_name']
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    data_model[col] = le.fit_transform(data[col])
    label_encoders[col] = le


In [15]:
data_model

Unnamed: 0,style_name,state,city,color,color_family,circumference,division_name,frame_shape,gender,gross_weight,brand_name,front_material_name,temple_material_name,order_qty
0,782,30,1387,1,0,0.00,1,3,1,141.75,1,5,5,2610
1,782,30,1387,4,0,0.00,1,3,1,141.75,1,5,5,2428
2,2,12,1871,130,2,163.30,1,3,2,226.80,11,0,0,2300
3,1196,12,1871,98,2,163.70,2,9,5,141.75,6,8,7,2146
4,501,12,1871,114,2,0.00,1,2,2,226.80,15,0,0,2101
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
647609,1210,37,2156,11,0,330.00,2,8,0,141.75,6,8,7,0
647610,1210,37,2156,53,7,330.00,2,8,0,141.75,6,8,7,0
647611,1256,49,3122,78,2,144.92,1,2,2,226.80,7,15,16,0
647612,297,40,630,78,2,0.00,0,5,2,56.70,3,15,16,0


In [16]:
from sklearn.preprocessing import StandardScaler

numerical_cols = ['circumference', 'gross_weight', 'order_qty']  # 수치형 변수 리스트
scaler = StandardScaler()
data_model[numerical_cols] = scaler.fit_transform(data_model[numerical_cols])

In [17]:
correlations = data_model.corr()
print(correlations['order_qty'])  # 타겟 변수와의 상관 관계

style_name              0.001295
state                   0.003295
city                    0.004925
color                   0.000311
color_family           -0.000848
circumference           0.001467
division_name           0.007874
frame_shape             0.002557
gender                 -0.001920
gross_weight           -0.000928
brand_name             -0.001131
front_material_name    -0.004187
temple_material_name   -0.004256
order_qty               1.000000
Name: order_qty, dtype: float64


In [18]:
data_model.head()

Unnamed: 0,style_name,state,city,color,color_family,circumference,division_name,frame_shape,gender,gross_weight,brand_name,front_material_name,temple_material_name,order_qty
0,782,30,1387,1,0,-1.189566,1,3,1,-0.152829,1,5,5,180.341224
1,782,30,1387,4,0,-1.189566,1,3,1,-0.152829,1,5,5,167.758209
2,2,12,1871,130,2,0.91288,1,3,2,0.984843,11,0,0,158.908617
3,1196,12,1871,98,2,0.91803,2,9,5,-0.152829,6,8,7,148.26145
4,501,12,1871,114,2,-1.189566,1,2,2,0.984843,15,0,0,145.150265


### RandomForest Model

In [19]:
from sklearn.metrics import mean_absolute_error, r2_score

# 데이터 분할
X = data_model.drop(columns=['order_qty'])
y = data_model['order_qty']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### 🚀 RandomForest 모델 학습 ###
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# 예측 및 평가
rf_y_pred = rf_model.predict(X_test)
rf_mae = mean_absolute_error(y_test, rf_y_pred)
rf_r2 = r2_score(y_test, rf_y_pred)

print(f'🎯 RandomForest Model')
print(f'MAE: {rf_mae:.4f}')
print(f'R² Score (Accuracy): {rf_r2:.4f}\n')

🎯 RandomForest Model
MAE: 0.0554
R² Score (Accuracy): 0.3028



In [20]:
# 높은 주문 수량을 기대할 수 있는 프레임 조합 추천
feature_importances = pd.Series(rf_model.feature_importances_, index=X.columns)
important_features = feature_importances.sort_values(ascending=False)
print("🔥 RandomForest Feature Importance:")
print(important_features.head(10))


🔥 RandomForest Feature Importance:
city                    0.402570
style_name              0.151873
color                   0.148713
color_family            0.082760
state                   0.071195
circumference           0.037287
frame_shape             0.025639
gender                  0.024455
temple_material_name    0.021392
front_material_name     0.011525
dtype: float64


### XGBoost

In [21]:
import xgboost as xgb
xgb_model = xgb.XGBRegressor(
    n_estimators=500, 
    learning_rate=0.05, 
    max_depth=6, 
    subsample=0.8, 
    colsample_bytree=0.8, 
    early_stopping_rounds=50,  # ✅ 객체 생성 시 설정
    eval_metric="mae",  # ✅ 객체 생성 시 설정
    random_state=42
)

# 모델 학습
xgb_model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=10)

# 예측 및 평가
xgb_y_pred = xgb_model.predict(X_test)
xgb_mae = mean_absolute_error(y_test, xgb_y_pred)
xgb_r2 = r2_score(y_test, xgb_y_pred)

print(f'🎯 XGBoost Model')
print(f'MAE: {xgb_mae:.4f}')
print(f'R² Score (Accuracy): {xgb_r2:.4f}')

[0]	validation_0-mae:0.06573
[10]	validation_0-mae:0.06471
[20]	validation_0-mae:0.06418
[30]	validation_0-mae:0.06408
[40]	validation_0-mae:0.06359
[50]	validation_0-mae:0.06351
[60]	validation_0-mae:0.06340
[70]	validation_0-mae:0.06342
[80]	validation_0-mae:0.06329
[90]	validation_0-mae:0.06322
[100]	validation_0-mae:0.06326
[110]	validation_0-mae:0.06331
[120]	validation_0-mae:0.06336
[130]	validation_0-mae:0.06363
[140]	validation_0-mae:0.06373
[144]	validation_0-mae:0.06382
🎯 XGBoost Model
MAE: 0.0632
R² Score (Accuracy): 0.1061


In [22]:
# Feature Importance (XGBoost)
xgb_feature_importances = pd.Series(xgb_model.feature_importances_, index=X.columns)
xgb_important_features = xgb_feature_importances.sort_values(ascending=False)
print("🔥 XGBoost Feature Importance:")
print(xgb_important_features.head(10))

🔥 XGBoost Feature Importance:
temple_material_name    0.138534
city                    0.132818
gross_weight            0.108030
color                   0.094982
state                   0.094061
color_family            0.092745
gender                  0.084974
style_name              0.061443
brand_name              0.061254
circumference           0.048700
dtype: float32
