In [2]:
from sklearn.utils import resample
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import time
import pandas as pd
import xgboost as xgb

In [5]:
# Load the laptop price dataset.
# NOTE(review): this is an absolute, machine-specific path — point
# LAPTOP_CSV_PATH at the local copy of laptop_price.csv before running elsewhere.
LAPTOP_CSV_PATH = '/Users/sangjilee/Documents/spring-2024/data-science-computing/week7_to_15/week13/laptop_price.csv'
data = pd.read_csv(LAPTOP_CSV_PATH, encoding='latin1')

# Build the binary Target column from Price_euros:
# 1 when 600 <= Price_euros <= 700 (both bounds inclusive), otherwise 0.
# Series.between is vectorised and replaces the row-by-row apply/lambda.
TARGET_PRICE_MIN, TARGET_PRICE_MAX = 600, 700
data['Target'] = data['Price_euros'].between(TARGET_PRICE_MIN, TARGET_PRICE_MAX).astype(int)

In [6]:
# 데이터의 클래스 불균형 해결을 위해 minority 클래스를 oversampling
# Target값이 0인 데이터(다수 클래스)와 1인 데이터(소수 클래스)를 분리
df_majority = data[data.Target == 0]
df_minority = data[data.Target == 1]


df_minority_upsampled = resample(df_minority,               # resample 함수를 이용하여 minority 클래스를 oversampling
                                 replace=True,              # 복원 추출
                                 n_samples=len(df_majority),# 다수 클래스의 샘플 수와 동일한 수로 샘플링하여 소수 클래스, 다수 클래스 데이터 수를 동일하게 맞춤    
                                 random_state=42)           # 결과를 재현가능하도록 하기 위한 random_state 설정

In [7]:
# 다수 클래스 데이터와 oversampling된 소수 클래스 데이터를 합쳐서 데이터 불균형 문제 해결 (균형 잡힌 데이터셋을 만듦)
data_balanced = pd.concat([df_majority, df_minority_upsampled])

# 데이터셋을 Feature(X), Target(y)로 나눔
X = data_balanced.drop(columns=['laptop_ID', 'Price_euros', 'Target'])
y = data_balanced['Target']

In [8]:
# 데이터프레임 X의 범주형(문자열) 데이터를 숫자형 데이터로 변환
for column in X.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X[column] = le.fit_transform(X[column])

In [9]:
# train, test 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameter search space.
# Each key is a hyperparameter name; each value is the list of candidates to try.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[4, 6, 8, 10],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
)

In [10]:
# Run GridSearchCV over param_grid to find the best hyperparameter combination.
# `use_label_encoder` is no longer passed: it is a deprecated XGBoost argument
# (NOTE(review): confirm against the installed xgboost version) and the target
# is already encoded as 0/1.
grid_search = GridSearchCV(estimator=xgb.XGBClassifier(eval_metric='logloss', random_state=42),
                           param_grid=param_grid,
                           cv=3,        # 3-fold cross-validation per candidate
                           n_jobs=-1,   # use all available cores
                           verbose=2)   # log one line per completed fit

# Time the search with a monotonic clock; time.time() follows the wall clock
# and can jump if the system time is adjusted during the run.
start_time = time.perf_counter()
grid_search.fit(X_train, y_train)
training_time = time.perf_counter() - start_time

Fitting 3 folds for each of 324 candidates, totalling 972 fits
[CV] END colsample_bytree=0.6, learning_rate=0.01, max_depth=4, n_estimators=100, subsample=0.6; total time=   0.1s
[CV] END colsample_bytree=0.6, learning_rate=0.01, max_depth=4, n_estimators=100, subsample=1.0; total time=   0.0s
[CV] END colsample_bytree=0.6, learning_rate=0.01, max_depth=4, n_estimators=100, subsample=0.8; total time=   0.0s
[CV] END colsample_bytree=0.6, learning_rate=0.01, max_depth=4, n_estimators=100, subsample=1.0; total time=   0.0s
[CV] END colsample_bytree=0.6, learning_rate=0.01, max_depth=4, n_estimators=100, subsample=0.6; total time=   0.0s
[CV] END colsample_bytree=0.6, learning_rate=0.01, max_depth=4, n_estimators=100, subsample=1.0; total time=   0.0s
[CV] END colsample_bytree=0.6, learning_rate=0.01, max_depth=4, n_estimators=100, subsample=0.6; total time=   0.0s
[CV] END colsample_bytree=0.6, learning_rate=0.01, max_depth=4, n_estimators=100, subsample=0.8; total time=   0.0s
[CV] END 

In [14]:
# Take the estimator refit with the best hyperparameters found by the search.
best_model = grid_search.best_estimator_

# Predict labels for the held-out test features.
y_pred = best_model.predict(X_test)

# Score the predictions against the true test labels.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Per-class precision / recall / f1 summary for the same predictions.
report = classification_report(y_true=y_test, y_pred=y_pred)

# Show the heading followed by the first rows of the balanced dataset.
print("Processed Laptop Data:", data_balanced.head(), sep="\n")

Processed Laptop Data:
   laptop_ID Company      Product   TypeName  Inches  \
0          1   Apple  MacBook Pro  Ultrabook    13.3   
1          2   Apple  Macbook Air  Ultrabook    13.3   
2          3      HP       250 G6   Notebook    15.6   
3          4   Apple  MacBook Pro  Ultrabook    15.4   
4          5   Apple  MacBook Pro  Ultrabook    13.3   

                     ScreenResolution                         Cpu   Ram  \
0  IPS Panel Retina Display 2560x1600        Intel Core i5 2.3GHz   8GB   
1                            1440x900        Intel Core i5 1.8GHz   8GB   
2                   Full HD 1920x1080  Intel Core i5 7200U 2.5GHz   8GB   
3  IPS Panel Retina Display 2880x1800        Intel Core i7 2.7GHz  16GB   
4  IPS Panel Retina Display 2560x1600        Intel Core i5 3.1GHz   8GB   

                Memory                           Gpu  OpSys  Weight  \
0            128GB SSD  Intel Iris Plus Graphics 640  macOS  1.37kg   
1  128GB Flash Storage        Intel HD Graphics

In [15]:
# Report the search duration, test accuracy and the classification report.
summary_lines = [
    f"Training time: {training_time:.2f} seconds",
    f"Accuracy: {accuracy:.2%}",
    "Classification Report:",
    report,
]
print("\n".join(summary_lines))

# Report the winning hyperparameter combination.
print("Best Hyperparameters:", grid_search.best_params_, sep="\n")

Training time: 10.29 seconds
Accuracy: 96.70%
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.94      0.97       254
           1       0.94      1.00      0.97       231

    accuracy                           0.97       485
   macro avg       0.97      0.97      0.97       485
weighted avg       0.97      0.97      0.97       485

Best Hyperparameters:
{'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 10, 'n_estimators': 300, 'subsample': 1.0}
