In [28]:
import csv
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from tqdm import tqdm  # For progress monitoring

import warnings
warnings.filterwarnings('ignore')

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Set the display option to show the entire column value
pd.set_option('display.max_colwidth', None)

In [5]:
# Load the CSV file to ensure it loads correctly
train_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/AIFFEL_DATATHONE(2조)/train_df_processed.csv', encoding='utf-8', quoting=csv.QUOTE_NONNUMERIC)

In [6]:
train_df.isnull().sum()

train_id             0
name                 0
item_condition_id    0
category_name        0
brand_name           0
price                0
shipping             0
item_description     0
category_1           0
category_2           0
category_3           0
combined_text        0
dtype: int64

In [7]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1392726 entries, 0 to 1392725
Data columns (total 12 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   train_id           1392726 non-null  float64
 1   name               1392726 non-null  object 
 2   item_condition_id  1392726 non-null  float64
 3   category_name      1392726 non-null  object 
 4   brand_name         1392726 non-null  object 
 5   price              1392726 non-null  float64
 6   shipping           1392726 non-null  float64
 7   item_description   1392726 non-null  object 
 8   category_1         1392726 non-null  object 
 9   category_2         1392726 non-null  object 
 10  category_3         1392726 non-null  object 
 11  combined_text      1392726 non-null  object 
dtypes: float64(4), object(8)
memory usage: 127.5+ MB


In [8]:
train_df.describe().apply(lambda s: s.apply('{0:.5f}'.format))

Unnamed: 0,train_id,item_condition_id,price,shipping
count,1392726.0,1392726.0,1392726.0,1392726.0
mean,741315.39926,1.90187,27.04392,0.44993
std,427946.35441,0.90459,39.07712,0.49749
min,1.0,1.0,3.0,0.0
25%,370620.25,1.0,11.0,0.0
50%,741368.5,2.0,17.0,0.0
75%,1111926.75,3.0,30.0,1.0
max,1482534.0,5.0,2009.0,1.0


In [9]:
# Feature engineering for 'item_description'

# List of phrases
phrases = [
    'brand new', 'never opened', 'with tag', 'new in box', 'great condition',
    'certificate of authenticity', 'complete set', 'worn once', 'great condition',
    'no stains', 'like new'
]

# Create dummy columns for each phrase
for phrase in phrases:
    column_name = phrase.replace(' ', '_')  # Replace spaces with underscores for column names
    train_df[column_name] = train_df['combined_text'].str.contains(phrase).astype(int)

In [10]:
train_df['item_desc_len'] = train_df['item_description'].str.len()

In [11]:
train_df.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,category_1,category_2,...,never_opened,with_tag,new_in_box,great_condition,certificate_of_authenticity,complete_set,worn_once,no_stains,like_new,item_desc_len
0,1.0,razer blackwidow chroma keyboard,3.0,Electronics/Computers & Tablets/Components & Parts,razer,52.0,0.0,this keyboard is in great condition and works like it came out of the box all of the ports are tested and work perfectly the lights are customizable via the razer synapse app on your pc,electronics,computers tablets,...,0,0,0,1,0,0,0,0,0,185
1,2.0,avaviv blouse,1.0,Women/Tops & Blouses/Blouse,target,10.0,1.0,adorable top with a hint of lace and a key hole in the back the pale pink is a 1x and i also have a 3x available in white,women,tops blouses,...,0,0,0,0,0,0,0,0,0,121
2,3.0,leather horse statues,1.0,Home/Home Décor/Home Décor Accents,unknown,35.0,1.0,new with tags leather horses retail for rm each stand about a foot high they are being sold as a pair any questions please ask free shipping just got out of storage,home,home décor,...,0,1,0,0,0,0,0,0,0,164
3,4.0,24k gold plated rose,1.0,Women/Jewelry/Necklaces,unknown,44.0,0.0,complete with certificate of authenticity,women,jewelry,...,0,0,0,0,1,0,0,0,0,41
4,5.0,bundled items requested for ruie,3.0,Women/Other/Other,banana republic,59.0,0.0,banana republic bottoms candies skirt with matching blazeramy byers suit loft bottoms and cami top,women,other,...,0,0,0,0,0,0,0,0,0,98


In [12]:
train_df.drop(['train_id', 'name', 'category_name', 'item_description', 'combined_text'], axis=1, inplace=True)

In [13]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1392726 entries, 0 to 1392725
Data columns (total 18 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   item_condition_id            1392726 non-null  float64
 1   brand_name                   1392726 non-null  object 
 2   price                        1392726 non-null  float64
 3   shipping                     1392726 non-null  float64
 4   category_1                   1392726 non-null  object 
 5   category_2                   1392726 non-null  object 
 6   category_3                   1392726 non-null  object 
 7   brand_new                    1392726 non-null  int64  
 8   never_opened                 1392726 non-null  int64  
 9   with_tag                     1392726 non-null  int64  
 10  new_in_box                   1392726 non-null  int64  
 11  great_condition              1392726 non-null  int64  
 12  certificate_of_authenticity  1392726 non-n

In [14]:
# Label encoding for categorical columns

columns_to_encode = ['brand_name', 'category_1', 'category_2', 'category_3']

le = LabelEncoder()

for col in columns_to_encode:
    train_df[col] = le.fit_transform(train_df[col])

In [None]:
train_df.head()

Unnamed: 0,item_condition_id,brand_name,price,shipping,category_1,category_2,category_3,brand_new,never_opened,with_tag,new_in_box,great_condition,certificate_of_authenticity,complete_set,worn_once,no_stains,like_new
0,3.0,3531,52.0,0.0,1,30,211,0,0,0,0,1,0,0,0,0,0
1,1.0,4149,10.0,1.0,9,102,94,0,0,0,0,0,0,0,0,0,0
2,1.0,4429,35.0,1.0,3,55,403,0,0,1,0,0,0,0,0,0,0
3,1.0,4429,44.0,0.0,9,58,534,0,0,0,0,0,1,0,0,0,0
4,3.0,418,59.0,0.0,9,72,557,0,0,0,0,0,0,0,0,0,0


In [15]:
# X, y 지정
X = train_df.drop('price', axis=1)
y = train_df['price']

In [16]:
X.shape, y.shape

((1392726, 17), (1392726,))

In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)

In [18]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1114180, 17), (278546, 17), (1114180,), (278546,))

In [19]:
# 평가지표 정의하기

def rmsle(y, pred):
    log_y = np.log1p(y)
    log_pred = np.log1p(pred)
    squared_error = (log_y - log_pred) ** 2
    rmsle = np.sqrt(np.mean(squared_error))
    return rmsle

In [22]:
# 모델 학습
model_xgb = XGBRegressor()
model_xgb.fit(X_train, y_train)

In [24]:
# 예측
y_pred_train_xgb = model_xgb.predict(X_train)
y_pred_test_xgb = model_xgb.predict(X_test)

In [27]:
# RMSLE 계산
rmsle_train_xgb = rmsle(y_train, y_pred_train_xgb)
rmsle_test_xgb = rmsle(y_test, y_pred_test_xgb)

print(f"Train RMSLE (XGBoost): {rmsle_train_xgb:.4f}")
print(f"Test RMSLE (XGBoost): {rmsle_test_xgb:.4f}")

Train RMSLE (XGBoost): 0.6133
Test RMSLE (XGBoost): 0.6163


In [30]:
def get_top_error_data(y_Test, pred, n_tops = 5):
    result_df = pd.DataFrame(y_test.values, columns=['y_test'])
    result_df['predicted_value'] = np.round(pred)
    result_df['diff'] = np.abs(result_df['y_test'] - result_df['predicted_value'])

    print(result_df.sort_values('diff', ascending=False)[:n_tops])

get_top_error_data(y_test, pred, n_tops=30)

        y_test  predicted_value    diff
159806  1750.0             33.0  1717.0
149129  1815.0            159.0  1656.0
114940  1700.0            169.0  1531.0
24217   1500.0             19.0  1481.0
246996  1400.0             19.0  1381.0
66116   1354.0             46.0  1308.0
35275   1230.0             79.0  1151.0
60742   1600.0            451.0  1149.0
183592  1209.0            146.0  1063.0
52182   1209.0            177.0  1032.0
132870  1209.0            182.0  1027.0
166374  1150.0            157.0   993.0
108539  1206.0            221.0   985.0
158873  1000.0             28.0   972.0
62640   1299.0            334.0   965.0
52990   1004.0             39.0   965.0
244825   980.0             27.0   953.0
192067  1295.0            367.0   928.0
68872   1256.0            339.0   917.0
267863  1209.0            297.0   912.0
122809  1165.0            262.0   903.0
147715  1106.0            228.0   878.0
186361   906.0             28.0   878.0
20716   1050.0            183.0   867.0


In [31]:
# 하이퍼파라미터 튜닝
params = {'learning_rate' : [0.07, 0.05],
         'max_depth' : [3, 5, 7],
         'n_estimators' : [100, 200],
         'subsample' : [0.9, 0.8, 0.7]
         }

In [32]:
# X, y 재정의
X = train_df.drop('price', axis=1)
y = train_df['price']

In [34]:
# GridSearchCV
model_xgb_grid = XGBRegressor()
grid = GridSearchCV(model_xgb_grid, params, cv=3, n_jobs=-1)
grid.fit(X, y)

In [35]:
# 최적의 하이퍼파라미터를 찾음
grid.best_params_

{'learning_rate': 0.07, 'max_depth': 7, 'n_estimators': 200, 'subsample': 0.7}

In [37]:
# 하이퍼파라미터 튜닝 및 모델 학습
model_xgb_grid_tuned = XGBRegressor(
    learning_rate=0.07,
    max_depth=7,
    n_estimators=200,
    subsample=0.7
)

model_xgb_grid_tuned.fit(X_train, y_train)

In [38]:
# 예측
y_pred_train_xgb_tuned = model_xgb_grid_tuned.predict(X_train)
y_pred_test_xgb_tuned = model_xgb_grid_tuned.predict(X_test)

In [39]:
# RMSLE 계산
rmsle_train_xgb_tuned = rmsle(y_train, y_pred_train_xgb_tuned)
rmsle_test_xgb_tuned = rmsle(y_test, y_pred_test_xgb_tuned)

print(f"Train RMSLE (XGBoost tuned): {rmsle_train_xgb_tuned:.4f}")
print(f"Test RMSLE (XGBoost tuned): {rmsle_test_xgb_tuned:.4f}")

Train RMSLE (XGBoost tuned): 0.6125
Test RMSLE (XGBoost tuned): 0.6154
