In [2]:
import csv
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error, mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm  # For progress monitoring

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Mount Google Drive at /content/drive; the CSV and pickle paths used in this
# notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Disable column-width truncation so long text columns are displayed in full
pd.set_option('display.max_colwidth', None)

In [5]:
# Load the preprocessed training CSV from Google Drive.
# NOTE(review): quoting=csv.QUOTE_NONNUMERIC is presumably why train_id,
# item_condition_id and shipping come back as float64 -- confirm.
# Earlier location of the same file, kept for reference:
# train_df = pd.read_csv('/content/drive/MyDrive/AIFFEL_DATATHONE(2조)/train_df_processed.csv', encoding='utf-8', quoting=csv.QUOTE_NONNUMERIC)
train_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/AIFFEL_DATATHONE(2조)/train_df_processed.csv', encoding='utf-8', quoting=csv.QUOTE_NONNUMERIC)

In [6]:
# Number of missing values per column
train_df.isnull().sum()

train_id             0
name                 0
item_condition_id    0
category_name        0
brand_name           0
price                0
shipping             0
item_description     0
category_1           0
category_2           0
category_3           0
combined_text        0
dtype: int64

In [7]:
# Column dtypes, non-null counts and memory usage
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1392726 entries, 0 to 1392725
Data columns (total 12 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   train_id           1392726 non-null  float64
 1   name               1392726 non-null  object 
 2   item_condition_id  1392726 non-null  float64
 3   category_name      1392726 non-null  object 
 4   brand_name         1392726 non-null  object 
 5   price              1392726 non-null  float64
 6   shipping           1392726 non-null  float64
 7   item_description   1392726 non-null  object 
 8   category_1         1392726 non-null  object 
 9   category_2         1392726 non-null  object 
 10  category_3         1392726 non-null  object 
 11  combined_text      1392726 non-null  object 
dtypes: float64(4), object(8)
memory usage: 127.5+ MB


In [8]:
# train_df.describe().apply(lambda s: s.apply('{0:.5f}'.format))

In [None]:
# Feature engineering for 'item_description'
#
# Every phrase below becomes a 0/1 column (spaces replaced by underscores)
# that is 1 when the phrase occurs anywhere in the row's 'combined_text'.

# List of phrases. Keep entries unique: a repeated phrase would only
# recompute the same column a second time.
phrases = [
    'brand new', 'never opened', 'with tag', 'new in box', 'great condition',
    'certificate of authenticity', 'complete set', 'worn once',
    'no stains', 'like new'
]

# Create dummy columns for each phrase
for phrase in phrases:
    column_name = phrase.replace(' ', '_')  # Replace spaces with underscores for column names
    # regex=False: the phrases are literal text, not regular expressions
    train_df[column_name] = train_df['combined_text'].str.contains(phrase, regex=False).astype(int)

In [9]:
# Preview the first rows of the loaded frame
train_df.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,category_1,category_2,category_3,combined_text
0,1.0,razer blackwidow chroma keyboard,3.0,Electronics/Computers & Tablets/Components & Parts,razer,52.0,0.0,this keyboard is in great condition and works like it came out of the box all of the ports are tested and work perfectly the lights are customizable via the razer synapse app on your pc,electronics,computers tablets,components parts,razer blackwidow chroma keyboard this keyboard is in great condition and works like it came out of the box all of the ports are tested and work perfectly the lights are customizable via the razer synapse app on your pc
1,2.0,avaviv blouse,1.0,Women/Tops & Blouses/Blouse,target,10.0,1.0,adorable top with a hint of lace and a key hole in the back the pale pink is a 1x and i also have a 3x available in white,women,tops blouses,blouse,avaviv blouse adorable top with a hint of lace and a key hole in the back the pale pink is a 1x and i also have a 3x available in white
2,3.0,leather horse statues,1.0,Home/Home Décor/Home Décor Accents,unknown,35.0,1.0,new with tags leather horses retail for rm each stand about a foot high they are being sold as a pair any questions please ask free shipping just got out of storage,home,home décor,home décor accents,leather horse statues new with tags leather horses retail for rm each stand about a foot high they are being sold as a pair any questions please ask free shipping just got out of storage
3,4.0,24k gold plated rose,1.0,Women/Jewelry/Necklaces,unknown,44.0,0.0,complete with certificate of authenticity,women,jewelry,necklaces,24k gold plated rose complete with certificate of authenticity
4,5.0,bundled items requested for ruie,3.0,Women/Other/Other,banana republic,59.0,0.0,banana republic bottoms candies skirt with matching blazeramy byers suit loft bottoms and cami top,women,other,other,bundled items requested for ruie banana republic bottoms candies skirt with matching blazeramy byers suit loft bottoms and cami top


In [None]:
# Remove the identifier and free-text columns so they do not end up in the
# feature matrix built from train_df later on.
# NOTE(review): this mutates train_df in place; running the cell a second time
# fails because the columns are already gone.
train_df.drop(['train_id', 'name', 'category_name', 'item_description', 'combined_text'], axis=1, inplace=True)

In [None]:
# Label encoding for categorical columns
#
# One LabelEncoder is fitted per column and kept in `label_encoders`, so the
# string -> integer mapping of every column remains available afterwards
# (to encode new inputs, or to map codes back with inverse_transform).
# Re-fitting a single shared encoder would keep only the last column's mapping.

columns_to_encode = ['brand_name', 'category_1', 'category_2', 'category_3']

label_encoders = {}

for col in columns_to_encode:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])
    label_encoders[col] = le

In [None]:
# Preview the frame after the column drop and label encoding
train_df.head()

In [None]:
# Define the target y (price) and the feature matrix X (every other column)
y = train_df['price']
X = train_df.drop(columns=['price'])

In [None]:
# Sanity check: shapes of the feature matrix and the target
X.shape, y.shape

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test split; random_state=0 fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)

In [None]:
# Sanity check: shapes of the four split pieces
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Define the evaluation metric

def rmsle(y, pred):
    """Return the root mean squared logarithmic error between `y` and `pred`.

    Both inputs go through ``np.log1p``; the element-wise difference is then
    squared, averaged and square-rooted.
    """
    log_gap = np.log1p(y) - np.log1p(pred)
    return np.sqrt(np.mean(log_gap ** 2))

In [None]:
# Train the model: random forest with 100 trees and a fixed seed
model_rf = RandomForestRegressor(n_estimators=100, random_state=0)
model_rf.fit(X_train, y_train)

In [None]:
# Predict on both the training split and the held-out test split
y_pred_train_rf = model_rf.predict(X_train)
y_pred_test_rf = model_rf.predict(X_test)

In [None]:
# # RMSLE 계산 1
# rmsle_train_rf = rmsle(y_train, y_pred_train_rf)
# rmsle_test_rf = rmsle(y_test, y_pred_test_rf)

# print(f"Train RMSLE (Random Forest): {rmsle_train_rf:.4f}")
# print(f"Test RMSLE (Random Forest): {rmsle_test_rf:.4f}")

In [None]:
# Metric redefined; RMSLE calculation, run 2
rmsle_train_rf = rmsle(y_train, y_pred_train_rf)
rmsle_test_rf = rmsle(y_test, y_pred_test_rf)

print(f"Train RMSLE (Random Forest): {rmsle_train_rf:.4f}")
print(f"Test RMSLE (Random Forest): {rmsle_test_rf:.4f}")

In [None]:
# Build a DataFrame of the prediction results / result 2
# Columns: actual test price, predicted price, and their signed difference.
result_df_rf = pd.DataFrame({
    'y_test': y_test,
    'predicted_value': y_pred_test_rf,
    'diff': y_test - y_pred_test_rf
})

print(result_df_rf.head(10))

# model save

In [None]:
# Save the trained model
import pickle

# Save location on Google Drive
# NOTE(review): this path has no 'Colab Notebooks/' component, unlike the path
# the training CSV is read from -- confirm the target folder exists.
model_path = '/content/drive/MyDrive/AIFFEL_DATATHONE(2조)/random_forest_model.pkl'

with open(model_path, 'wb') as f:
    pickle.dump(model_rf, f)

# 추천 시스템

In [None]:
import pickle
# Model loading
def load_model(model_path='/content/drive/MyDrive/AIFFEL_DATATHONE(2조)/random_forest_model.pkl'):
    """Load a pickled model from disk.

    Args:
        model_path: Location of the pickle file. Defaults to the Google Drive
            path this function previously had hard-coded.

    Returns:
        The unpickled object.

    Raises:
        FileNotFoundError: if `model_path` does not exist (raised by open()).
    """
    # SECURITY: pickle.load can execute arbitrary code while deserialising;
    # only point this at files you created or otherwise trust.
    with open(model_path, 'rb') as f:
        return pickle.load(f)

# Load the saved model
model = load_model()

## for Seller

In [None]:
# Product listed by a seller (name, condition, category, brand, description)
class Product:
    """A listing whose selling price is to be estimated.

    All constructor arguments are stored unchanged as attributes of the same name.
    """

    def __init__(self, name, item_condition_id, category_name, brand_name, item_description):
        self.name = name
        self.item_condition_id = item_condition_id
        self.category_name = category_name
        self.brand_name = brand_name
        self.item_description = item_description

    def get_price_estimate(self, model):
        """Estimate this product's price with a preprocessor + regressor pipeline.

        The product fields are packed into a one-row DataFrame with the columns
        name, item_condition_id, category_name, brand_name and item_description;
        a falsy brand_name is replaced by 'unknown'.

        Args:
            model: Object exposing ``named_steps`` with a 'preprocessor' entry
                (providing ``transform``) and a 'regressor' entry (providing
                ``predict``), e.g. a scikit-learn Pipeline.

        Returns:
            The first element of the regressor's prediction.

        Raises:
            AttributeError: if `model` has no ``named_steps``. A bare estimator
                fitted on already-encoded features cannot consume the raw
                product fields, so this fails early with an explanation.
            KeyError: if either named step is missing.
        """
        steps = getattr(model, 'named_steps', None)
        if steps is None:
            raise AttributeError(
                f"{type(model).__name__} has no 'named_steps'; get_price_estimate needs a "
                "pipeline with 'preprocessor' and 'regressor' steps, because a bare "
                "estimator trained on encoded features cannot take raw product fields."
            )

        product_info = {
            'name': self.name,
            'item_condition_id': self.item_condition_id,
            'category_name': self.category_name,
            'brand_name': self.brand_name if self.brand_name else 'unknown',
            'item_description': self.item_description
        }
        input_data = pd.DataFrame([product_info])

        # Reuse the pipeline's own preprocessor on the raw input row
        input_data_preprocessed = steps['preprocessor'].transform(input_data)

        # Predict the price with the pipeline's regressor
        price_estimate = steps['regressor'].predict(input_data_preprocessed)

        # Single input row -> single prediction
        return price_estimate[0]

# Provide the category options
def get_category_options(data, category_level):
    """Return the sorted distinct values of one category column.

    Args:
        data: DataFrame holding 'category_<level>' columns.
        category_level: Either the level itself (1, '2', ...) or the full
            column name ('category_1'); both resolve to the same column.

    Returns:
        A sorted list of the column's unique non-null values.

    Raises:
        KeyError: if the resolved column is not present in `data`.
    """
    column = str(category_level)
    if not column.startswith('category_'):
        column = f'category_{column}'
    # Drop missing values first: sorted() cannot order NaN against strings.
    return sorted(data[column].dropna().unique())

In [None]:
# # 데이터셋 train + test?
# train_df = pd.read_csv('/content/drive/MyDrive/AIFFEL_DATATHONE(2조)/train_df_processed.csv')
# test_df =

In [None]:
# Reload the data in order to try tidying up the columns
# NOTE(review): unlike the first load of this file, this read omits
# 'Colab Notebooks/' from the path and does not pass
# quoting=csv.QUOTE_NONNUMERIC -- confirm both differences are intended.
import pandas as pd
train_df_test = pd.read_csv('/content/drive/MyDrive/AIFFEL_DATATHONE(2조)/train_df_processed.csv')

# Category data preprocessing
# train_df_test['category_name'] = train_df_test['category_1'] + "/" + train_df_test['category_2'] + "/" + train_df_test['category_3']

In [None]:
# Column dtypes and non-null counts of the reloaded frame
train_df_test.info()

In [None]:
# main function definition: version 1
def main():
    """Interactive seller flow: collect product details and print a price estimate.

    Shows a two-item menu in a loop. Option 1 walks through the three category
    levels (each list narrowed by the previous choices), reads the remaining
    product fields, builds a Product and prints its estimated price. Option 2
    leaves the loop. Anything else re-prompts.
    """
    data = train_df_test
    model = load_model()

    while True:
        print("\n1. 제품 등록하기")
        print("2. 종료")
        choice = input("원하는 서비스를 선택하세요: ")

        if choice == '1':
            # Levels are passed as 1/2/3, matching the f'category_{level}'
            # column lookup performed by get_category_options.
            category_1_options = get_category_options(data, 1)
            print("Category 1 옵션: ", category_1_options)
            category_1 = input("Category 1을 선택하세요: ")

            category_2_options = get_category_options(data[data['category_1'] == category_1], 2)
            print("Category 2 옵션: ", category_2_options)
            category_2 = input("Category 2를 선택하세요: ")

            category_3_options = get_category_options(data[(data['category_1'] == category_1) & (data['category_2'] == category_2)], 3)
            print("Category 3 옵션: ", category_3_options)
            category_3 = input("Category 3를 선택하세요: ")

            name = input("제품 이름을 입력하세요: ")
            try:
                item_condition_id = int(input("제품 상태 (1-5)를 입력하세요: "))
            except ValueError:
                # Non-numeric condition: report it and return to the menu
                # instead of aborting the whole session.
                print("잘못된 입력입니다. 다시 시도해주세요.")
                continue
            brand_name = input("브랜드 이름을 입력하세요 (모르는 경우 빈 칸으로 두세요): ")
            item_description = input("제품 설명을 입력하세요: ")

            category_name = f"{category_1}/{category_2}/{category_3}"
            product = Product(name, item_condition_id, category_name, brand_name, item_description)
            price_estimate = product.get_price_estimate(model)
            print(f"추천 판매 가격: {price_estimate:.2f}원")

        elif choice == '2':
            print("추천 서비스를 종료합니다.")
            break
        else:
            print("잘못된 입력입니다. 다시 시도해주세요.")

if __name__ == "__main__":
    main()

In [None]:
# # main 함수 정의 : 코드 2
# def main():
#     data = train_df_test
#     model = load_model()

#     while True:
#         print("\n1. 제품 등록하기")
#         print("2. 종료")
#         choice = input("원하는 서비스를 선택하세요: ")

#         if choice == '1':
#             category_1_options = get_category_options(data, 'category_1')
#             print("Category 1 옵션: ", category_1_options)
#             category_1 = input("Category 1을 선택하세요: ")

#             category_2_options = get_category_options(data[data['category_1'] == category_1], 'category_2')
#             print("Category 2 옵션: ", category_2_options)
#             category_2 = input("Category 2를 선택하세요: ")

#             category_3_options = get_category_options(data[(data['category_1'] == category_1) & (data['category_2'] == category_2)], 'category_3')
#             print("Category 3 옵션: ", category_3_options)
#             category_3 = input("Category 3를 선택하세요: ")

#             name = input("제품 이름을 입력하세요: ")
#             item_condition_id = int(input("제품 상태 (1-5)를 입력하세요: "))
#             brand_name = input("브랜드 이름을 입력하세요 (모르는 경우 빈 칸으로 두세요): ")
#             item_description = input("제품 설명을 입력하세요: ")

#             category_name = f"{category_1}/{category_2}/{category_3}"
#             product = Product(name, item_condition_id, category_name, brand_name, item_description)
#             price_estimate = product.get_price_estimate(model)
#             print(f"추천 판매 가격: {price_estimate:.2f}원")

#         elif choice == '2':
#             print("추천 서비스를 종료합니다.")
#             break
#         else:
#             print("잘못된 입력입니다. 다시 시도해주세요.")

# if __name__ == "__main__":
#     main()

## for Buyer

In [None]:
def load_data():
    """Read the buyer-side dataset into a DataFrame.

    NOTE(review): the path below reads as a placeholder description ("the
    dataset used for model training/evaluation, to be used by the
    recommender") rather than a real file name -- replace it before running.
    """
    dataset_path = '모델학습/평가에 사용했던, 추천시스템에 사용할 데이터셋'
    return pd.read_csv(dataset_path)

# Options available at a given category level
def get_category_options(data, category_level):
    """Return the sorted distinct values of one category column.

    Args:
        data: DataFrame holding 'category_<level>' columns.
        category_level: Either the level itself (1, '2', ...) or the full
            column name ('category_1'); both resolve to the same column.

    Returns:
        A sorted list of the column's unique non-null values.

    Raises:
        KeyError: if the resolved column is not present in `data`.
    """
    column = str(category_level)
    if not column.startswith('category_'):
        column = f'category_{column}'
    # Drop missing values first: sorted() cannot order NaN against strings.
    return sorted(data[column].dropna().unique())

# Product recommendation
def recommend_products(data, category_1, category_2, category_3, item_condition_id, desired_price):
    """Return up to three listings whose price is closest to `desired_price`.

    Args:
        data: DataFrame with 'category_1', 'category_2', 'category_3',
            'item_condition_id' and 'price' columns.
        category_1, category_2, category_3: Category values a row must equal.
        item_condition_id: Condition value a row must equal.
        desired_price: Target price the candidates are ranked against.

    Returns:
        A new DataFrame holding the (at most three) matching rows with the
        smallest absolute price gap, ordered by that gap, plus a
        'price_difference' column. `data` itself is left untouched.
    """
    matches = (
        (data['category_1'] == category_1) &
        (data['category_2'] == category_2) &
        (data['category_3'] == category_3) &
        (data['item_condition_id'] == item_condition_id)
    )

    # assign() builds a new frame, so the extra column is never written onto
    # a filtered slice of `data` (the chained-assignment pitfall).
    candidates = data.loc[matches].assign(
        price_difference=lambda frame: (frame['price'] - desired_price).abs()
    )

    # Smallest price gap first; keep the three best candidates
    return candidates.sort_values(by='price_difference').head(3)

In [None]:
# main function definition
def main():
    """Interactive buyer flow: pick a category and price, then list similar products.

    Shows a two-item menu in a loop. Option 1 walks through the three category
    levels (each list narrowed by the previous choices), reads a condition and
    a desired price, and prints the products returned by recommend_products.
    Option 2 leaves the loop. Anything else re-prompts.
    """
    data = load_data()

    while True:
        print("\n1. 제품 추천 받기")
        print("2. 종료")
        choice = input("원하는 서비스를 선택하세요: ")

        if choice == '1':
            category_1_options = get_category_options(data, 1)
            print("Category 1 옵션: ", category_1_options)
            category_1 = input("Category 1을 선택하세요: ")

            category_2_options = get_category_options(data[data['category_1'] == category_1], 2)
            print("Category 2 옵션: ", category_2_options)
            category_2 = input("Category 2를 선택하세요: ")

            category_3_options = get_category_options(data[(data['category_1'] == category_1) & (data['category_2'] == category_2)], 3)
            print("Category 3 옵션: ", category_3_options)
            category_3 = input("Category 3를 선택하세요: ")

            try:
                item_condition_id = int(input("제품 상태 (1-5)를 선택하세요: "))
                desired_price = float(input("원하는 가격을 입력하세요: "))   # TODO: let the user pick a price range instead?
            except ValueError:
                # Non-numeric input: report it and return to the menu instead
                # of aborting the whole session.
                print("잘못된 입력입니다. 다시 시도해주세요.")
                continue

            recommended_products = recommend_products(data, category_1, category_2, category_3, item_condition_id, desired_price)
            print("\n추천된 제품:")
            for _, product in recommended_products.iterrows():
                # The product title column is 'name'; the processed training
                # data has no 'item_name' column.
                print(f"제품 이름: {product['name']}, 가격: {product['price']}, 설명: {product['item_description']}")

        elif choice == '2':
            print("프로그램을 종료합니다.")
            break
        else:
            print("잘못된 입력입니다. 다시 시도해주세요.")

if __name__ == "__main__":
    main()

# 옵션을 모두 선택해야 하는 것인지.

# XGBoost (xg) experiments

In [None]:
# 결과 1
# print(result_df.head(10))

In [None]:
# model = XGBRegressor()
# model.fit(X_train, y_train)
# pred = model.predict(X_test)
# rmsle(y_test, pred)

In [None]:
# def get_top_error_data(y_Test, pred, n_tops = 5):
#     result_df = pd.DataFrame(y_test.values, columns=['y_test'])
#     result_df['predicted_value'] = np.round(pred)
#     result_df['diff'] = np.abs(result_df['y_test'] - result_df['predicted_value'])

#     print(result_df.sort_values('diff', ascending=False)[:n_tops])

# get_top_error_data(y_test, pred, n_tops=10)