In [2]:
!pip install pandas numpy scikit-learn lightgbm tqdm




In [3]:
from google.colab import drive
drive.mount('/content/drive')


DATA_PATH = '/content/drive/MyDrive/68e8d1d70b66d_student_resource/student_resource/dataset/'


Mounted at /content/drive


In [4]:
import pandas as pd

train_df = pd.read_csv(DATA_PATH + 'train.csv')
test_df = pd.read_csv(DATA_PATH + 'test.csv')

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)
train_df.head()


Train shape: (75000, 4)
Test shape: (75000, 3)


Unnamed: 0,sample_id,catalog_content,image_link,price
0,33127,"Item Name: La Victoria Green Taco Sauce Mild, ...",https://m.media-amazon.com/images/I/51mo8htwTH...,4.89
1,198967,"Item Name: Salerno Cookies, The Original Butte...",https://m.media-amazon.com/images/I/71YtriIHAA...,13.12
2,261251,"Item Name: Bear Creek Hearty Soup Bowl, Creamy...",https://m.media-amazon.com/images/I/51+PFEe-w-...,1.97
3,55858,Item Name: Judee’s Blue Cheese Powder 11.25 oz...,https://m.media-amazon.com/images/I/41mu0HAToD...,30.34
4,292686,"Item Name: kedem Sherry Cooking Wine, 12.7 Oun...",https://m.media-amazon.com/images/I/41sA037+Qv...,66.49


In [5]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_absolute_error
import lightgbm as lgb


In [6]:
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'[^a-z0-9 ]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

train_df['clean_text'] = train_df['catalog_content'].apply(clean_text)
test_df['clean_text'] = test_df['catalog_content'].apply(clean_text)

print("✅ Text cleaned successfully!")


✅ Text cleaned successfully!


In [7]:

tfidf = TfidfVectorizer(max_features=8000, ngram_range=(1,2))
X_train = tfidf.fit_transform(train_df['clean_text'])
X_test = tfidf.transform(test_df['clean_text'])
y_train = train_df['price']

print("✅ TF-IDF vectorization complete!")
print("X_train shape:", X_train.shape)


✅ TF-IDF vectorization complete!
X_train shape: (75000, 8000)


In [8]:
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
print("Train/Val split done:", X_tr.shape, X_val.shape)


Train/Val split done: (60000, 8000) (15000, 8000)


In [10]:
import lightgbm as lgb

model = lgb.LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    num_leaves=64,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    device='gpu'
)


In [11]:
from lightgbm import early_stopping, log_evaluation

model.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    eval_metric='mae',
    callbacks=[early_stopping(stopping_rounds=50), log_evaluation(100)]
)


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1113617
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 8000
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 24 dense feature groups (1.37 MB) transferred to GPU in 0.003137 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 23.598634
Training until validation scores don't improve for 50 rounds
[100]	valid_0's l1: 13.657	valid_0's l2: 1112.56
[200]	valid_0's l1: 13.2586	valid_0's l2: 1087.58
[300]	valid_0's l1: 13.0827	valid_0's l2: 1078.01
[400]	valid_0's l1: 12.9901	valid_0's l2: 1072.95
[500]	valid_0's l1: 12.9273	valid_0's l2: 1069.56
[600]	valid_0's l1: 12.8836	valid_0's l2: 1067.62
Early stopping, best iteration is:
[593]	valid_0's l1: 12.8849	valid_

In [12]:

y_pred = model.predict(X_test, num_iteration=model.best_iteration_)

print("Predictions shape:", y_pred.shape)
print(y_pred[:10])




Predictions shape: (75000,)
[17.03082193 30.25026812 23.20335596 18.09115205 53.84900396  6.16872644
 11.19034813 14.40754034 20.15985586  5.16734037]


In [14]:
print(test_df.columns.tolist())


['sample_id', 'catalog_content', 'image_link', 'clean_text']


In [15]:
submission = pd.DataFrame({
    'sample_id': test_df['sample_id'],
    'price': y_pred
})

submission.to_csv('submission.csv', index=False)
print("✅ submission.csv created successfully!")
submission.head()


✅ submission.csv created successfully!


Unnamed: 0,sample_id,price
0,100179,17.030822
1,245611,30.250268
2,146263,23.203356
3,95658,18.091152
4,36806,53.849004


In [16]:
from google.colab import files
files.download("submission.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>