In [1]:
!pip install optuna
!pip install catboost

In [2]:
# Initial Settings
from google.colab import drive
drive.mount('/content/drive')

%cd drive/MyDrive/Colab_Notebooks/DMLab/RC_System

In [3]:
# Library Import
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import optuna
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold

# Seed
Seed = 42

#**<font color='white'>< Functions ></font>**

In [4]:
# Data preprocessor
def preprocessor(df) :
  # Categorizing 'Age'
  df['Age'] = df['Age'].apply(lambda x : 30 if x == 0 or x >= 100 else x // 10 * 10)
  df['Gen'] = df['Age'].astype(int).astype(str) + ' Gen'

  # Split 'Location' into new features
  df['City'] = df['Location'].str.split(',').str[0]
  df['State'] = df['Location'].str.split(',').str[1]
  # String Manipulation
  pattern = r'[^\w\s\d]'
  df['City'] = df['City'].apply(lambda x : re.sub(pattern, '', x))
  df['State'] = df['State'].apply(lambda x : re.sub(pattern, '', x))
  df['City'] = df['City'].apply(lambda x : x.strip())
  df['State'] = df['State'].apply(lambda x : x.strip())
  # Handling the missing values
  df.loc[(df['City'] == 'na') | (df['City'] == ''), 'City'] = 'unknown'
  df.loc[(df['State'] == 'na') | (df['State'] == ''), 'State'] = 'unknown'

  # Manipulating 'Book-Author'
  df['Book-Author'] = df['Book-Author'].apply(lambda x : re.sub(pattern, '', x))
  df['Book-Author'] = df['Book-Author'].apply(lambda x : x.strip())
  df['Book-Author'] = df['Book-Author'].str.lower()

  # Categorizing 'Year-Of-Publication'
  df['Year-Of-Publication'] = df['Year-Of-Publication'].apply(lambda x : 1990 if x < 1800 else x // 10 * 10)
  df['Year-Of-Publication'] = df['Year-Of-Publication'].astype(int).astype(str) + ' Year'

  # Manipulating 'Publisher'
  df['Publisher'] = df['Publisher'].apply(lambda x : re.sub(pattern, '', x))
  df['Publisher'] = df['Publisher'].apply(lambda x : x.strip())
  df['Publisher'] = df['Publisher'].str.lower()

  # New Feature
  # df['Clustering'] = df['Book-Rating'].apply(lambda x : 0 if x == 0 else 1)

  # Drop the column 'Book-Title'
  # df = df[['User-ID', 'Book-ID', 'Gen', 'City', 'State', 'Book-Author', 'Year-Of-Publication', 'Publisher', ' Clustering']]
  df = df[['User-ID', 'Book-ID', 'Gen', 'City', 'State', 'Book-Author', 'Year-Of-Publication', 'Publisher']]

  return df

# Hyperparameter Tuner
def objective(trial, seed, cat_columns, X_train, X_valid, y_train, y_valid) :

  # Define the params
  ir = trial.suggest_int('iterations', 100, 1000)
  lr = trial.suggest_float('learning_rate', 1e-3, 0.1)
  dp = trial.suggest_int('depth', 1, 10)
  l2 = trial.suggest_int('l2_leaf_reg', 1, 10)
  rs = trial.suggest_float('random_strength', 0.1, 10)
  bt = trial.suggest_float('bagging_temperature', 0.1, 10)
  ss = trial.suggest_float('subsample', 0.01, 1.0)
  md = trial.suggest_int('min_data_in_leaf', 1, 100)

  # List params
  cat_params = {
      'objective' : 'RMSE',
      'random_state' : seed,
      'task_type' : 'GPU',
      'iterations' : ir,
      'learning_rate' : lr,
      'depth' : dp,
      'l2_leaf_reg' : l2,
      'random_strength' : rs,
      'bagging_temperature' : bt,
      'subsample' : ss,
      'min_data_in_leaf' : md,
      'bootstrap_type' : 'Poisson'
  }

  # Define Model
  cbrm = CatBoostRegressor(
      **cat_params,
      cat_features = cat_columns
  )

  max_epochs = 1

  # Fitting the model
  cbrm.fit(
      X_train, y_train,
      eval_set=[(X_valid, y_valid)],
      early_stopping_rounds=100,
      verbose=1000
  )

  pred = cbrm.predict(X_valid)
  MSE = mean_squared_error(y_valid, pred)

  # return minimun loss
  return MSE

#**<font color='white'>1. Load Dataset & Preprocessing</font>**

In [5]:
# Load Dataset
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
submit = pd.read_csv('sample_submission.csv')

In [6]:
label = train_df['Book-Rating']
train_df = preprocessor(train_df)

In [7]:
test_df = preprocessor(test_df)
test_df

Unnamed: 0,User-ID,Book-ID,Gen,City,State,Book-Author,Year-Of-Publication,Publisher
0,USER_00008,BOOK_047966,30 Gen,vermilion,ohio,ja jance,2000 Year,avon
1,USER_00008,BOOK_119494,30 Gen,vermilion,ohio,john saul,2000 Year,ballantine books
2,USER_00008,BOOK_151775,30 Gen,vermilion,ohio,david m noer,1990 Year,josseybass
3,USER_00008,BOOK_176255,30 Gen,vermilion,ohio,ann rule,1990 Year,pocket
4,USER_00008,BOOK_187307,30 Gen,vermilion,ohio,betty j eadie,1990 Year,bantam books
...,...,...,...,...,...,...,...,...
159616,USER_92086,BOOK_159050,30 Gen,mountain view,california,ann louise gittleman,2000 Year,mcgrawhillcontemporary books
159617,USER_92086,BOOK_196481,30 Gen,mountain view,california,david howarth,1990 Year,the lyons press
159618,USER_92086,BOOK_199754,30 Gen,mountain view,california,raymond floyd,1990 Year,perennial
159619,USER_92086,BOOK_227481,30 Gen,mountain view,california,susan fox rogers,1990 Year,seal press wa


#**<font color='white'>2. Split Train & Valid Dataset</font>**

In [8]:
X_train, X_valid, y_train, y_valid = train_test_split(train_df, label, test_size=0.1, random_state=Seed, stratify=label)

#**<font color='white'>3. Hyperparameter Tuning</font>**

In [9]:
cat_columns = ['User-ID', 'Book-ID', 'Gen', 'City', 'State', 'Book-Author', 'Year-Of-Publication', 'Publisher']

trace = optuna.create_study(study_name='CatBoostRegressor', direction="minimize")
trace.optimize(lambda trial : objective(trial, Seed, cat_columns, X_train, X_valid, y_train, y_valid), n_trials=1)
print()
print("Best MSE: ", trace)

[I 2024-02-05 23:31:07,641] A new study created in memory with name: CatBoostRegressor


0:	learn: 3.8349366	test: 3.8338007	best: 3.8338007 (0)	total: 73.8ms	remaining: 15.6s
211:	learn: 3.3669380	test: 3.3054476	best: 3.3054476 (211)	total: 13.6s	remaining: 0us
bestTest = 3.305447632
bestIteration = 211


[I 2024-02-05 23:31:30,600] Trial 0 finished with value: 10.925985072327338 and parameters: {'iterations': 212, 'learning_rate': 0.016925248452611036, 'depth': 5, 'l2_leaf_reg': 4, 'random_strength': 3.140427121705059, 'bagging_temperature': 7.229020828035751, 'subsample': 0.5876900496382693, 'min_data_in_leaf': 86}. Best is trial 0 with value: 10.925985072327338.



Best MSE:  <optuna.study.study.Study object at 0x7ac9048351e0>


#**<font color='white'>4. Model Training & Predict </font>**

In [10]:
kf = StratifiedKFold(n_splits = 10, random_state = Seed, shuffle = True)

cat_params=trace.best_params
cat_params.update({
    'objective' : 'RMSE',
    'random_state' : Seed,
    'task_type' : 'GPU',
    'bootstrap_type' : 'Poisson',
    'cat_features': cat_columns
})

pred = np.zeros(test_df.shape[0])
for idx, split in enumerate(kf.split(train_df, label)) :
  X_train, y_train = train_df.loc[split[0]], label[split[0]]
  X_valid, y_valid = train_df.loc[split[1]], label[split[1]]

  cbrm = CatBoostRegressor(**cat_params)
  cbrm.fit(
      X_train, y_train,
      eval_set=[(X_valid, y_valid)],
      early_stopping_rounds=100,
      verbose=1000
  )

  k_pred = cbrm.predict(test_df) / kf.n_splits
  pred += k_pred

0:	learn: 3.8349127	test: 3.8338764	best: 3.8338764 (0)	total: 68.8ms	remaining: 14.5s
211:	learn: 3.3665860	test: 3.3097561	best: 3.3097561 (211)	total: 13.6s	remaining: 0us
bestTest = 3.309756124
bestIteration = 211
0:	learn: 3.8349399	test: 3.8339616	best: 3.8339616 (0)	total: 69.1ms	remaining: 14.6s
211:	learn: 3.3676757	test: 3.3098988	best: 3.3098988 (211)	total: 14.9s	remaining: 0us
bestTest = 3.30989882
bestIteration = 211
0:	learn: 3.8349045	test: 3.8340542	best: 3.8340542 (0)	total: 71ms	remaining: 15s
211:	learn: 3.3659322	test: 3.3147127	best: 3.3147127 (211)	total: 13.5s	remaining: 0us
bestTest = 3.314712653
bestIteration = 211
0:	learn: 3.8349446	test: 3.8339055	best: 3.8339055 (0)	total: 71.2ms	remaining: 15s
211:	learn: 3.3663975	test: 3.3075090	best: 3.3075090 (211)	total: 13.7s	remaining: 0us
bestTest = 3.307509013
bestIteration = 211
0:	learn: 3.8349322	test: 3.8339203	best: 3.8339203 (0)	total: 68.8ms	remaining: 14.5s
211:	learn: 3.3667198	test: 3.3068239	best: 3.30

#**<font color='white'>5. Submit </font>**

In [12]:
submit['Book-Rating'] = pred
submit.loc[submit['Book-Rating'] > 10, 'Book-Rating'] = 10
submit.loc[submit['Book-Rating'] < 0, 'Book-Rating'] = 0

submit
submit.to_csv('submit.csv', index=False)