In [2]:
import sys
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np

import warnings
warnings.filterwarnings('ignore')

import gc

import matplotlib.pyplot as plt; plt.style.use('ggplot')
import matplotlib.ticker as ticker
import seaborn as sns

from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV, LinearRegression
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GridSearchCV, RepeatedKFold, RepeatedStratifiedKFold, GroupKFold
from sklearn.inspection import PartialDependenceDisplay
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import SVR

from ydf import RandomForestLearner, GradientBoostedTreesLearner
import ydf

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

# csvをimportする

In [3]:
train = pd.read_csv(f"../data/train.csv", sep=',')
test = pd.read_csv(f"../data/test.csv", sep=',')
train.head()

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723,112.15875
1,1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537,68.88056
2,2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376,39.1732
3,3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.93722,80.60793
4,4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338,86.02312


In [4]:
extra_data = pd.read_csv(f"../data/training_extra.csv", sep=',')
extra_data.head()

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,500000,Under Armour,Canvas,Small,10.0,Yes,Yes,Tote,Blue,23.882052,114.11068
1,500001,Puma,Polyester,Small,4.0,No,Yes,Backpack,Green,11.869095,129.74972
2,500002,Jansport,Polyester,Small,8.0,Yes,Yes,Tote,Red,8.092302,21.3737
3,500003,Nike,Nylon,Large,7.0,No,No,Messenger,Pink,7.719581,48.09209
4,500004,Nike,Leather,Large,9.0,No,Yes,Tote,Green,22.741826,77.32461


In [5]:
sample_submission = pd.read_csv(f"../data/sample_submission.csv", sep=',')
sample_submission.head()

Unnamed: 0,id,Price
0,300000,81.411
1,300001,81.411
2,300002,81.411
3,300003,81.411
4,300004,81.411


# RandomForestLearner model over a 5-fold cross validation strategy.

In [7]:
skf = RepeatedKFold(n_splits=5, n_repeats=1, random_state=42)

ydf.verbose(-1)
scores, ydf_test_preds = [], []
for i, (train_index, test_index) in enumerate(skf.split(train)):

    print(f"------------ Working on Fold {i} ------------")
            
    X_train, X_test = train.iloc[train_index], train.iloc[test_index]
    
    ydf_md = RandomForestLearner(label='Price', 
                                 task=ydf.Task.REGRESSION, 
                                 num_threads=10, 
                                 num_trees=1000).train(X_train)
    ydf_pred = ydf_md.predict(X_test)

    score = mean_squared_error(X_test['Price'], ydf_pred, squared=False)
    print('Fold:', i, 'RMSE:', score)
    scores.append(score)

    ydf_test_preds.append(ydf_md.predict(test))

ydf_gb_oof_score = np.mean(scores)  
ydf_gb_std = np.std(scores)
print(f"The 5-fold average oof RMSE score of the RandomForestLearner model is {ydf_gb_oof_score}")
print(f"The 5-fold std oof RMSE score of the RandomForestLearner model is {ydf_gb_std}")

------------ Working on Fold 0 ------------
Fold: 0 RMSE: 38.9264409447617
------------ Working on Fold 1 ------------
Fold: 1 RMSE: 39.06147046223175
------------ Working on Fold 2 ------------
Fold: 2 RMSE: 39.044670658592274
------------ Working on Fold 3 ------------
Fold: 3 RMSE: 39.081248468946676
------------ Working on Fold 4 ------------
Fold: 4 RMSE: 39.0267872144942
The 5-fold average oof RMSE score of the RandomForestLearner model is 39.02812354980532
The 5-fold std oof RMSE score of the RandomForestLearner model is 0.05394296686664424


# csvをpreprocess_resultsに作成


In [12]:
sample_submission["Price"] = np.mean(ydf_test_preds, axis=0)
display(sample_submission.head())

path = f"../model_results/baseline_RF_sub.csv"
sample_submission.to_csv(path, index=False)

Unnamed: 0,id,Price
0,300000,90.431511
1,300001,88.220993
2,300002,87.255913
3,300003,89.270363
4,300004,79.143723
