In [1]:
import pandas as pd
import pickle
from scipy.stats import randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
import xgboost as xgb

# global constants
# Name of the label column: it is dropped from the feature matrix and used
# as the prediction target when the data is split into train and test sets.
TARGET = "price_range"

## data preparation

In [11]:
# load data
df = pd.read_csv("data/train.csv")

# Open the pickle inside a context manager so the file handle is closed as
# soon as loading finishes, instead of being left open until garbage collection.
# NOTE(review): pickle.load can execute arbitrary code -- only load this file
# if it comes from a trusted source.
with open("data/feat_dict.pickle", "rb") as feat_file:
    feat_dict = pickle.load(feat_file)

In [3]:
# hold out 20% of the rows for evaluation; the fixed seed pins the partition
xtr, xts, ytr, yts = train_test_split(
    df.drop(columns=TARGET),  # features: every column except the label
    df[TARGET],               # labels
    random_state=42,
    test_size=0.2,
)

# sanity-check the dimensions of the four pieces
print(*(part.shape for part in (xtr, xts, ytr, yts)))

(1600, 20) (400, 20) (1600,) (400,)


## random forest

### base model

In [8]:
# Baseline: a random forest with default hyperparameters and a fixed seed.
# fit() returns the fitted estimator, so construction and training are chained.
rf = RandomForestClassifier(random_state=42).fit(xtr, ytr)

# Predict on the hold-out split and summarise per-class metrics.
ypr = rf.predict(xts)
print(classification_report(y_true=yts, y_pred=ypr))

              precision    recall  f1-score   support

           0       0.95      0.96      0.96       105
           1       0.89      0.87      0.88        91
           2       0.78      0.87      0.82        92
           3       0.94      0.87      0.90       112

    accuracy                           0.89       400
   macro avg       0.89      0.89      0.89       400
weighted avg       0.90      0.89      0.89       400



### hyperparameter optimization

In [9]:
# Untuned forest used as the template estimator for the search.
rf = RandomForestClassifier(random_state=42)

# Search space: 3 * 4 * 3 * 3 = 108 candidate combinations.
param_grid = dict(
    n_estimators=[100, 200, 300],   # number of trees
    max_depth=[None, 10, 20, 30],   # maximum depth of each tree
    min_samples_split=[2, 5, 10],   # minimum samples required to split a node
    min_samples_leaf=[1, 2, 4],     # minimum samples required in a leaf node
)

# Exhaustive grid search scored on accuracy with 5-fold cross-validation.
grid_search = GridSearchCV(rf, param_grid, scoring='accuracy', cv=5, n_jobs=-1)
grid_search.fit(xtr, ytr)
print("Best Hyperparameters:", grid_search.best_params_)

# Take the best estimator found by the search and evaluate it on the hold-out split.
rf = grid_search.best_estimator_
ypr = rf.predict(xts)
print(classification_report(y_true=yts, y_pred=ypr))

Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
              precision    recall  f1-score   support

           0       0.94      0.96      0.95       105
           1       0.89      0.86      0.87        91
           2       0.80      0.86      0.83        92
           3       0.93      0.88      0.91       112

    accuracy                           0.89       400
   macro avg       0.89      0.89      0.89       400
weighted avg       0.89      0.89      0.89       400



## xgboost

### base model

In [6]:
# Baseline: gradient-boosted trees with the library's default settings.
model = xgb.XGBClassifier()
model.fit(xtr, ytr)

# Predict on the hold-out split and summarise per-class metrics.
ypr = model.predict(xts)
print(classification_report(y_true=yts, y_pred=ypr))

              precision    recall  f1-score   support

           0       0.96      0.93      0.95       105
           1       0.85      0.95      0.90        91
           2       0.85      0.85      0.85        92
           3       0.94      0.88      0.91       112

    accuracy                           0.90       400
   macro avg       0.90      0.90      0.90       400
weighted avg       0.90      0.90      0.90       400



### hyperparameter optimization

In [10]:
# Untuned booster used as the template estimator for the search.
model = xgb.XGBClassifier()

# Search space: 3 * 2 * 2 * 2 * 2 = 48 candidate combinations.
param_grid = dict(
    max_depth=[3, 4, 5],            # maximum depth of trees
    learning_rate=[0.01, 0.1],      # learning rate
    n_estimators=[100, 200],        # number of boosting rounds (trees)
    subsample=[0.8, 1.0],           # fraction of samples used for training
    colsample_bytree=[0.8, 1.0],    # fraction of features used for training
)

# Exhaustive grid search scored on accuracy with 5-fold cross-validation.
grid_search = GridSearchCV(model, param_grid, scoring='accuracy', cv=5, n_jobs=-1)
grid_search.fit(xtr, ytr)
print("Best Hyperparameters:", grid_search.best_params_)

# Take the best estimator found by the search and evaluate it on the hold-out split.
model = grid_search.best_estimator_
ypr = model.predict(xts)
print(classification_report(y_true=yts, y_pred=ypr))

Best Hyperparameters: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200, 'subsample': 0.8}
              precision    recall  f1-score   support

           0       0.96      0.93      0.95       105
           1       0.86      0.93      0.89        91
           2       0.86      0.84      0.85        92
           3       0.93      0.90      0.91       112

    accuracy                           0.90       400
   macro avg       0.90      0.90      0.90       400
weighted avg       0.90      0.90      0.90       400

