In [1]:
import os
import configparser
from tqdm import tqdm
from helpers.helper_functions import *
from helpers.helper_classes import *
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import lightgbm as lgb

# Notebook-wide display settings: show up to 500 rows and never truncate columns.
pd.options.display.max_rows = 500
pd.options.display.max_columns = None

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def calculate_ndcg_prepare_results(y_pred, X_test, test_ideal):
    """Rank the rows of ``X_test`` by predicted grade and score the ranking.

    Parameters
    ----------
    y_pred : array-like
        Predicted grades, one per row of ``X_test``.
    X_test : pandas.DataFrame
        Feature frame; must contain ``srch_id`` and ``prop_id`` because they
        are used as sort and merge keys below.
    test_ideal : pandas.DataFrame
        Frame merged on ``['srch_id', 'prop_id']`` and passed to ``calc_NDCG``.

    Returns
    -------
    tuple
        ``(ndcg, df_res)``: the value returned by ``calc_NDCG`` and the ranked,
        merged result frame. The score is also printed.
    """
    ranked = (
        X_test
        .assign(pred_grades=y_pred)
        # Within each search, best predicted grade first.
        .sort_values(by=['srch_id', 'pred_grades'], ascending=[True, False])
        .merge(test_ideal, on=['srch_id', 'prop_id'], how='left')
    )

    score = calc_NDCG(test_ideal, ranked)
    print(f"result final:{score}")

    return score, ranked

def fill_nan_except(df, fill_value, exclude_cols, replace_inf=False):
    """Return a copy of ``df`` with NaNs filled in every non-excluded column.

    The input frame is never modified: the work is done on a copy, which also
    avoids pandas' ``SettingWithCopyWarning`` when ``df`` is itself a slice of
    another frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean.
    fill_value : scalar
        Value substituted for NaN (and for +/-inf when ``replace_inf`` is set).
    exclude_cols : container of column labels
        Columns whose NaNs must be left untouched.
    replace_inf : bool, default False
        When True, +/-inf is replaced by ``fill_value`` across the whole frame
        (excluded columns included, as in the original implementation).

    Returns
    -------
    pandas.DataFrame
        A new frame with the replacements applied.
    """
    fill_cols = [col for col in df.columns if col not in exclude_cols]

    # Work on an independent copy so the caller's frame stays untouched.
    result = df.copy()

    if fill_cols:
        result[fill_cols] = result[fill_cols].fillna(fill_value)

    if replace_inf:
        result = result.replace([np.inf, -np.inf], fill_value)

    return result

def get_top_features(df, feature_imp, n_features):
    """Subset ``df`` to ``srch_id`` plus the first ``n_features`` listed features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``srch_id`` and the feature columns.
    feature_imp : pandas.DataFrame
        Table with a ``'Feature'`` column; its first ``n_features`` rows are
        taken (presumably sorted by decreasing importance; confirm upstream).
    n_features : int
        Number of rows of ``feature_imp`` to use.

    Returns
    -------
    pandas.DataFrame
        An independent copy with ``srch_id`` as the first column followed by
        the selected features. ``srch_id`` appears only once even if it is
        itself listed among the selected features.
    """
    top_features = feature_imp.iloc[:n_features]['Feature'].tolist()

    # Put srch_id first, and skip it in the feature list so the column is not
    # selected twice (duplicate column labels break later column operations).
    columns = ['srch_id'] + [name for name in top_features if name != 'srch_id']

    # .copy() detaches the result from df so later assignments do not raise
    # SettingWithCopyWarning or write through to the source frame.
    return df[columns].copy()



In [3]:
# Read config.ini file
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Check it so a missing config fails with a clear
# message instead of a KeyError on config['PATH'] below.
# NOTE(review): the path is relative, and the working directory is changed
# right after; re-running this cell only works if ROOT_DIR also contains
# src/config.ini. Confirm.
if not config.read('src/config.ini'):
    raise FileNotFoundError(
        f"Could not read 'src/config.ini' (current directory: {os.getcwd()})"
    )
os.chdir(config['PATH']['ROOT_DIR'])

# Load data
int_dir = config['PATH']['INT_DIR']
df = pd.read_parquet(f"{int_dir}/training_set_preprocessed_nodrop.parquet", engine='auto')
df_test = pd.read_parquet(f"{int_dir}/test_set_preprocessed_nodrop.parquet", engine='auto')
# 10% reproducible sample of the training data.
df_mini = df.sample(frac=0.1, random_state=7)

feature_imp = pd.read_csv(f"{int_dir}/feature_importance.csv", index_col=0)

categorical_features = ['hour', 'day', 'month', 'day_of_week', 'site_id', 'visitor_location_country_id', 'prop_country_id', 'prop_id', 'srch_destination_id']

# Cast the categorical columns to pandas' category dtype in both frames.
for col in categorical_features:
    df[col] = df[col].astype('category')
    df_test[col] = df_test[col].astype('category')



## Construct training and validation set

In [4]:
# Split the data; X_feature / y_feature are only used to build the aggregate
# tables below, X_train / X_test receive them through merge_and_drop.
(
    X_feature, X_train, X_test,
    y_feature, y_train, y_test,
    test_ideal,
) = train_val_test_split(df, 'target', test_size=.15, val_size=.03, random_state=7)

# "Desire" tables from the feature split: default target, then booking_bool.
_, desire_df_click = construct_desire(X_feature)
_, desire_df_book = construct_desire(X_feature, target='booking_bool')

# Occurrence counts per property in the feature split.
prop_freq = X_feature['prop_id'].value_counts()
prop_counts = pd.DataFrame({'prop_id': prop_freq.index, 'count': prop_freq.values})

# Occurrence counts per search destination in the feature split.
dest_freq = X_feature['srch_destination_id'].value_counts()
srch_dest_counts = pd.DataFrame({'srch_destination_id': dest_freq.index, 'count': dest_freq.values})

# (table, merge key) pairs handed to merge_and_drop.
# NOTE(review): both count tables expose a column named 'count'; confirm that
# merge_and_drop disambiguates them as intended.
merge_df_list = [
    (desire_df_click, 'prop_id'),
    (desire_df_book, 'prop_id'),
    (prop_counts, 'prop_id'),
    (srch_dest_counts, 'srch_destination_id'),
]

X_train_int = merge_and_drop(X_train, merge_df_list)
X_test_int = merge_and_drop(X_test, merge_df_list)



# Fill NaNs in all columns except: ['hour', 'day', 'month', 'day_of_week', 'site_id', 'visitor_location_country_id', 'prop_country_id', 'prop_id', 'srch_destination_id']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test.sort_values(by=['srch_id', target_str], ascending=[True, False], inplace=True)


In [15]:
# Keep srch_id plus the 25 first-listed features from feature_imp.
X_train = get_top_features(X_train_int, feature_imp, 25)
X_test = get_top_features(X_test_int, feature_imp, 25)

# Fill NaN / +-inf with 0 everywhere except in the categorical columns.
X_train = fill_nan_except(X_train, 0, categorical_features, replace_inf=True)
X_test = fill_nan_except(X_test, 0, categorical_features, replace_inf=True)

# Random order of X_test (deterministic because of the fixed random_state).
X_test = X_test.sample(frac=1, random_state=7)

# Align y_test with the shuffled X_test by position.
# assumes X_test_int carries a 0..n-1 positional index - TODO confirm
# The guard keeps this cell idempotent: X_test is rebuilt from X_test_int on
# every run, but y_test keeps its state between runs. Without the guard a
# second run would permute the already-shuffled labels again and silently
# misalign them with X_test.
if not y_test.index.equals(X_test.index):
    y_test = y_test.reset_index(drop=True).loc[X_test.index]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[fill_cols] = df[fill_cols].fillna(fill_value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[fill_cols] = df[fill_cols].fillna(fill_value)


## KNN

In [None]:
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor
import numpy as np
import wandb

def objective(trial):
    """Optuna objective: fit a KNN regressor and return its NDCG on ``X_test``.

    Hyperparameters are sampled from ``trial``, the run is logged to Weights &
    Biases (project ``DMT-2023``, group ``KNN_optuna_day2``), and the score
    produced by ``calculate_ndcg_prepare_results`` is returned for Optuna to
    maximise. Reads the notebook-level ``X_train``, ``y_train``, ``X_test``
    and ``test_ideal``.
    """
    # Sampled in a fixed order; the same dict is used as the wandb config and
    # as the estimator's keyword arguments.
    params = {
        'n_neighbors': trial.suggest_int('n_neighbors', 1, 30),
        'weights': trial.suggest_categorical('weights', ['uniform', 'distance']),
        'algorithm': trial.suggest_categorical('algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute']),
        'leaf_size': trial.suggest_int('leaf_size', 1, 50),
        'p': trial.suggest_categorical('p', [1, 2]),
    }

    wandb.init(project='DMT-2023', group='KNN_optuna_day2', config=params, reinit=True, allow_val_change=True)

    model = KNeighborsRegressor(**params)

    # srch_id is only a grouping key, not a model input.
    train_features = X_train.drop(['srch_id'], axis=1)
    test_features = X_test.drop(['srch_id'], axis=1)

    model.fit(train_features, y_train)
    predictions = model.predict(test_features)
    score, _ = calculate_ndcg_prepare_results(predictions, X_test, test_ideal)

    wandb.log({'ndcg_final': score})
    wandb.finish()
    return score

# Run 50 Optuna trials maximising the value returned by objective().
print("Running optuna study...")
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

print("Best trial:")
trial = study.best_trial

# Build the report once; it is shown on stdout and written to disk unchanged.
report_lines = [f"  Value: {trial.value}", "  Params: "]
report_lines.extend(f"    {key}: {value}" for key, value in trial.params.items())

for report_line in report_lines:
    print(report_line)

# Save params to txt
with open('paramsknn.txt', 'w') as report_file:
    report_file.write("\n".join(report_lines) + "\n")


## Random Forest

In [6]:
# Random forest regressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

# Fixed hyperparameters for the random forest baseline.
params_rf = {
    'n_estimators': 872,
    'max_depth': 12,
    'min_samples_split': 2,
    'min_samples_leaf': 5,
    'max_features': 'sqrt',
    'bootstrap': True,
    'random_state': 42,
    'n_jobs': -1,
}

rf = RandomForestRegressor(**params_rf)
print("Fitting...")
# srch_id is only a grouping key, not a model input.
rf.fit(X_train.drop(['srch_id'], axis=1), y_train)
print("Done fitting")
print("Predicting...")
pred_rf = rf.predict(X_test.drop(['srch_id'], axis=1))
print("Done predicting")

# calculate_ndcg_prepare_results already prints "result final:<ndcg>", so the
# score is not printed a second time here.
ndcg_rf, df_res_rf = calculate_ndcg_prepare_results(pred_rf, X_test, test_ideal)

Fitting...
Done fitting
Predicting...
Done predicting
result final:0.3697587048324351
result final:0.3697587048324351


In [None]:
import optuna
from sklearn.metrics import ndcg_score
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
import numpy as np
import wandb

def objective(trial):
    """Optuna objective: fit a random forest and return its NDCG on ``X_test``.

    Hyperparameters are sampled from ``trial``, the run is logged to Weights &
    Biases (project ``DMT-2023``, group ``Random_forest_optuna``), and the
    score produced by ``calculate_ndcg_prepare_results`` is returned for
    Optuna to maximise. Reads the notebook-level ``X_train``, ``y_train``,
    ``X_test`` and ``test_ideal``.
    """
    # Sampled in a fixed order; the same dict is used as the wandb config and
    # as the estimator's keyword arguments.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 10, 30),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 5),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
        # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3; for
        # regressors it meant "use all features", which is spelled 1.0.
        'max_features': trial.suggest_categorical('max_features', [1.0, 'sqrt']),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
        'random_state': 42,
        'n_jobs': -1,
    }

    wandb.init(project='DMT-2023', group='Random_forest_optuna', config=params, reinit=True, allow_val_change=True)

    rf = RandomForestRegressor(**params)

    # srch_id is only a grouping key, not a model input.
    rf.fit(X_train.drop(['srch_id'], axis=1), y_train)
    pred_rf = rf.predict(X_test.drop(['srch_id'], axis=1))
    ndcg_rf, _ = calculate_ndcg_prepare_results(pred_rf, X_test, test_ideal)

    wandb.log({'ndcg_final': ndcg_rf})
    wandb.finish()
    return ndcg_rf

# Run 100 Optuna trials maximising the value returned by objective().
print("Running optuna study...")
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

print("Best trial:")
trial = study.best_trial

# Build the report once; it is shown on stdout and written to disk unchanged.
report_lines = [f"  Value: {trial.value}", "  Params: "]
report_lines.extend(f"    {key}: {value}" for key, value in trial.params.items())

for report_line in report_lines:
    print(report_line)

# Save params to txt
with open('paramsrf.txt', 'w') as report_file:
    report_file.write("\n".join(report_lines) + "\n")
