In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split
import pickle
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [4]:
import mlflow
mlflow.set_tracking_uri('[your-mlflow-server]')
mlflow.set_experiment('restaurant_customer-satisfaction')


2024/07/18 10:41:35 INFO mlflow.tracking.fluent: Experiment with name 'restaurant_customer-satisfaction' does not exist. Creating a new experiment.


<Experiment: artifact_location='s3://mlflow-artifacts-823124982163/7', creation_time=1721313695589, experiment_id='7', last_update_time=1721313695589, lifecycle_stage='active', name='restaurant_customer-satisfaction', tags={}>

In [23]:
def read_data(filename):
    if filename.endswith('.csv'):
        df = pd.read_csv(filename)

    elif filename.endswith('.parquet'):
        df = pd.read_parquet(filename)
    df['Gender']=df.Gender.astype('category')
    df['VisitFrequency']=df.VisitFrequency.astype('category')
    df['PreferredCuisine']=df.PreferredCuisine.astype('category')
    df['TimeOfVisit']=df.TimeOfVisit.astype('category')
    df['DiningOccasion']=df.DiningOccasion.astype('category')
    df['MealType']=df.MealType.astype('category')
    df['DiningOccasion']=df.DiningOccasion.astype('category')
    df['Income_per_AverageSpend'] = df['Income'] / df['AverageSpend']
    df['AverageSpend_per_GroupSize'] = df['AverageSpend'] / df['GroupSize']
    df['Income_per_GroupSize'] = df['Income'] / df['GroupSize']

    for col in df.columns:
        if df[col].dtype != 'category':
            df[col] = df[col].astype(float)
    return df
    

In [24]:
df_data = read_data('../data/restaurant_customer_satisfaction.csv')

In [25]:
df_data.head

<bound method NDFrame.head of       CustomerID   Age  Gender    Income VisitFrequency  AverageSpend  \
0          654.0  35.0    Male   83380.0         Weekly     27.829142   
1          655.0  19.0    Male   43623.0         Rarely    115.408622   
2          656.0  41.0  Female   83737.0         Weekly    106.693771   
3          657.0  43.0    Male   96768.0         Rarely     43.508508   
4          658.0  55.0  Female   67937.0        Monthly    148.084627   
...          ...   ...     ...       ...            ...           ...   
1495      2149.0  39.0    Male  114857.0        Monthly    163.015254   
1496      2150.0  37.0  Female  133506.0         Weekly    190.991911   
1497      2151.0  46.0    Male  119159.0        Monthly    150.088604   
1498      2152.0  24.0    Male   27970.0         Weekly    196.363626   
1499      2153.0  51.0    Male  148333.0         Weekly    171.119498   

     PreferredCuisine TimeOfVisit  GroupSize DiningOccasion  ...  \
0             Chinese   B

In [26]:
columns_to_dummy = [
        "VisitFrequency",
        "PreferredCuisine",
        "TimeOfVisit",
        "DiningOccasion",
    ]
columns_to_encode = ["Gender", "MealType"]
df_data = pd.get_dummies(df_data, columns=columns_to_dummy, dtype=int, drop_first=True)
labelencoder = LabelEncoder()

for col in columns_to_encode:
    df_data[col] = labelencoder.fit_transform(df_data[col])



In [27]:
df_data

Unnamed: 0,CustomerID,Age,Gender,Income,AverageSpend,GroupSize,MealType,OnlineReservation,DeliveryOrder,LoyaltyProgramMember,...,VisitFrequency_Rarely,VisitFrequency_Weekly,PreferredCuisine_Chinese,PreferredCuisine_Indian,PreferredCuisine_Italian,PreferredCuisine_Mexican,TimeOfVisit_Dinner,TimeOfVisit_Lunch,DiningOccasion_Casual,DiningOccasion_Celebration
0,654.0,35.0,1,83380.0,27.829142,3.0,1,0.0,1.0,1.0,...,0,1,1,0,0,0,0,0,0,0
1,655.0,19.0,1,43623.0,115.408622,1.0,0,0.0,0.0,0.0,...,1,0,0,0,0,0,1,0,1,0
2,656.0,41.0,0,83737.0,106.693771,6.0,0,0.0,1.0,0.0,...,0,1,0,0,0,0,1,0,0,1
3,657.0,43.0,1,96768.0,43.508508,1.0,0,0.0,0.0,0.0,...,1,0,0,1,0,0,0,1,0,1
4,658.0,55.0,0,67937.0,148.084627,1.0,1,0.0,0.0,1.0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,2149.0,39.0,1,114857.0,163.015254,2.0,0,0.0,1.0,1.0,...,0,0,0,0,0,0,0,1,0,0
1496,2150.0,37.0,0,133506.0,190.991911,4.0,1,0.0,0.0,0.0,...,0,1,0,0,1,0,0,1,1,0
1497,2151.0,46.0,1,119159.0,150.088604,4.0,0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,1,1,0
1498,2152.0,24.0,1,27970.0,196.363626,6.0,0,1.0,1.0,0.0,...,0,1,0,0,1,0,1,0,1,0


In [29]:
num_columns = [
        "Age",
        "Income",
        "AverageSpend",
        "GroupSize",
        "WaitTime",
        "ServiceRating",
        "FoodRating",
        "AmbianceRating",
    ]
scaler = StandardScaler()
x_scaled = scaler.fit_transform(df_data[num_columns])
df_data[num_columns] = pd.DataFrame(x_scaled, columns=num_columns)

x = df_data.drop(target, axis = 1)
y = df_data[target]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)



In [30]:
def training(model, x_train, y_train, x_val, y_val, params):
    with mlflow.start_run(run_name='random_forest_classifier'):
        mlflow.set_tag("model", "random forest")
        mlflow.set_tag("developer", "daisy_lin")
        mlflow.log_param("train-data-path", "restaurant_customer_satisfaction.csv")
        mlflow.log_params(params)
    
        model.fit(x_train, y_train)

        y_pred = model.predict(x_val)
        roc_auc = roc_auc_score(y_val, y_pred)
        acc = accuracy_score(y_val, y_pred)
        mlflow.log_metric("roc", roc_auc)
        mlflow.log_metric("accurancy", acc)
        with open('models/lin_rf.bin', 'wb') as f_out:
            pickle.dump((scaler, labelencoder, model), f_out)

        mlflow.log_artifact(local_path="models/lin_rf.bin", artifact_path="models_pickle")
    return model


In [31]:
params_rf = {'max_depth': 40,
         'min_samples_leaf': 2,
         'min_samples_split': 4,
         'n_estimators': 1000,
         'random_state': 40}

model_rf = RandomForestClassifier(**params_rf)

In [32]:
training(model_rf, x_train, y_train, x_test, y_test, params_rf)

In [59]:
def xgb_training(x_train, y_train, x_test, y_test):
    with mlflow.start_run(run_name='xgboost_classifier'):
        mlflow.set_tag("model", "xgboost")
        mlflow.set_tag("developer", "daisy_lin")
        def objective(space):
            xgb_model=xgb.XGBClassifier(
                    n_estimators =space['n_estimators'], max_depth = int(space['max_depth']), gamma = space['gamma'],
                    reg_alpha = int(space['reg_alpha']),min_child_weight=int(space['min_child_weight']),
                    colsample_bytree=int(space['colsample_bytree']))
            xgb_model.fit(x_train, y_train)
            y_pred = xgb_model.predict(x_test)
            roc_auc = roc_auc_score(y_test, y_pred)
            score = accuracy_score(y_test, y_pred)
            
            return {'loss': -score, 'status': STATUS_OK}
        space={'max_depth': hp.quniform("max_depth", 3, 18, 1),
        'gamma': hp.uniform ('gamma', 1,9),
        'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
        'reg_lambda' : hp.uniform('reg_lambda', 0,1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
        'n_estimators': 180,
        'seed': 0
    }
        trials = Trials()
        best_params = fmin(objective, space, algo=tpe.suggest, max_evals=100, trials = trials)

        mlflow.log_params(best_params)
        params = {'max_depth' : best_params['max_depth'].astype(int),
        'gamma' : best_params['gamma'].astype(int),
        'reg_alpha' : best_params['reg_alpha'].astype(int) ,
        'reg_lambda' : best_params['reg_lambda'].astype(int) ,
        'colsample_bytree' : best_params['colsample_bytree'].astype(int)
        }
        xgb_model_best = xgb.XGBClassifier(**params)
        xgb_model_best.fit(x_train, y_train)
        y_pred = xgb_model_best.predict(x_test)
        roc_auc = roc_auc_score(y_test, y_pred)
        score = accuracy_score(y_test, y_pred)
        mlflow.log_metric("roc", roc_auc)
        mlflow.log_metric("accurancy", score)
        
        with open('models/lin_xbg.bin', 'wb') as f_out:
            pickle.dump((scaler, labelencoder, xgb_model_best), f_out)

        mlflow.log_artifact(local_path="models/lin_xbg.bin", artifact_path="models_pickle")



In [60]:
xgb_training(x_train, y_train, x_test, y_test)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:10<00:00,  9.13trial/s, best loss: -0.8633333333333333]
