In [25]:
import os
import pickle

import pandas as pd
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [3]:
import mlflow

# Tracking server and experiment that every run in this notebook logs to.
MLFLOW_TRACKING_URI = 'http://MLflow-MLFLO-QpxAlKAMoAEb-7f0b3c0531d6ae08.elb.us-east-1.amazonaws.com'
MLFLOW_EXPERIMENT_NAME = 'customer-satisfaction_ml'

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
# Kept as the last expression so the cell still displays the Experiment object.
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)


<Experiment: artifact_location='s3://mlflow-artifacts-823124982163/5', creation_time=1720550466265, experiment_id='5', last_update_time=1720550466265, lifecycle_stage='active', name='customer-satisfaction_ml', tags={}>

In [4]:
def read_data(filename):
    """Load the customer-satisfaction table and prepare its dtypes and ratio features.

    Parameters
    ----------
    filename : str
        Path to a ``.csv`` or ``.parquet`` file. The file must contain the
        columns cast and combined below (Gender, VisitFrequency,
        PreferredCuisine, TimeOfVisit, DiningOccasion, MealType, Income,
        AverageSpend, GroupSize).

    Returns
    -------
    pandas.DataFrame
        The loaded frame with the six descriptive columns cast to
        ``category``, three ratio features appended, and every remaining
        column cast to ``float``.

    Raises
    ------
    ValueError
        If ``filename`` has neither a ``.csv`` nor a ``.parquet`` extension.
    """
    if filename.endswith('.csv'):
        df = pd.read_csv(filename)
    elif filename.endswith('.parquet'):
        df = pd.read_parquet(filename)
    else:
        # Without this branch ``df`` would be unbound and the function would
        # fail later with an unrelated UnboundLocalError.
        raise ValueError(
            f"Unsupported file type for {filename!r}: expected a '.csv' or '.parquet' file"
        )

    categorical_columns = [
        'Gender',
        'VisitFrequency',
        'PreferredCuisine',
        'TimeOfVisit',
        'DiningOccasion',
        'MealType',
    ]
    for col in categorical_columns:
        df[col] = df[col].astype('category')

    # Ratio features. A zero AverageSpend or GroupSize yields inf/NaN here;
    # no guard is applied.
    df['Income_per_AverageSpend'] = df['Income'] / df['AverageSpend']
    df['AverageSpend_per_GroupSize'] = df['AverageSpend'] / df['GroupSize']
    df['Income_per_GroupSize'] = df['Income'] / df['GroupSize']

    # Everything that is not categorical is treated as numeric and stored as float.
    non_categorical = df.select_dtypes(exclude='category').columns
    df = df.astype({col: float for col in non_categorical})
    return df
    

In [5]:
# Load the raw CSV; read_data also applies the category/float casts and adds the ratio features.
df = read_data('../data/restaurant_customer_satisfaction.csv')

In [6]:
def data_encode(df):
    """Return a copy of ``df`` with its categorical columns encoded as integers.

    VisitFrequency, PreferredCuisine, TimeOfVisit and DiningOccasion are
    one-hot encoded (first level dropped, integer indicators). Gender and
    MealType are replaced by integer codes from a ``LabelEncoder``.

    The fitted encoder is not returned, so the label mapping cannot be
    reapplied to new data from the result of this function alone.
    """
    one_hot_columns = ['VisitFrequency', 'PreferredCuisine', 'TimeOfVisit', 'DiningOccasion']
    label_columns = ['Gender', 'MealType']

    encoded = pd.get_dummies(df, columns=one_hot_columns, dtype=int, drop_first=True)

    # A single encoder instance is refitted for each column in turn.
    encoder = LabelEncoder()
    for name in label_columns:
        encoded[name] = encoder.fit_transform(encoded[name])

    return encoded

# Overwrite df with its encoded form. The one-hot source columns are consumed,
# so re-running this cell without reloading df fails on the missing columns.
df = data_encode(df)


In [7]:
# Numeric columns that are standardised before modelling.
num_columns = ['Age', 'Income', 'AverageSpend', 'GroupSize', 'WaitTime', 'ServiceRating', 'FoodRating', 'AmbianceRating']

# Separate features from the target. df itself is left untouched so this cell
# can be re-run safely.
x = df.drop('HighSatisfaction', axis=1)
y = df['HighSatisfaction']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only and reuse those statistics for the
# test rows; fitting on the full dataset would leak test-set information into
# the training features. The scaled arrays are assigned directly, avoiding any
# reliance on index alignment.
scaler = StandardScaler()
x_train = x_train.copy()
x_test = x_test.copy()
x_train[num_columns] = scaler.fit_transform(x_train[num_columns])
x_test[num_columns] = scaler.transform(x_test[num_columns])



In [15]:
def training(model, x_train, y_train, x_val, y_val, params):
    """Fit ``model``, log the run to MLflow, and pickle it together with the scaler.

    Parameters
    ----------
    model : estimator
        Unfitted classifier exposing ``fit`` and ``predict``; ``predict_proba``
        is used for the ROC AUC when available.
    x_train, y_train
        Training features and labels.
    x_val, y_val
        Evaluation features and labels used for the logged metrics.
    params : dict
        Hyperparameters logged to MLflow. They are only logged here; ``model``
        is expected to have been built with them already.

    Returns
    -------
    estimator
        The fitted ``model``.

    Notes
    -----
    Relies on the module-level ``scaler`` fitted in the preprocessing cell; it
    is pickled alongside the model as a ``(scaler, model)`` tuple.
    """
    with mlflow.start_run(run_name='mlops_project'):
        mlflow.set_tag("developer", "daisy_lin")
        mlflow.log_param("train-data-path", "data/restaurant_customer_satisfaction.csv")
        mlflow.log_params(params)

        model.fit(x_train, y_train)

        y_pred = model.predict(x_val)
        # ROC AUC needs a ranking score, not hard labels. Use the probability of
        # the second class (assumes a binary target) and fall back to the labels
        # only for estimators without predict_proba.
        if hasattr(model, 'predict_proba'):
            y_score = model.predict_proba(x_val)[:, 1]
        else:
            y_score = y_pred
        roc_auc = roc_auc_score(y_val, y_score)
        acc = accuracy_score(y_val, y_pred)
        mlflow.log_metric("roc", roc_auc)
        # The misspelled metric key is kept so runs stay comparable with ones already logged.
        mlflow.log_metric("accurancy", acc)

        # The output directory may not exist on a fresh checkout.
        os.makedirs('models', exist_ok=True)
        with open('models/lin_rf.bin', 'wb') as f_out:
            # Persist the model that was actually trained here, not the global model_rf.
            pickle.dump((scaler, model), f_out)

        mlflow.log_artifact(local_path="models/lin_rf.bin", artifact_path="models_pickle")
    return model


In [16]:
# Random-forest hyperparameters; the same dict is passed to training() so that
# MLflow records exactly what the model was built with.
params_rf = dict(
    max_depth=40,
    min_samples_leaf=2,
    min_samples_split=4,
    n_estimators=1000,
    random_state=40,
)

model_rf = RandomForestClassifier(**params_rf)

In [17]:
# Fit the random forest and log the run to MLflow; the held-out test split is
# passed as the validation data. The returned model is the cell's displayed output.
training(model_rf, x_train, y_train, x_test, y_test, params_rf)

In [34]:
def xgb_training(x_train, y_train, x_test, y_test):
    """Tune an XGBoost classifier with hyperopt, then log and pickle the best model.

    Runs 100 TPE trials inside a single MLflow run, maximising accuracy on
    ``(x_test, y_test)``. The best configuration is then refitted on the
    training data, evaluated, and pickled together with the module-level
    ``scaler`` as a ``(scaler, model)`` tuple.

    Parameters
    ----------
    x_train, y_train
        Training features and labels.
    x_test, y_test
        Evaluation features and labels used both to score each trial and to
        report the final metrics.

    Returns
    -------
    xgboost.XGBClassifier
        The classifier refitted with the best hyperparameters found.

    Notes
    -----
    NOTE(review): the same split is used for model selection and for the
    reported metrics, so those metrics are optimistic. A separate validation
    split would be needed for an unbiased estimate.
    """
    # Values that are not searched over; fmin() does not echo them back in its
    # result, so they are kept separately for the final refit.
    fixed_params = {'n_estimators': 180, 'seed': 0}
    space = {
        'max_depth': hp.quniform("max_depth", 3, 18, 1),
        'gamma': hp.uniform('gamma', 1, 9),
        'reg_alpha': hp.quniform('reg_alpha', 40, 180, 1),
        'reg_lambda': hp.uniform('reg_lambda', 0, 1),
        'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
        'min_child_weight': hp.quniform('min_child_weight', 0, 10, 1),
        **fixed_params,
    }

    def build_model(params):
        """Create an unfitted XGBClassifier from a sampled or best parameter dict."""
        return xgb.XGBClassifier(
            n_estimators=int(params['n_estimators']),
            # quniform yields whole-number floats; tree depth must be an int.
            max_depth=int(params['max_depth']),
            gamma=params['gamma'],
            reg_alpha=int(params['reg_alpha']),
            reg_lambda=params['reg_lambda'],
            min_child_weight=int(params['min_child_weight']),
            # Must stay a float: int() would truncate every sampled value in [0.5, 1) to 0.
            colsample_bytree=params['colsample_bytree'],
            random_state=int(params['seed']),
        )

    def evaluate(fitted_model):
        """Return (roc_auc, accuracy) of a fitted model on the evaluation split."""
        y_pred = fitted_model.predict(x_test)
        # ROC AUC is computed from the positive-class probability (binary target
        # assumed), not from hard labels.
        y_score = fitted_model.predict_proba(x_test)[:, 1]
        return roc_auc_score(y_test, y_score), accuracy_score(y_test, y_pred)

    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")

        def objective(params):
            """Hyperopt objective: fit one candidate and report negative accuracy as the loss."""
            candidate = build_model(params)
            candidate.fit(x_train, y_train)
            roc_auc, score = evaluate(candidate)
            mlflow.log_metric("roc", roc_auc)
            # The misspelled metric key is kept so runs stay comparable with ones already logged.
            mlflow.log_metric("accurancy", score)
            return {'loss': -score, 'status': STATUS_OK}

        trials = Trials()
        best_params = fmin(objective, space, algo=tpe.suggest, max_evals=100, trials=trials)
        mlflow.log_params(best_params)

        # Refit the winning configuration so the persisted artifact is the tuned
        # XGBoost model rather than an unrelated estimator.
        best_model = build_model({**fixed_params, **best_params})
        best_model.fit(x_train, y_train)
        roc_auc, score = evaluate(best_model)
        # Logged last so the run's latest metric values describe the saved model.
        mlflow.log_metric("roc", roc_auc)
        mlflow.log_metric("accurancy", score)

        # The output directory may not exist on a fresh checkout.
        os.makedirs('models', exist_ok=True)
        with open('models/lin_xbg.bin', 'wb') as f_out:
            pickle.dump((scaler, best_model), f_out)

        mlflow.log_artifact(local_path="models/lin_xbg.bin", artifact_path="models_pickle")
    return best_model



In [None]:
# Run the XGBoost hyperparameter search on the train/test split and log it to MLflow.
xgb_training(x_train, y_train, x_test, y_test)