In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# data_preprocessing.py

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

def load_and_preprocess_data(file_path):
    df = pd.read_csv(file_path)
    # Clean the data
    df = df[df['Age'] > 0]
    df['ScheduledDay'] = pd.to_datetime(df['ScheduledDay']).dt.dayofweek
    df['AppointmentDay'] = pd.to_datetime(df['AppointmentDay']).dt.dayofweek
    le = LabelEncoder()
    df['Gender'] = le.fit_transform(df['Gender'])
    df = df.drop(['PatientId', 'AppointmentID', 'Neighbourhood'], axis=1)
    df['No-show'] = df['No-show'].map({'Yes': 1, 'No': 0})

    X = df[['Age', 'ScheduledDay', 'AppointmentDay', 'Gender']]
    y = df['No-show']

    return train_test_split(X, y, test_size=0.2, random_state=42)


In [1]:
import pandas as pd
import numpy as np

df=pd.DataFrame({
    'Wait_time':np.random.randint(1,60,100),
    'Staff_friendliness':np.random.randint(1,5,100),
    'Cleanliness':np.random.randint(1,5,100),
    'quality_care':np.random.randint(1,5,100),
    'satifaction':np.random.randint(1,10,100),
})

df.head()

Unnamed: 0,Wait_time,Staff_friendliness,Cleanliness,quality_care,satifaction
0,5,3,4,1,7
1,42,3,1,4,6
2,58,2,4,1,5
3,41,2,4,2,2
4,39,2,3,2,6


In [2]:
df.shape

(100, 5)

In [3]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split



In [4]:
X, y = df.drop('satifaction', axis=1), df['satifaction']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:

from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_jobs=-1, random_state=42)

try :
    model.fit(X_train, y_train)
except Exception as e:
    print(e)

In [20]:
from sklearn.metrics import mean_squared_error

y_pred = model.predict(X_test)


mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 7.534789999999999


In [21]:
# Creating our own evaluation metric (the competition uses RMSLE)
from sklearn.metrics import mean_squared_log_error, mean_absolute_error, r2_score, mean_squared_error

def rmsle(y_test, y_preds):
    """
    Calculates Root Mean Squared Log Error between prediction and
    true labels
    """
    return np.sqrt(mean_squared_log_error(y_test, y_preds))

# Create function to evaluate model on a few different levels
def show_scores(model):
    train_preds = model.predict(X_train)
    val_preds = model.predict(X_test)
    scores = {"Training MAE" : mean_absolute_error(y_train, train_preds),
             "Valid MAE" : mean_absolute_error(y_test, val_preds),
             "Training RMSLE" : rmsle(y_train, train_preds),
             "Valid RMSLE" : rmsle(y_test, val_preds),
             "Training MSE" : mean_squared_error(y_train, train_preds),
             "Valid MSE" : mean_squared_error(y_test, val_preds),
             "Training R^2" : r2_score(y_train, train_preds),
             "Valid R^2" : r2_score(y_test, val_preds)}
    return scores, train_preds, val_preds

In [22]:
report, train_preds, val_preds = show_scores(model)

print("Validation Report...")

print("MAE : ", report["Valid MAE"])
print("MSE : ", report["Valid MSE"])
print("RMSLE : ", report["Valid RMSLE"])
print("R-squared Error : ", report["Valid R^2"])

Validation Report...
MAE :  2.268
MSE :  7.534789999999999
RMSLE :  0.5409104530634885
R-squared Error :  0.09870933014354066


In [24]:
def show_ideal_metrics(df):
    std_dev = df['satifaction'].std()
    half_std_dev = std_dev / 2
    print(f"Ideal MAE <= {half_std_dev}")

    variance = df['satifaction'].var()
    print(f"Ideal MSE <= {variance}")

    print("Ideal RMSLE < 0.1")
    print("Ideal R2_E > 0.5")
show_ideal_metrics(df)

Ideal MAE <= 1.3062012158016147
Ideal MSE <= 6.824646464646466
Ideal RMSLE < 0.1
Ideal R2_E > 0.5


In [28]:

from sklearn.model_selection import RandomizedSearchCV

rf_grid = {
    'n_estimators':np.arange(10, 100, 10),
    'max_depth':[None, 3, 5, 10],
    'min_samples_split':np.arange(2, 20, 2),
    'min_samples_leaf':np.arange(1, 20, 2),
    'max_features':[0.5, 1, "sqrt"],
    'max_samples':[50]
}

In [29]:
# Instantiate RandomizedSearchCV model
rs_model = RandomizedSearchCV(
    RandomForestRegressor(n_jobs=-1,
                         random_state=42),
    param_distributions=rf_grid,
    n_iter=50,
    cv=5,
    verbose=True)

In [30]:
%%time
# Fit the rs model
rs_model.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
CPU times: user 26.8 s, sys: 2.25 s, total: 29 s
Wall time: 32.1 s


In [31]:
# Find the best parameters
rs_model.best_params_

{'n_estimators': 20,
 'min_samples_split': 10,
 'min_samples_leaf': 1,
 'max_samples': 50,
 'max_features': 'sqrt',
 'max_depth': 5}

In [32]:
# Evaluating the RandomizedSearch model
scores = show_scores(rs_model)[0]
scores

{'Training MAE': 1.783638581265226,
 'Valid MAE': 2.2418786144411142,
 'Training RMSLE': 0.411562893332865,
 'Valid RMSLE': 0.5313858520019625,
 'Training MSE': 4.5179397504002194,
 'Valid MSE': 6.8481070175302445,
 'Training R^2': 0.28844339003441766,
 'Valid R^2': 0.18084844287915725}

In [33]:
import pickle

In [34]:
filename = 'trained_model.sav'
pickle.dump(rs_model, open(filename, 'wb'))

In [35]:
from google.colab import files
files.download('trained_model.sav')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>