In [1]:
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="/home/ricardormotta/projects/LTV_analysis/ltv-ml-project/conf/local/gcp_token.json"


In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.calibration import calibration_curve 


from xgboost import XGBRegressor

from sklearn.model_selection import GridSearchCV


In [3]:
X_train = catalog.load("X_train")
X_test = catalog.load("X_test")
y_train = catalog.load("y_train_ltv")
y_test = catalog.load("y_test_ltv")
CT = catalog.load("CT")

In [4]:
for X in [X_train, X_test]:
    X.loc[:,"days_to_churn"]=0

In [5]:
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
def regression_metrics(true_values, predicted_values, model_name):
    metrics = {
        'Mean Absolute Error (MAE)': mean_absolute_error(true_values, predicted_values),
        'Mean Squared Error (MSE)': mean_squared_error(true_values, predicted_values),
        'Root Mean Squared Error (RMSE)': mean_squared_error(true_values, predicted_values, squared=False),
        'R-squared': r2_score(true_values, predicted_values)
    }

    # Create a Pandas DataFrame from the metrics
    metrics_df = pd.DataFrame.from_dict(metrics, orient='index', columns=[model_name])
    
    return metrics_df


In [6]:
models = [
    RandomForestRegressor(),
    XGBRegressor(),
    GradientBoostingRegressor()
]
metrics = []
for model in models:
    pipe = Pipeline([
        ("CT", CT),
        ("model", model)
    ])
    pipe.fit(X_train, y_train)
    metrics.append(
        regression_metrics(y_train.values, pipe.predict(X_train), type(pipe["model"]).__name__+"_train")
    )
    metrics.append(
        regression_metrics(y_test.values, pipe.predict(X_test), type(pipe["model"]).__name__+"_test")
    )

In [7]:
pd.concat(metrics, axis=1)

Unnamed: 0,RandomForestRegressor_train,RandomForestRegressor_test,XGBRegressor_train,XGBRegressor_test,GradientBoostingRegressor_train,GradientBoostingRegressor_test
Mean Absolute Error (MAE),13.18683,13.135322,13.18409,13.134286,13.237728,13.147227
Mean Squared Error (MSE),257.918846,256.092356,257.899241,256.066801,259.06608,255.871837
Root Mean Squared Error (RMSE),16.059852,16.002886,16.059242,16.002087,16.09553,15.995994
R-squared,0.071042,0.064884,0.071113,0.064977,0.06691,0.065689


In [8]:
xgb = XGBRegressor()
parameters = {
    "n_estimators": [250, 500, 1000],
    "max_depth": [4, 6, 8]
}
clf = GridSearchCV(xgb, parameters, n_jobs=-1, verbose=3)
pipe = Pipeline([
    ("CT", CT),
    ("model", clf)
])
pipe.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


In [9]:
regression_metrics(y_test.values, pipe.predict(X_test), type(pipe["model"]).__name__)

Unnamed: 0,GridSearchCV
Mean Absolute Error (MAE),13.129393
Mean Squared Error (MSE),255.787752
Root Mean Squared Error (RMSE),15.993366
R-squared,0.065996


In [10]:
pipe["model"].best_params_

[1m{[0m[32m'max_depth'[0m: [1;36m4[0m, [32m'n_estimators'[0m: [1;36m250[0m[1m}[0m

In [11]:
y_test.mean()


months_as_client    [1;36m22.383696[0m
dtype: float64

[CV 1/5] END .....max_depth=4, n_estimators=500;, score=0.060 total time=   3.1s
[CV 3/5] END .....max_depth=6, n_estimators=250;, score=0.064 total time=   2.4s
[CV 5/5] END .....max_depth=6, n_estimators=500;, score=0.071 total time=   3.5s
[CV 2/5] END .....max_depth=8, n_estimators=500;, score=0.064 total time=   2.3s
[CV 2/5] END .....max_depth=4, n_estimators=500;, score=0.065 total time=   3.3s
[CV 5/5] END .....max_depth=6, n_estimators=250;, score=0.071 total time=   2.4s
[CV 1/5] END ....max_depth=6, n_estimators=1000;, score=0.060 total time=   5.6s
[CV 2/5] END ....max_depth=4, n_estimators=1000;, score=0.065 total time=   6.2s
[CV 4/5] END ....max_depth=6, n_estimators=1000;, score=0.066 total time=   4.8s
[CV 4/5] END .....max_depth=4, n_estimators=250;, score=0.066 total time=   1.6s
[CV 3/5] END ....max_depth=4, n_estimators=1000;, score=0.065 total time=   6.0s
[CV 3/5] END .....max_depth=8, n_estimators=250;, score=0.064 total time=   1.9s
[CV 4/5] END .....max_depth=