# Growth Model Helper Functions

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pmdarima.arima import auto_arima
from statsmodels.tsa.arima.model import ARIMA
from sklearn import linear_model
from sklearn.metrics import mean_absolute_percentage_error
import mlflow

In [2]:
def add_future_rows(df, num_rows):
    """Append ``num_rows`` future monthly placeholder rows for every account.

    For each distinct ``account_name`` the row with the latest ``date`` is
    used as a template. Each generated row copies that template, moves
    ``date`` forward by 1..``num_rows`` months, continues ``tenure`` from the
    account's maximum tenure, and sets ``value`` to NaN.

    Args:
        df: DataFrame with at least the columns ``account_name``, ``date``,
            ``tenure`` and ``value``. It is not modified.
        num_rows: Number of future monthly rows to add per account.

    Returns:
        A new DataFrame holding the original and the generated rows, with
        ``date`` converted to datetime, sorted by ``account_name`` then
        ``date``, and with a fresh 0..n-1 index.
    """
    # Work on a copy so the caller's frame keeps its original 'date' column.
    df = df.copy()
    df['date'] = pd.to_datetime(df['date'])

    # Order by date once so the last row of each group is the most recent
    # month even when the input is unsorted. The stable sort keeps input order
    # for ties, and NaT dates go first so they are never used as the template.
    by_date = df.sort_values('date', kind='stable', na_position='first')

    new_rows = []
    for _, account_rows in by_date.groupby('account_name'):
        last_row = account_rows.iloc[-1]
        max_tenure = account_rows['tenure'].max()

        for step in range(1, num_rows + 1):
            new_row = last_row.copy()
            new_row['date'] = last_row['date'] + pd.DateOffset(months=step)
            new_row['tenure'] = max_tenure + step
            new_row['value'] = np.nan  # unknown until forecast
            new_rows.append(new_row)

    # Skip the concat when nothing was generated (num_rows <= 0 or empty df).
    if new_rows:
        df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)

    df.sort_values(['account_name', 'date'], inplace=True)
    df.reset_index(drop=True, inplace=True)

    return df

In [10]:
def symmetric_mean_absolute_percentage_error(actual, forecast):
    """Return the symmetric MAPE between ``actual`` and ``forecast``.

    Each term is ``|actual - forecast| / ((|actual| + |forecast|) / 2)``.
    A term where both values are zero is counted as zero error instead of
    0/0, so one perfect zero forecast no longer turns the whole mean into NaN.

    Args:
        actual: Array-like of observed values.
        forecast: Array-like of predicted values, broadcastable to ``actual``.

    Returns:
        The mean of the per-element terms, or 0 when the input is empty or
        the mean is NaN (for example when an input contains NaN).
    """
    actual = np.asarray(actual, dtype=float)
    forecast = np.asarray(forecast, dtype=float)

    abs_error = np.abs(actual - forecast)
    scale = (np.abs(actual) + np.abs(forecast)) / 2

    if abs_error.size == 0:
        return 0

    # Divide only where the denominator is non-zero; the remaining positions
    # keep the 0.0 from ``out`` (actual == forecast == 0 means no error).
    ratios = np.divide(abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0)

    result = np.mean(ratios)
    return 0 if np.isnan(result) else result

In [3]:
def forecast(g, train_start_date, train_end_date, test_start_date, test_end_date, forecast_start_date, forecast_end_date, regressor_cols):
    """Fit an auto-ARIMA model on the training window of ``g`` and predict the rest.

    The model is trained on ``g['value']`` with ``g[regressor_cols]`` as
    exogenous regressors, then asked to predict every row dated between
    ``test_start_date`` and ``forecast_end_date``. Per-row MAPE and SMAPE are
    filled in for at most the first three predicted rows; later rows keep 0.

    Args:
        g: DataFrame whose index has a level named ``date`` and which has a
            ``value`` column plus the columns named in ``regressor_cols``.
        train_start_date: Inclusive lower bound of the training window.
        train_end_date: Inclusive upper bound of the training window.
        test_start_date: Inclusive lower bound of the prediction window.
        test_end_date: Currently unused.
        forecast_start_date: Currently unused.
        forecast_end_date: Inclusive upper bound of the prediction window.
        regressor_cols: Column names passed to the model as exogenous data.

    Returns:
        ``g`` concatenated column-wise with ``validated_value``,
        ``MAPE_monthly`` and ``SMAPE_monthly`` (aligned on the index of the
        prediction window).
    """
    dates = g.index.get_level_values('date')

    # NOTE(review): the prediction window runs to forecast_end_date, not
    # test_end_date, so it presumably covers both the validation months and
    # the future months — confirm this is intended.
    train_data = g.loc[(dates >= train_start_date) & (dates <= train_end_date)]
    test_data = g.loc[(dates >= test_start_date) & (dates <= forecast_end_date)]

    # Stepwise, non-seasonal order search.
    model = auto_arima(train_data['value'], exogenous=train_data[regressor_cols],
                       seasonal=False, trace=False,
                       suppress_warnings=True, error_action="ignore", stepwise=True,
                       start_p=0, start_q=0, max_order=None, method='nm',
                       maxiter=100, n_jobs=-1)

    # Fit the selected model on the training dataset only.
    # NOTE(review): auto_arima may already return a fitted estimator, in which
    # case this refit is redundant — confirm before removing.
    model.fit(train_data['value'], exogenous=train_data[regressor_cols])

    # Predict one value per row of the prediction window.
    valid = list(model.predict(n_periods=len(test_data), exogenous=test_data[regressor_cols]))

    monthly_mape = pd.DataFrame({'MAPE': np.zeros(len(valid))}, index=test_data.index)
    monthly_smape = pd.DataFrame({'SMAPE': np.zeros(len(valid))}, index=test_data.index)

    # Only the first three rows are scored. The bound is clamped so a
    # prediction window shorter than three rows no longer raises IndexError.
    n_scored = min(3, len(valid))
    for i in range(n_scored):
        # NOTE(review): the actuals are read from column position 1 of g —
        # presumably the 'value' column; verify against the caller's layout.
        actual_i = test_data.iloc[i:i+1, 1]
        predicted_i = valid[i:i+1]
        monthly_mape.iloc[i] = mean_absolute_percentage_error(actual_i, predicted_i)
        monthly_smape.iloc[i] = symmetric_mean_absolute_percentage_error(actual_i, predicted_i)

    # Collect predictions and error metrics on the prediction-window index.
    result = pd.DataFrame({'validated_value': valid,
                           'MAPE_monthly': monthly_mape['MAPE'],
                           'SMAPE_monthly': monthly_smape['SMAPE']},
                          index=test_data.index)

    # Attach them to the original group; rows outside the window get NaN.
    result = pd.concat([g, result], axis=1)

    return result

In [8]:
def log_mlflow(cs_type, experiment, mlflow_run_name, median_mape_df):
    """Record one MLflow run with the model parameters and three MAPE metrics.

    Args:
        cs_type: Value logged under the ``cloud_service_type`` parameter.
        experiment: Object whose ``experiment_id`` selects the experiment.
        mlflow_run_name: Name given to the new run.
        median_mape_df: Indexable with 0, 1 and 2; those entries are logged
            as ``MAPE_month1`` .. ``MAPE_month3``.
    """
    metric_names = ('MAPE_month1', 'MAPE_month2', 'MAPE_month3')

    run = mlflow.start_run(experiment_id=experiment.experiment_id, run_name=mlflow_run_name)
    with run:
        mlflow.log_params({
            'cloud_service_type': cs_type,
            'model': 'ARIMA Regressor',
        })

        # Position k of median_mape_df feeds the k-th metric name.
        mlflow.log_metrics(
            {name: median_mape_df[position] for position, name in enumerate(metric_names)}
        )

In [17]:
def plot_time_series(df, account_name, valid_start='2015-01-01', valid_end='2015-03-01',
                     pred_start='2015-04-01', pred_end='2015-09-01'):
    """Plot actual, validation and forecast series for one account.

    Args:
        df: DataFrame with the columns ``account_name``, ``month``,
            ``actual_mnth_value$`` and ``predicted_mnth_value$``.
        account_name: Account whose rows are plotted.
        valid_start: Inclusive start of the validation window.
        valid_end: Inclusive end of the validation window.
        pred_start: Inclusive start of the forecast window.
        pred_end: Inclusive end of the forecast window.

    The window bounds default to the dates that were previously hard-coded,
    so existing two-argument calls produce the same figure.
    """
    filtered_data = df[df['account_name'] == account_name]
    months = filtered_data['month']

    valid_data = filtered_data[(months >= valid_start) & (months <= valid_end)]
    pred_data = filtered_data[(months >= pred_start) & (months <= pred_end)]

    # Actuals over the account's full history; predictions split by window.
    plt.figure(figsize=(16, 8))
    plt.plot(filtered_data['month'], filtered_data['actual_mnth_value$'], marker='o', label='actual_mnth_value$')
    plt.plot(valid_data['month'], valid_data['predicted_mnth_value$'], marker='o', label='valid_mnth_value$')
    plt.plot(pred_data['month'], pred_data['predicted_mnth_value$'], marker='o', label='predicted_mnth_value$')

    plt.xlabel('Date', fontsize=12)
    plt.ylabel('Revenue', fontsize=12)
    plt.title(f'Time Series Analysis for Customer: {account_name}')
    plt.legend()
    plt.xticks(rotation=45)
    plt.grid(True)
    plt.show()

In [1]:
def find_slope_and_median(x):
    """Return the linear-trend slope and the median of one prediction curve.

    A least-squares line is fitted to ``predicted_mnth_value$`` against
    ``date_ordinal``.

    Args:
        x: DataFrame with the columns ``date_ordinal`` and
            ``predicted_mnth_value$``.

    Returns:
        A Series indexed by ``slope`` and ``median_predicted_value``.
    """
    from sklearn import linear_model

    predicted = x['predicted_mnth_value$']
    # The regressor expects a 2-D feature matrix, hence the single column.
    ordinals = x['date_ordinal'].values.reshape(-1, 1)

    line = linear_model.LinearRegression()
    line.fit(ordinals, predicted.values)

    summary = [line.coef_[0], predicted.median()]
    return pd.Series(summary, index=['slope', 'median_predicted_value'])

In [2]:
def create_cluster_and_label_based_on_centroids(df, n):
    """Cluster ``slope_scaled`` with k-means and name clusters by centroid rank.

    Clusters are ranked by ascending centroid value; ranks 0, 1 and 2 are
    named "low", "medium" and "high". The name mapping has no entry for any
    higher rank.

    Args:
        df: DataFrame with a ``slope_scaled`` column. A ``cluster_label``
            column is written onto it in place.
        n: Number of clusters to fit.

    Returns:
        The same ``df`` with the ``cluster_label`` column set.
    """
    from sklearn.cluster import KMeans

    rank_names = {0: "low", 1: "medium", 2: "high"}

    features = df[['slope_scaled']].values.reshape(-1, 1)
    kmeans_model = KMeans(n_clusters=n, init='k-means++', random_state=0)
    kmeans_model.fit(features)
    assigned = kmeans_model.labels_

    # ascending[r] is the cluster id whose centroid has rank r; invert that
    # permutation so each cluster id maps straight to its rank.
    ascending = np.argsort(kmeans_model.cluster_centers_[:, 0])
    rank_of_cluster = np.empty_like(ascending)
    rank_of_cluster[ascending] = np.arange(len(ascending))
    ranks = rank_of_cluster[assigned].astype(assigned.dtype)

    df['cluster_label'] = np.vectorize(rank_names.get)(ranks)
    return df

In [6]:
def list_account_with_labels(df, label):
    """Return the rows whose ``cluster_label`` equals ``label``.

    The rows are ordered by ``median_predicted_value`` from highest to lowest.
    """
    has_label = df['cluster_label'] == label
    matching = df[has_label]
    return matching.sort_values(by='median_predicted_value', ascending=False)