In [None]:
#################### usage data simulation

In [None]:
import pandas as pd
import numpy as np
from datetime import timedelta
from scipy.stats import zscore

# Load the Excel file
def load_contract_data(file_path):
    df_contracts = pd.read_excel(file_path)
    df_contracts["contract_start_date"] = pd.to_datetime(df_contracts["contract_start_date"])
    df_contracts["contract_end_date"] = pd.to_datetime(df_contracts["contract_end_date"])
    
    # Define contract_status based on ischr, isup, and isrenew
    conditions = [
        (df_contracts["ischr"] == 1),
        (df_contracts["isup"] == 1)
    ]
    choices = ["churned", "upsell"]
    df_contracts["contract_status"] = np.select(conditions, choices, default="unknown")
    
    return df_contracts

# Function to generate product usage data
def generate_usage_data(df_contracts):
    expanded_rows = []
    
    for _, row in df_contracts.iterrows():
        total_user = np.ceil(row["clv_contract"] / 4000)  # Roundup based on CLV
        prev_feature_adoption = 1  # Start with at least 1 feature adopted
        prev_active_user, prev_session_duration, prev_usage_frequency = None, None, None
        
        contract_usage_data = []

        for month in range(1, 13):
            FOM = row["contract_start_date"].replace(day=1) + pd.DateOffset(months=month - 1)  # First of the month
            MOC = month  # Month of contract
            POC = MOC / 12  # Percentage of contract duration

            active_user = np.random.randint(1, total_user)  # Active users (less than total users)
            session_duration = np.random.randint(60, 60 * 8)  # Session duration (60 sec to 8 hours)
            
            # Incremental feature adoption: stays the same or increases
            feature_adoption = np.random.randint(prev_feature_adoption, 13)  
            prev_feature_adoption = feature_adoption  # Update for next month

            usage_frequency = np.random.randint(1, 13)  # Usage frequency (1 to 12 sessions per month)
            usage_recency = FOM + timedelta(days=np.random.randint(0, 28))  # Random date within the month
            usage_recency_d = (usage_recency - FOM).days  # Calculate recency difference in days

            # Calculate month-over-month changes
            mom_active_user = active_user - prev_active_user if prev_active_user is not None else 0
            mom_session_duration = session_duration - prev_session_duration if prev_session_duration is not None else 0
            mom_feature_adoption = feature_adoption - prev_feature_adoption if prev_feature_adoption is not None else 0
            mom_usage_frequency = usage_frequency - prev_usage_frequency if prev_usage_frequency is not None else 0
            
            # Update previous values
            prev_active_user, prev_session_duration, prev_feature_adoption, prev_usage_frequency = (
                active_user, session_duration, feature_adoption, usage_frequency
            )

            contract_usage_data.append([
                row["account_id"], row["contract_id"], FOM, MOC, POC, total_user,
                active_user, session_duration, feature_adoption,
                usage_frequency, usage_recency, usage_recency_d,
                mom_active_user, mom_session_duration, mom_feature_adoption, mom_usage_frequency,
                row["contract_end_date"], row["contract_status"]
            ])

        # Convert to dataframe
        df_contract = pd.DataFrame(contr2
                                   act_usage_data, columns=[
            "account_id", "contract_id", "FOM", "MOC", "POC", "total_user",
            "active_user", "session_duration", "feature_adoption",
            "usage_frequency", "usage_recency", "usage_recency_d",
            "mom_active_user", "mom_session_duration", "mom_feature_adoption", "mom_usage_frequency",
            "contract_end_date", "contract_status"
        ])

        expanded_rows.extend(df_contract.values.tolist())
    
    # Create the final dataframe
    columns = df_contract.columns.tolist()
    df_expanded = pd.DataFrame(expanded_rows, columns=columns)
    
    return df_expanded

 
# Example usage
file_path =  "data/0_sim_contract.xlsx"
df_contracts = load_contract_data(file_path)
final_features_df = generate_usage_data(df_contracts)
final_features_df['pre_active_users'] = final_features_df['active_user'] / final_features_df['total_user']
final_features_df['FOM'] = pd.to_datetime(final_features_df['FOM'], errors='coerce')
final_features_df['contract_year'] = final_features_df['FOM'].dt.year

In [386]:
final_features_df.to_csv("data/0_sim_usage_monthly_features.csv", index=False)

In [332]:
final_features_df = pd.read_csv("data/0_sim_usage_monthly_features.csv")

In [384]:
final_features_df.shape

(17436, 20)

In [358]:
final_features_df.columns

Index(['account_id', 'contract_id', 'FOM', 'MOC', 'POC', 'total_user',
       'active_user', 'session_duration', 'feature_adoption',
       'usage_frequency', 'usage_recency', 'usage_recency_d',
       'mom_active_user', 'mom_session_duration', 'mom_feature_adoption',
       'mom_usage_frequency', 'contract_end_date', 'contract_status',
       'pre_active_users', 'contract_year'],
      dtype='object')

In [234]:
######################## calculate features

In [360]:
data=final_features_df.copy()

In [390]:
features=['active_user', 'session_duration', 'feature_adoption', 'usage_frequency', 'usage_recency_d', 'pre_active_users']
# Filter data for ischr == True
data_chr = data[data['contract_status'] == 'churned']
stats_chr = data_chr.groupby(['contract_year'])[features].agg(['min', 'mean', 'median', 'max', 'std']).reset_index()

# Filter data for has_next_contract == True
data_renew = data[data['contract_status'] == 'upsell']
stats_renew = data_renew.groupby(['contract_year'])[features].agg(['min', 'mean', 'median', 'max', 'std']).reset_index()

# Initialize an empty dictionary to store results
stats_dict = {}

# Loop through both stats for 'ischr' and 'has_next_contract' and add to the dictionary
for stat_df, key in zip([stats_chr, stats_renew], ['ischr', 'isup']):
    # Loop through features and stats
    for feature in features:
        for stat in ['min', 'mean', 'median', 'max', 'std']:
            # Create a key combining feature, stat, contract_year, and 'ischr' or 'has_next_contract'
            for contract_year in stat_df['contract_year']:
                # Add each result to the dictionary
                stats_dict[(feature, stat, contract_year, key)] = stat_df.loc[stat_df['contract_year'] == contract_year, (feature, stat)].values[0]

In [392]:
#stats_dict
df = pd.DataFrame(
    [(feature, stat, year, category, value) for (feature, stat, year, category), value in stats_dict.items()],
    columns=['Feature', 'Statistic', 'Year', 'Category', 'Value']
)
df.to_csv("data/0_sim_usage_yearly_fuzzy.csv", index=False)

In [394]:
# Iterate through each feature in ff
for f in features:
    # Iterate through each row in data
    for idx, row in data.iterrows():
        year = 0
        label = ''
        
        # Determine the contract year, later change this
               
        if row['contract_year'] == data['contract_year'].min():
            year = row['contract_year']
        else:
            year = row['contract_year'] - 1

        # Get the median values from stats_dict for 'ischr' and 'has_next_contract'
        median_ischr = stats_dict.get((f, 'median', year, 'ischr'), None)
        median_renew = stats_dict.get((f, 'median', year, 'isup'), None)
        # Check if the median values exist, otherwise assign 'normal'
        if median_ischr is not None and row[f] < median_ischr:
            label = 'risky'
        elif median_renew is not None and row[f] > median_renew:
            label = 'healthy'
        elif row[f] >= median_ischr and row[f]<= median_renew:
            label = 'normal'
        else:
            label = 'noun'

        # Assign the label to the row (new column with label for that feature)
        data.at[idx, f + '_label'] = label
        data.at[idx, f + '_median_ischr'] = median_ischr
        data.at[idx, f + '_median_isup'] = median_renew

In [395]:
data.shape

(17436, 44)

In [396]:
data.to_csv("data/0_sim_usage_monthly_features_agg.csv", index=False)

In [400]:
data.columns

Index(['account_id', 'contract_id', 'FOM', 'MOC', 'POC', 'total_user',
       'active_user', 'session_duration', 'feature_adoption',
       'usage_frequency', 'usage_recency', 'usage_recency_d',
       'mom_active_user', 'mom_session_duration', 'mom_feature_adoption',
       'mom_usage_frequency', 'contract_end_date', 'contract_status',
       'pre_active_users', 'contract_year', 'active_user_label',
       'active_user_median_ischr', 'active_user_median_renew',
       'session_duration_label', 'session_duration_median_ischr',
       'session_duration_median_renew', 'feature_adoption_label',
       'feature_adoption_median_ischr', 'feature_adoption_median_renew',
       'usage_frequency_label', 'usage_frequency_median_ischr',
       'usage_frequency_median_renew', 'usage_recency_d_label',
       'usage_recency_d_median_ischr', 'usage_recency_d_median_renew',
       'pre_active_users_label', 'pre_active_users_median_ischr',
       'pre_active_users_median_renew', 'active_user_median_isu

In [428]:
a=[]
for i in  data.columns.tolist():
    if ('label' in i ):
        a.append(i)
aa=a+['ischr','isup']

In [408]:
data['ischr'] = (data['contract_status'] == 'churned').astype(int)

In [410]:
# Define a mapping function to convert categorical values into -1, 0, 1
def map_categories_to_numeric(value):
    if value == 'risky':  # Risky category mapped to -1
        return -1
    elif value == 'healthy':  # Healthy category mapped to 1
        return 1
    else:  # Neutral or normal categories mapped to 0
        return 0

# Apply the mapping function to the categorical columns
categorical_columns = data.select_dtypes(include=['object']).columns
for col in categorical_columns:
    data[col] = data[col].apply(map_categories_to_numeric)

weights = {}
for col in a:
    # Calculate the correlation of each feature with the target variable 'ischr'
    correlation = data[col].corr(data['ischr'])
    weights[col] = correlation

# Create a DataFrame to display the feature weights
weights_df = pd.DataFrame(list(weights.items()), columns=['Feature', 'Weight'])
weights_df

wl=[]
for i in weights_df['Weight'].tolist():
    #wl.append(i/sum(l))
    wl.append(1/len(a))
data['score']=0
for i,j in zip(a,wl):
    data['score']=data['score']+data[i]*j

data['usage_score_label'] = data['score'].apply(lambda x: 'healthy' if x > 0 else ('normal' if x == 0 else 'risky'))

  c /= stddev[:, None]
  c /= stddev[None, :]


In [424]:
data['usage_score_label'].value_counts()

usage_score_label
risky      8817
healthy    7039
normal     1580
Name: count, dtype: int64

In [426]:
data.to_csv('data/1_score_usage.csv', index=False)

In [None]:
Possible Scenarios
Churn Z-score	Upsell Z-score	Interpretation
High (+)	High (+)	Strong user, likely to expand
High (+)	Low (-)	Unlikely to churn, but not an upsell candidate
Low (-)	High (+)	At risk of churn, but usage pattern suggests upsell potential
Low (-)	Low (-)	High churn risk, unlikely to upsell
