# DATA DRIFT DETECTION

### use the Evidently python library to generate reports about data drift

In [50]:
import pandas as pd
import numpy as np
from scipy.stats import wasserstein_distance, energy_distance,  ks_2samp

In [51]:
df = pd.read_csv("preprocessed_project_data.csv")

### drop leakage columns, clean and convert the date column and gropby the date

In [52]:
# Drop leakage columns
leak_cols = ['trend_label_encoded', 'rel_combo', 'likes_per_day', 'share_rate_log', 'views', 'views_per_day', 'like_rate', 'comment_rate', 'share_rate', 'like_rate_log', 'comment_rate_log', 'rel_like', 'rel_share', 'avg_velocity_y', 'avg_er', 'richness_traffic_interaction', 'country_x', 'country_y', 'top_hashtag', 'top_hashtag', 'like_rate_zscore', 'comment_rate_zscore', 'share_rate_zscore', 'traffic_source_feed', 'traffic_source_hashtag', 'traffic_source_profile', 'traffic_source_search','traffic_source_share', 'platform_x', 'platform_y', 'creator_tier', 'title_len','text_richness', 'traffic_source_cat', 'device_brand_cat', 'creator_tier_cat', 'like_hashtag_interaction', 'share_hashtag_interaction' ]
df = df.drop(columns=[c for c in leak_cols if c in df.columns])

df = df.dropna(subset=['trend_label'])
df = df.reset_index(drop=True)

# Convert and clean date
df['year_month'] = pd.to_datetime(df['year_month'], format='%Y-%m', errors='coerce')
df = df.dropna(subset=['year_month'])

print(f"Data Shape: {df.shape}")
print("Unique Months:", sorted(df['year_month'].unique()))

Data Shape: (1367692, 39)
Unique Months: [Timestamp('2025-02-01 00:00:00'), Timestamp('2025-03-01 00:00:00'), Timestamp('2025-04-01 00:00:00'), Timestamp('2025-05-01 00:00:00'), Timestamp('2025-06-01 00:00:00'), Timestamp('2025-07-01 00:00:00'), Timestamp('2025-08-01 00:00:00')]


In [53]:
df.groupby('year_month')['trend_label'].value_counts(normalize=True)

year_month  trend_label
2025-02-01  declining      0.533136
            rising         0.466864
2025-03-01  rising         0.736996
            declining      0.263004
2025-04-01  declining      0.652303
            rising         0.347697
2025-05-01  rising         0.830620
            declining      0.169380
2025-06-01  declining      0.661892
            rising         0.338108
2025-07-01  rising         0.658298
            declining      0.341702
2025-08-01  rising         0.779818
            declining      0.220182
Name: proportion, dtype: float64

### Split the data chronologically into training and test data

In [54]:
# target = 'trend_label'
# months = sorted(df['year_month'].unique())
# results_all = []

# for test_month in months[2:]:  # start testing from March
#     train_months = [m for m in months if m < test_month]
#     df_train = df[df['year_month'].isin(train_months)]
#     df_test  = df[df['year_month'] == test_month]

#     X_train = df_train.drop(columns=[target])
#     y_train = df_train[target]
#     X_test  = df_test.drop(columns=[target])
#     y_test  = df_test[target]

#     if len(np.unique(y_train)) < 2 or len(np.unique(y_test)) < 2:
#         print(f"Skipping {test_month.strftime('%Y-%m')} — not enough class diversity")
#         continue

In [55]:
# df_train.head()

In [56]:
# # CHANGE THIS DEPENDING ON WHICH MONTHS YOU WANT TO USE IN THE DISTRIBUTIN -- OR ASK IF WE WANT A ROLLING WINDOW 
# train_months = months[:3]
# test_months  = months[3:]

# df_train = df[df['year_month'].isin(train_months)]
# df_test  = df[df['year_month'].isin(test_months)]

## Mean Analysis

In [57]:
df.head()

Unnamed: 0,trend_label,platform_cat,region_cat,language_cat,category_cat,year_month,n_videos,region_Americas,region_Asia,region_Europe,...,category_News,category_Sports,category_Tech,category_Travel,device_brand_google,device_brand_huawei,device_brand_oppo,device_brand_other,device_brand_samsung,device_brand_xiaomi
3,rising,-0.950645,-0.859757,-1.22854,-1.260441,2025-03-01,1.222861,True,False,False,...,False,False,False,False,False,True,False,False,False,False
4,rising,-0.950645,-0.859757,-1.22854,-1.260441,2025-08-01,1.30377,True,False,False,...,False,False,False,False,False,True,False,False,False,False
5,rising,-0.950645,-0.859757,-1.22854,-1.260441,2025-04-01,1.425135,True,False,False,...,False,False,False,False,False,True,False,False,False,False
6,rising,-0.950645,-0.859757,-1.22854,-1.260441,2025-04-01,-0.718976,True,False,False,...,False,False,False,False,False,True,False,False,False,False
7,rising,-0.950645,-0.859757,-1.22854,-1.260441,2025-06-01,-0.638066,True,False,False,...,False,False,False,False,False,True,False,False,False,False


In [58]:
df.columns

Index(['trend_label', 'platform_cat', 'region_cat', 'language_cat',
       'category_cat', 'year_month', 'n_videos', 'region_Americas',
       'region_Asia', 'region_Europe', 'region_MENA', 'region_Oceania',
       'language_de', 'language_en', 'language_es', 'language_fr',
       'language_hi', 'language_it', 'language_ja', 'language_ko',
       'language_pt', 'language_ru', 'language_tr', 'category_Comedy',
       'category_Education', 'category_Food', 'category_Gaming',
       'category_Lifestyle', 'category_Music', 'category_News',
       'category_Sports', 'category_Tech', 'category_Travel',
       'device_brand_google', 'device_brand_huawei', 'device_brand_oppo',
       'device_brand_other', 'device_brand_samsung', 'device_brand_xiaomi'],
      dtype='object')

In [59]:
# List of boolean feature columns
binary_cols = [
    'region_Americas', 'region_Asia', 'region_Europe', 'region_MENA', 'region_Oceania',
    'language_de', 'language_en', 'language_es', 'language_fr',
    'language_hi', 'language_it', 'language_ja', 'language_ko',
    'language_pt', 'language_ru', 'language_tr',
    'category_Comedy', 'category_Education', 'category_Food', 'category_Gaming',
    'category_Lifestyle', 'category_Music', 'category_News',
    'category_Sports', 'category_Tech', 'category_Travel',
    'device_brand_google', 'device_brand_huawei', 'device_brand_oppo',
    'device_brand_other', 'device_brand_samsung', 'device_brand_xiaomi'
]

# Convert TRUE/FALSE to 1/0
df[binary_cols] = df[binary_cols].astype(int)

# Convert trend_label (“rising”, “declining”) to binary
df['trend_label'] = df['trend_label'].map({'rising': 1, 'declining': 0})


In [60]:
df_monthly_mean = df.groupby('year_month').mean().T

df_monthly_mean

year_month,2025-02-01,2025-03-01,2025-04-01,2025-05-01,2025-06-01,2025-07-01,2025-08-01
trend_label,0.466864,0.736996,0.347697,0.83062,0.338108,0.658298,0.779818
platform_cat,0.135302,-0.119135,0.247545,-0.207319,0.256577,-0.045009,-0.159469
region_cat,0.002686,-0.001883,0.004702,-0.003467,0.004864,-0.000552,-0.002608
language_cat,0.001137,-0.001219,0.002177,-0.002035,0.00226,-0.000532,-0.001592
category_cat,-0.004913,0.005027,-0.009298,0.008472,-0.009651,0.002131,0.006602
n_videos,-0.475327,0.055361,-0.072821,0.29052,-0.206633,-0.249239,0.520044
region_Americas,0.287215,0.284289,0.288505,0.283275,0.288609,0.285141,0.283825
region_Asia,0.259098,0.264987,0.256501,0.267028,0.256292,0.263271,0.26592
region_Europe,0.225723,0.222742,0.227038,0.221709,0.227144,0.223611,0.22227
region_MENA,0.080667,0.081261,0.080404,0.081467,0.080383,0.081088,0.081355


it's very interesting that the means for all of the categorical variables are very similar to each other. This is most likely due to the data collection methods which we could not control. It does make it very difficult for us to use it to predict trend labels, but it could show that we do in fact see conceptual drift or target drift in this data set. 

## 2-sample Kolmogorov-Smirnov (KS) test

- for continuous variables 

our only continuous variable is n_videos so we can test on that variable using KS test but not on the others.

In [61]:
# train_months = months[:3]
# test_months  = months[3:]

# df_train = df[df['year_month'].isin(train_months)]
# df_test  = df[df['year_month'].isin(test_months)]

In [62]:
month_list = list(df['year_month'].unique())
d = {month: pd.DataFrame() for month in month_list}
for month in month_list:
    d[month] = df[df['year_month']==month]

In [63]:
from scipy.stats import ks_2samp
import numpy as np
import pandas as pd

# sort months
month_list = sorted(df['year_month'].unique())

# Create dictionary mapping each month to its data
d = {month: df[df['year_month'] == month].copy() for month in month_list}

# store KS results in dfs 
KS_stat  = pd.DataFrame(columns=['KS_stat', 'p_value', 'critical_region'], index=month_list)
KS_stat2 = pd.DataFrame(columns=['KS_stat', 'critical_region'], index=month_list)

# use first month as reference
reference_month = month_list[0]  # first month
reference = d[reference_month]

# only use n_videos in KS test because it is the only continuous var
ref_values = reference['n_videos']
n = len(ref_values)

# calculate ks scores for each month 
for date in month_list:
    
    current_values = d[date]['n_videos']
    m = len(current_values)

    # Critical region for KS
    crit_reg = 1.731 * np.sqrt((n + m) / (n * m))
    
    # Compute KS test
    stat, p_value = ks_2samp(ref_values, current_values)

    # Store full results
    KS_stat.loc[date] = [round(stat,4), round(p_value,4), round(crit_reg,4)]

    # Store only KS stat + critical region
    KS_stat2.loc[date] = [round(stat,4), round(crit_reg,4)]


In [64]:
KS_stat

Unnamed: 0,KS_stat,p_value,critical_region
2025-02-01,0.0,1.0,0.0067
2025-03-01,0.2986,0.0,0.0058
2025-04-01,0.2413,0.0,0.0062
2025-05-01,0.5254,0.0,0.0062
2025-06-01,0.2118,0.0,0.0062
2025-07-01,0.2126,0.0,0.0061
2025-08-01,0.5495,0.0,0.0061


In [65]:
KS_stat2

Unnamed: 0,KS_stat,critical_region
2025-02-01,0.0,0.0067
2025-03-01,0.2986,0.0058
2025-04-01,0.2413,0.0062
2025-05-01,0.5254,0.0062
2025-06-01,0.2118,0.0062
2025-07-01,0.2126,0.0061
2025-08-01,0.5495,0.0061


KS stat shows the maximum difference in the two distributions. This means that the distributions for n_videos are different across the months 

#### Now I want to compare the first three months to the other months 

In [66]:
from scipy.stats import ks_2samp
import numpy as np
import pandas as pd

# Ensure months are sorted
month_list = sorted(df['year_month'].unique())
# create dictionary of months like before
d = {month: df[df['year_month'] == month].copy() for month in month_list}

# create reference data
reference_months = month_list[:3]            # first 3 months
reference = pd.concat([d[m] for m in reference_months])

ref_values = reference['n_videos']
n = len(ref_values)

# create dfs to hold the KS stats 
KS_stat  = pd.DataFrame(columns=['KS_stat', 'p_value', 'critical_region'], index=month_list[3:])
KS_stat2 = pd.DataFrame(columns=['KS_stat', 'critical_region'], index=month_list[3:])

# run the KS test on the other months 
for date in month_list[3:]:   # months AFTER the first 3
    current_values = d[date]['n_videos']
    m = len(current_values)

    # Compute critical region
    crit_reg = 1.731 * np.sqrt((n + m) / (n * m))

    # Run KS test
    stat, p_value = ks_2samp(ref_values, current_values)

    KS_stat.loc[date] = [round(stat,4), round(p_value,4), round(crit_reg,4)]
    KS_stat2.loc[date] = [round(stat,4), round(crit_reg,4)]


In [67]:
KS_stat

Unnamed: 0,KS_stat,p_value,critical_region
2025-05-01,0.357,0.0,0.0047
2025-06-01,0.2033,0.0,0.0046
2025-07-01,0.2006,0.0,0.0044
2025-08-01,0.3812,0.0,0.0044


In [68]:
KS_stat2

Unnamed: 0,KS_stat,critical_region
2025-05-01,0.357,0.0047
2025-06-01,0.2033,0.0046
2025-07-01,0.2006,0.0044
2025-08-01,0.3812,0.0044


same as before, we can see the distributions are different on n_videos feature throughout the months

## First Wasserstein Distance 

## Cramér-von Mises (CM) distance (or Energy distan

## Population Stability Index (PSI) 

In [85]:
def calculate_psi(expected, actual, buckettype='bins', buckets=10, axis=0):
    '''Calculate the PSI (population stability index) across all variables
    Args:
       expected: numpy matrix of original values (Training)
       actual: numpy matrix of new values, same size as expected (Validation)
       buckettype: type of strategy for creating buckets, bins splits into even splits, quantiles splits into quantile buckets
       buckets: number of quantiles to use in bucketing variables
       axis: axis by which variables are defined, 0 for vertical, 1 for horizontal
    Returns:
       psi_values: ndarray of psi values for each variable
    Author:
       Matthew Burke
       github.com/mwburke
       worksofchart.com
    '''
    def psi(expected_array, actual_array, buckets):
        '''Calculate the PSI for a single variable
        Args:
           expected_array: numpy array of original values
           actual_array: numpy array of new values, same size as expected
           buckets: number of percentile ranges to bucket the values into
        Returns:
           psi_value: calculated PSI value
        '''

        def scale_range (input, min, max):
            input += -(np.min(input))
            input /= np.max(input) / (max - min)
            input += min
            return input

        breakpoints = np.arange(0, buckets + 1) / (buckets) * 100

        if buckettype == 'bins':
            breakpoints = scale_range(breakpoints, np.min(expected_array), np.max(expected_array))
        elif buckettype == 'quantiles':
            breakpoints = np.stack([np.percentile(expected_array, b) for b in breakpoints])



        expected_percents = np.histogram(expected_array, breakpoints)[0] / len(expected_array)
        actual_percents = np.histogram(actual_array, breakpoints)[0] / len(actual_array)

        def sub_psi(e_perc, a_perc):
            '''Calculate the actual PSI value from comparing the values.
               Update the actual value to a very small number if equal to zero
            '''
            if a_perc == 0:
                a_perc = 0.0001
            if e_perc == 0:
                e_perc = 0.0001

            value = (e_perc - a_perc) * np.log(e_perc / a_perc)
            return(value)

        psi_value = np.sum(sub_psi(expected_percents[i], actual_percents[i]) for i in range(0, len(expected_percents)))

        return(psi_value)

    if len(expected.shape) == 1:
        psi_values = np.empty(len(expected.shape))
    else:
        psi_values = np.empty(expected.shape[axis])

    for i in range(0, len(psi_values)):
        if len(psi_values) == 1:
            psi_values = psi(expected, actual, buckets)
        elif axis == 0:
            psi_values[i] = psi(expected[:,i], actual[:,i], buckets)
        elif axis == 1:
            psi_values[i] = psi(expected[i,:], actual[i,:], buckets)

    return(psi_values)


In [100]:
feature_cols = [
    'trend_label'
    # 'n_videos', 'platform_cat', 'region_cat', 'language_cat', 'category_cat',
    # 'region_Americas', 'region_Asia', 'region_Europe', 'region_MENA', 'region_Oceania',
    # 'language_de', 'language_en', 'language_es', 'language_fr',
    # 'language_hi', 'language_it', 'language_ja', 'language_ko',
    # 'language_pt', 'language_ru', 'language_tr',
    # 'category_Comedy', 'category_Education', 'category_Food', 'category_Gaming',
    # 'category_Lifestyle', 'category_Music', 'category_News',
    # 'category_Sports', 'category_Tech', 'category_Travel',
    # 'device_brand_google', 'device_brand_huawei', 'device_brand_oppo',
    # 'device_brand_other', 'device_brand_samsung', 'device_brand_xiaomi'
]

In [101]:
month_list = sorted(df['year_month'].unique())

reference_months = month_list[:3]
reference_df = df[df['year_month'].isin(reference_months)]

In [102]:
psi_results = pd.DataFrame(index=month_list[3:], columns=feature_cols)

for month in month_list[3:]:  # months AFTER first 3
    actual_df = df[df['year_month'] == month]

    for feature in feature_cols:
        psi_value = calculate_psi(reference_df[feature], actual_df[feature])
        psi_results.loc[month, feature] = psi_value

  psi_value = np.sum(sub_psi(expected_percents[i], actual_percents[i]) for i in range(0, len(expected_percents)))


In [103]:
psi_results.T

Unnamed: 0,2025-05-01,2025-06-01,2025-07-01,2025-08-01
trend_label,0.375976,0.194592,0.044442,0.233557


In [99]:
results = psi_results.T.sum()

results.to_frame().T

Unnamed: 0,2025-05-01,2025-06-01,2025-07-01,2025-08-01
0,2.599322,0.721659,0.76323,1.670747


May and August have the most drift according to PSI

In [28]:
# run it again without trend label 

In [38]:
df = df.drop('trend_label', axis=1)

KeyError: "['trend_label'] not found in axis"

In [39]:
df

Unnamed: 0,platform_cat,region_cat,language_cat,category_cat,year_month,n_videos,region_Americas,region_Asia,region_Europe,region_MENA,...,category_News,category_Sports,category_Tech,category_Travel,device_brand_google,device_brand_huawei,device_brand_oppo,device_brand_other,device_brand_samsung,device_brand_xiaomi
3,-0.950645,-0.859757,-1.228540,-1.260441,2025-03-01,1.222861,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,-0.950645,-0.859757,-1.228540,-1.260441,2025-08-01,1.303770,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
5,-0.950645,-0.859757,-1.228540,-1.260441,2025-04-01,1.425135,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
6,-0.950645,-0.859757,-1.228540,-1.260441,2025-04-01,-0.718976,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
7,-0.950645,-0.859757,-1.228540,-1.260441,2025-06-01,-0.638066,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1400088,1.051917,-0.098694,0.332943,0.625944,2025-04-01,-1.325800,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0
1400089,1.051917,-0.098694,0.332943,0.625944,2025-07-01,-1.770804,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0
1400090,1.051917,-0.098694,0.332943,0.625944,2025-02-01,-0.880796,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0
1400091,1.051917,-0.098694,0.332943,0.625944,2025-04-01,-0.921251,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0


In [43]:
month_list = sorted(df['year_month'].unique())

reference_months = month_list[:3]
reference_df = df[df['year_month'].isin(reference_months)]

actual_df = df[~df['year_month'].isin(reference_months)]

In [44]:
columns = df.columns

In [47]:
numeric_cols = df[columns].select_dtypes(include=['number']).columns
psi_results = pd.DataFrame(index=month_list[3:], columns=numeric_cols)

for month in month_list[3:]:
    actual_df = df[df['year_month'] == month]
    for col in numeric_cols:
        psi_value = calculate_psi(reference_df[col], actual_df[col])
        psi_results.loc[month, col] = psi_value

  psi_value = np.sum(sub_psi(expected_percents[i], actual_percents[i]) for i in range(0, len(expected_percents)))


In [49]:
psi_results.T

Unnamed: 0,2025-05-01,2025-06-01,2025-07-01,2025-08-01
platform_cat,0.068488,0.042451,0.009369,0.045172
region_cat,0.000272,0.000171,3.8e-05,0.000181
language_cat,0.000204,0.000129,2.9e-05,0.000136
category_cat,0.003487,0.002298,0.000495,0.002329
n_videos,2.522398,0.673693,0.752667,1.619945
region_Americas,4.4e-05,2.7e-05,6e-06,2.9e-05
region_Asia,0.000185,0.000117,2.6e-05,0.000123
region_Europe,5.3e-05,3.3e-05,7e-06,3.5e-05
region_MENA,5e-06,3e-06,1e-06,3e-06
region_Oceania,4.7e-05,2.9e-05,7e-06,3.1e-05
