## **HOMEWORK 2**

1. Which columns have drift?
2. Approximately when does the column drift? 
3. How many times does the column drift?

In [1]:
import pandas as pd

In [7]:
df = pd.read_csv("Users/shruti/Downloads/synthetic_data.csv")
df.head()

Unnamed: 0,datetime,num_feature_1,num_feature_2,num_feature_3,num_feature_4,num_feature_5,num_feature_6,num_feature_7,num_feature_8,num_feature_9,num_feature_10,cat_feature_1,cat_feature_2,cat_feature_3
0,1/1/23 0:00,0.496714,0.029431,-0.567379,1.316345,0.981826,0.245528,-0.295666,0.587933,-1.207989,-0.362931,A,X,C
1,1/1/23 0:10,-0.138264,0.516644,-0.362385,-2.908661,0.999686,0.558558,0.383567,-0.17922,-1.071735,-0.673647,B,Z,C
2,1/1/23 0:20,0.647689,-0.981326,0.099534,-1.22275,0.040974,0.590688,1.039632,0.962875,0.379122,-1.458926,C,X,A
3,1/1/23 0:30,1.52303,-1.404817,0.21803,-0.915632,0.208394,-1.408984,-1.036513,-1.255305,1.052502,-2.704705,C,Z,B
4,1/1/23 0:40,-0.234153,0.107208,0.434959,-0.819715,-1.429766,0.824087,1.01091,-0.535573,1.037141,-1.826844,C,X,B


In [8]:
import pandas as pd
from scipy.stats import ks_2samp, chi2_contingency
import matplotlib.pyplot as plt

# Assuming the dataset is loaded into a pandas DataFrame called df
# Convert 'datetime' column to datetime format
df['datetime'] = pd.to_datetime(df['datetime'])

# Set the 'datetime' column as the index for easier time slicing
df.set_index('datetime', inplace=True)

# Create the reference set (first 7 days of the dataset)
reference_set = df.loc['2023-01-01':'2023-01-07']

# Define the window size and the step size (1 day shift)
window_size = 7
step_size = 1

# Function to check drift for numerical columns using the KS test
def check_numerical_drift(reference_set, df, window_size, step_size):
    drift_results = {}
    
    numerical_columns = [col for col in df.columns if 'num_feature' in col]
    
    # Iterate through the numerical columns
    for col in numerical_columns:
        drift_results[col] = []
        
        # Iterate through the test sets
        for start_day in range(0, len(df) - window_size, step_size):
            end_day = start_day + window_size
            test_set = df.iloc[start_day:end_day]
            
            # Perform KS test on the reference and test set for each numerical column
            ks_stat, p_value = ks_2samp(reference_set[col], test_set[col])
            
            # If the p-value is less than 0.05, there is significant drift
            if p_value < 0.05:
                drift_results[col].append((df.index[start_day], p_value))
    
    return drift_results

# Function to check drift for categorical columns using the Chi-square test
def check_categorical_drift(reference_set, df, window_size, step_size):
    drift_results = {}
    
    categorical_columns = [col for col in df.columns if 'cat_feature' in col]
    
    # Iterate through the categorical columns
    for col in categorical_columns:
        drift_results[col] = []
        
        # Iterate through the test sets
        for start_day in range(0, len(df) - window_size, step_size):
            end_day = start_day + window_size
            test_set = df.iloc[start_day:end_day]
            
            # Create contingency table for chi-square test
            contingency_table = pd.crosstab(reference_set[col], test_set[col])

            # Skip if the contingency table is empty (no data for this category in either set)
            if contingency_table.size == 0:
                print(f"Warning: Empty contingency table for {col} between reference and test set, skipping test.")
                continue

            # Perform Chi-Square test on the contingency table
            chi2_stat, p_value, dof, expected = chi2_contingency(contingency_table)
            
            # If the p-value is less than 0.05, there is significant drift
            if p_value < 0.05:
                drift_results[col].append((df.index[start_day], p_value))
    
    return drift_results


# Check for drift in numerical columns
numerical_drift = check_numerical_drift(reference_set, df, window_size, step_size)

# Check for drift in categorical columns
categorical_drift = check_categorical_drift(reference_set, df, window_size, step_size)

# Print results for numerical drift
print("Numerical Drift Results:")
for col, drift_info in numerical_drift.items():
    print(f"\nColumn: {col}")
    if drift_info:
        for drift in drift_info:
            print(f"Drift detected on {drift[0]} with p-value: {drift[1]:.4f}")
    else:
        print("No drift detected.")

# Print results for categorical drift
print("\nCategorical Drift Results:")
for col, drift_info in categorical_drift.items():
    print(f"\nColumn: {col}")
    if drift_info:
        for drift in drift_info:
            print(f"Drift detected on {drift[0]} with p-value: {drift[1]:.4f}")
    else:
        print("No drift detected.")


  df['datetime'] = pd.to_datetime(df['datetime'])


Numerical Drift Results:

Column: num_feature_1
Drift detected on 2023-01-01 01:40:00 with p-value: 0.0242
Drift detected on 2023-01-01 02:10:00 with p-value: 0.0103
Drift detected on 2023-01-01 16:20:00 with p-value: 0.0327
Drift detected on 2023-01-02 02:50:00 with p-value: 0.0277
Drift detected on 2023-01-02 10:10:00 with p-value: 0.0159
Drift detected on 2023-01-02 10:20:00 with p-value: 0.0007
Drift detected on 2023-01-02 10:30:00 with p-value: 0.0007
Drift detected on 2023-01-02 10:40:00 with p-value: 0.0150
Drift detected on 2023-01-02 10:50:00 with p-value: 0.0116
Drift detected on 2023-01-03 14:00:00 with p-value: 0.0230
Drift detected on 2023-01-03 15:00:00 with p-value: 0.0142
Drift detected on 2023-01-03 15:10:00 with p-value: 0.0142
Drift detected on 2023-01-04 08:10:00 with p-value: 0.0056
Drift detected on 2023-01-04 16:30:00 with p-value: 0.0444
Drift detected on 2023-01-04 21:50:00 with p-value: 0.0472
Drift detected on 2023-01-05 03:50:00 with p-value: 0.0338
Drift de

### **Evidence for Drift**