# Data Drift Notebook

Download and extract the MovieLens 100K (ml-100k) dataset

In [16]:
import requests
import zipfile
import os
from pathlib import Path

# Download and unpack the MovieLens 100K archive into ./data.
# The cell is safe to re-run: nothing is fetched when the ratings file is already present.
data_dir = Path('data')
data_dir.mkdir(exist_ok=True)

url = "https://files.grouplens.org/datasets/movielens/ml-100k.zip"

# Temporary location for the downloaded archive (named after the dataset it holds)
zip_path = data_dir / "ml-100k.zip"

if not (data_dir / 'ml-100k' / 'u.data').exists():
    # A timeout stops the cell from hanging forever on a stalled connection
    response = requests.get(url, timeout=60)
    # Fail here on an HTTP error instead of saving an error page and
    # hitting a confusing BadZipFile a few lines later
    response.raise_for_status()

    with open(zip_path, 'wb') as f:
        f.write(response.content)

    try:
        # Extract the contents to the data folder
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(data_dir)
    finally:
        # Remove the temporary zip file even when extraction fails
        os.remove(zip_path)

In [2]:
import pandas as pd
# Read the tab-separated ratings file. Column names are supplied here, so the
# file's first line is read as data rather than as a header.
ratings_df = pd.read_csv(
    'data/ml-100k/u.data',
    sep='\t',
    names=['userId', 'movieId', 'rating', 'timestamp'],
)

Optionally limit the data to the first 10,000 ratings to speed up training.

In [3]:
# Keep only the first 10,000 ratings. The explicit copy detaches the subset from
# the full frame, so later in-place edits of the subset cannot trigger
# SettingWithCopyWarning or write through to the full data.
ratings_df = ratings_df.iloc[:10000].copy()

## Introducing Drift

In [4]:
# Simulate user preference drift by increasing the ratings for Action movies.
# Simplifying assumption: movie IDs 1 through 100 (inclusive) stand in for Action movies.
action_movie_ids = [*range(1, 101)]
# Introduce drift: increase the ratings of Action movies by a fixed amount (e.g., +1)
def introduce_user_drift(df, movie_ids, drift_amount=1):
    """Return a drifted copy of ``df``; the input frame is left untouched.

    Ratings of rows whose ``movieId`` is in ``movie_ids`` are shifted by
    ``drift_amount``, then every rating is clipped to the 1-5 range.

    Args:
        df: DataFrame with at least ``movieId`` and ``rating`` columns.
        movie_ids: Collection of movie ids whose ratings should be shifted.
        drift_amount: Value added to the selected ratings (default 1).

    Returns:
        A new DataFrame holding the drifted ratings.
    """
    # Work on a copy: mutating ``df`` itself would also drift the caller's
    # baseline data, making "with drift" and "without drift" identical.
    drifted = df.copy()

    # Select rows where the movie is one of the targeted movies
    is_target = drifted['movieId'].isin(movie_ids)
    drifted.loc[is_target, 'rating'] = drifted.loc[is_target, 'rating'] + drift_amount

    # Clip ratings to stay within the 1-5 range
    drifted['rating'] = drifted['rating'].clip(1, 5)

    return drifted

# Shift the ratings of the assumed Action titles to mimic a change in user taste
movie_ratings_with_drift = introduce_user_drift(
    df=ratings_df,
    movie_ids=action_movie_ids,
)

# Preview the first rows of the drifted ratings
print(movie_ratings_with_drift.head())

   userId  movieId  rating  timestamp
0     196      242       3  881250949
1     186      302       3  891717742
2      22      377       1  878887116
3     244       51       3  880606923
4     166      346       1  886397596


## Train models on data with and without drift

In [5]:
import torch
from torch.utils.data import Dataset
class datasetReader(Dataset):
    """Torch dataset over a ratings DataFrame.

    Each item is a ``(user, item, rating)`` triple of scalar tensors read from
    the ``userId``, ``movieId`` and ``rating`` columns. Both ids are shifted
    down by one before being returned.
    """

    def __init__(self, df, dataset_name):
        self.df = df
        self.name = dataset_name
        # Report how many rows this split contains
        n_rows = self.df.shape[0]
        print(f"{self.name} : {n_rows}")

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        record = self.df.iloc[idx]
        # Cast to plain Python numbers before building the tensors
        user_index = int(record['userId']) - 1
        item_index = int(record['movieId']) - 1
        rating_value = float(record['rating'])
        return (
            torch.tensor(user_index, dtype=torch.long),
            torch.tensor(item_index, dtype=torch.long),
            torch.tensor(rating_value, dtype=torch.float32),
        )


In [6]:
def train_model(train_df,n_users,n_items,hot_reload_model_run_id=None,
                model_embedding_factors=20, model_learning_rate=1e-3,model_hidden_dims=256, model_dropout_rate=0.2,
                optimizer_step_size=10, optimizer_gamma=0.1,
                training_epochs=5,
                train_batch_size=64, shuffle_training_data=True,
                seed=None):
    """Train a small embedding-based rating model on ``train_df`` and return it.

    The model multiplies a user embedding with an item embedding element-wise
    and feeds the product through Linear -> ReLU -> Dropout -> Linear to
    predict a single rating. It is trained with SGD, an L1 loss and a StepLR
    schedule that is stepped once per epoch.

    Args:
        train_df: Ratings DataFrame; it is wrapped in ``datasetReader``
            (defined elsewhere in this notebook).
        n_users: User count used to size the user embedding table
            (``n_users + 1`` rows are allocated).
        n_items: Item count used to size the item embedding table
            (``n_items + 1`` rows are allocated).
        hot_reload_model_run_id: If given, a model is loaded from
            ``runs:/<id>/model`` instead of being created from scratch.
        model_embedding_factors: Embedding dimension for users and items.
        model_learning_rate: SGD learning rate.
        model_hidden_dims: Width of the hidden linear layer.
        model_dropout_rate: Dropout probability after the hidden layer.
        optimizer_step_size: ``step_size`` passed to ``StepLR``.
        optimizer_gamma: ``gamma`` passed to ``StepLR``.
        training_epochs: Number of passes over the training data.
        train_batch_size: Batch size of the training DataLoader.
        shuffle_training_data: Whether the DataLoader shuffles each epoch.
        seed: Optional integer passed to ``torch.manual_seed`` before the
            model and DataLoader are created. ``None`` (default) leaves the
            global RNG untouched.

    Returns:
        The trained (or loaded and further trained) torch module.
    """
    import torch
    from torch.utils.data import DataLoader

    if seed is not None:
        # Seed before any weights are initialised or batches are shuffled
        torch.manual_seed(seed)

    class MatrixFactorization(torch.nn.Module):
        """User/item embeddings combined element-wise, followed by a small MLP head."""

        def __init__(self, n_users, n_items, n_factors, hidden_dim, dropout_rate):
            super().__init__()
            self.n_items = n_items
            self.user_factors = torch.nn.Embedding(n_users+1,
                                               n_factors,
                                               sparse=False)
            self.item_factors = torch.nn.Embedding(n_items+1,
                                               n_factors,
                                               sparse=False)

            self.linear = torch.nn.Linear(in_features=n_factors, out_features=hidden_dim)
            self.linear2 = torch.nn.Linear(in_features=hidden_dim, out_features=1)
            self.dropout = torch.nn.Dropout(p=dropout_rate)
            self.relu = torch.nn.ReLU()

        def forward(self, user, item):
            user_embedding = self.user_factors(user)
            item_embedding = self.item_factors(item)
            # Element-wise product of the two embeddings is the input to the MLP head
            embedding_vector = torch.mul(user_embedding, item_embedding)
            x = self.relu(self.linear(embedding_vector))
            x = self.dropout(x)
            rating = self.linear2(x)
            return rating


    if hot_reload_model_run_id is not None:
        # NOTE(review): "runs:/<id>/model" looks like an MLflow run URI, while
        # torch.load expects a file path or file-like object - confirm this
        # branch works as intended. torch.load also unpickles its input, so it
        # must only ever be pointed at trusted artifacts.
        model_uri = f"runs:/{hot_reload_model_run_id}/model"
        model = torch.load(model_uri)
    else:
        model = MatrixFactorization(n_users, n_items, n_factors=model_embedding_factors, hidden_dim=model_hidden_dims, dropout_rate=model_dropout_rate)

    optimizer = torch.optim.SGD(model.parameters(), lr=model_learning_rate)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=optimizer_step_size, gamma=optimizer_gamma)
    loss_func = torch.nn.L1Loss()

    train_dataloader = DataLoader(datasetReader(train_df, 'train'), batch_size=train_batch_size, shuffle=shuffle_training_data)

    for train_iter in range(training_epochs):
        print(train_iter)
        model.train()
        print("training")
        for row, col, rating in train_dataloader:
            prediction = model(row, col)
            # Predictions are (batch, 1); give the targets the same shape
            loss = loss_func(prediction, rating.unsqueeze(1))

            # Backpropagate
            loss.backward()

            # Update the parameters
            optimizer.step()
            optimizer.zero_grad()

        # Advance the learning-rate schedule once per epoch
        scheduler.step()


    return model


In [7]:
# Embedding-table sizes come from the largest ids present in the ratings.
# int() turns the numpy integers returned by .max() into plain Python ints.
n_users = int(ratings_df.userId.max())
n_items = int(ratings_df.movieId.max())

# Re-seed before each run so both models start from the same random state.
# Otherwise embedding differences mix random initialisation with data drift.
SEED = 42

# NOTE(review): `ratings_df` must still hold the un-drifted ratings here;
# verify that the drift step did not modify it in place.
torch.manual_seed(SEED)
model = train_model(ratings_df, n_users, n_items)

torch.manual_seed(SEED)
model_with_drift = train_model(movie_ratings_with_drift, n_users, n_items)


train : 10000
0
training
1
training
2
training
3
training
4
training
train : 10000
0
training
1
training
2
training
3
training
4
training


## Retrieving the item embedding layers and converting them to NumPy arrays

In [8]:
# Item-embedding weight matrices of the baseline model and the drift-trained model
embedding_layer, embedding_layer_with_drift = (
    model.item_factors.weight,
    model_with_drift.item_factors.weight,
)



In [9]:
# Detach from autograd, move to CPU and convert both weight matrices to NumPy arrays
embedding_layer_numpy, embedding_layer_with_drift_numpy = (
    embedding_layer.detach().cpu().numpy(),
    embedding_layer_with_drift.detach().cpu().numpy(),
)

In [10]:
def _item_embeddings_to_frame(item_embeddings):
    """Build a DataFrame from the transpose of an item-embedding matrix.

    One column is created per row of ``item_embeddings`` and labelled
    ``item_factor_<row index>``; each DataFrame row is one embedding dimension.
    """
    # NOTE(review): because of the transpose, `item_factor_<i>` holds the whole
    # embedding vector of item index i, not latent factor i - confirm this
    # orientation is what the drift check is meant to compare.
    return pd.DataFrame(
        item_embeddings.T,
        columns=[f'item_factor_{i}' for i in range(item_embeddings.shape[0])],
    )


# Each frame takes its column labels from its own array
df_item_factors_t1 = _item_embeddings_to_frame(embedding_layer_numpy)
df_item_factors_t2 = _item_embeddings_to_frame(embedding_layer_with_drift_numpy)

## Run Deepchecks feature drift check

In [11]:
# Import under an alias: a bare `Dataset` would rebind the torch `Dataset` name
# that `datasetReader` subclasses, silently changing its base class if that
# cell is re-run after this one.
from deepchecks.tabular import Dataset as DeepchecksDataset

# Wrap both embedding frames as unlabeled deepchecks datasets
dataset_item_factors_t1 = DeepchecksDataset(df_item_factors_t1, label=None)
dataset_item_factors_t2 = DeepchecksDataset(df_item_factors_t2, label=None)



In [13]:
from deepchecks.tabular.checks import FeatureDrift

# Run the drift check on a single column, comparing the baseline frame (first
# argument) against the drift-trained frame (second argument).
drift_check_item = FeatureDrift(columns=["item_factor_2"]).run(
    dataset_item_factors_t1,
    dataset_item_factors_t2,
)

Save report as HTML

In [None]:
# Write the drift report to an HTML file (path is relative to the working
# directory); the call's return value is shown as the cell output.
drift_check_item.save_as_html(
    'drift_check_item.html'
)

'drift_check_item.html'