In [22]:
import pandas as pd
import numpy as np
from numpy.linalg import inv
from sklearn.preprocessing import LabelEncoder, StandardScaler

def preprocess_data(data):
    """Encode non-numeric columns as integers, then standardize every column.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It is copied; the caller's frame is not modified.

    Returns
    -------
    scaled_data : pandas.DataFrame
        Standardized values with the same columns and the same index as
        ``data``.
    label_encoders : dict
        Maps the name of each encoded column to the ``LabelEncoder`` that was
        fitted on it.
    scaler : StandardScaler
        The scaler fitted on the encoded frame.

    Notes
    -----
    Every call fits brand-new encoders and a brand-new scaler on the frame it
    is given, so two frames processed by two separate calls are not
    guaranteed to share the same encoding or scale.
    """
    numeric_data = data.copy()
    label_encoders = {}

    for column in numeric_data.columns:
        # Encode every non-numeric column, not only ``object`` ones: text can
        # also be stored with a dedicated string or categorical dtype, and
        # such a column would otherwise reach the scaler unencoded.
        if not pd.api.types.is_numeric_dtype(numeric_data[column]):
            le = LabelEncoder()
            # Cast to str first so missing values and mixed types get a label.
            numeric_data[column] = le.fit_transform(numeric_data[column].astype(str))
            label_encoders[column] = le

    # Scale the features
    scaler = StandardScaler()
    scaled_data = pd.DataFrame(
        scaler.fit_transform(numeric_data),
        columns=numeric_data.columns,
        # Keep the caller's row labels instead of silently resetting them.
        index=numeric_data.index,
    )

    return scaled_data, label_encoders, scaler

def calculate_mahalanobis_similarity(data):
    """Return the pairwise Mahalanobis similarity matrix of the rows of ``data``.

    The distance between rows ``i`` and ``j`` is
    ``sqrt((x_i - x_j)^T VI (x_i - x_j))`` where ``VI`` is the (pseudo-)inverse
    of the column covariance of ``data``; the similarity is ``exp(-distance)``.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric frame with one observation per row.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(data), len(data))`` with ones on the diagonal.
    """
    values = data.to_numpy(dtype=float)

    # ``np.cov`` returns a 0-d array for a single column; force a matrix.
    covariance_matrix = np.atleast_2d(np.cov(values.T))
    # The pseudo-inverse equals the inverse when the covariance is invertible
    # and still yields a usable metric when it is singular (for example when
    # one column is constant), where a plain inverse would raise.
    inv_covariance_matrix = np.linalg.pinv(covariance_matrix)

    n = len(values)
    distances = np.zeros((n, n))

    # One vectorized pass per row instead of a Python loop over every pair.
    for i in range(n):
        diffs = values - values[i]
        squared = np.einsum("ij,ij->i", diffs @ inv_covariance_matrix, diffs)
        # Round-off can push a tiny quadratic form below zero; clip so the
        # square root never produces NaN.
        distances[i] = np.sqrt(np.clip(squared, 0.0, None))

    # Convert distances to similarities
    similarities = np.exp(-distances)
    return similarities

def get_recommendations(user_profile, train_data, similarity_matrix, k=5):
    """Return the ``k`` training rows closest to a user in Mahalanobis distance.

    Parameters
    ----------
    user_profile : pandas.DataFrame
        Frame whose first row is the user to match; only that row is read.
    train_data : pandas.DataFrame
        Numeric frame of candidate rows. Its column covariance defines the
        metric.
    similarity_matrix : object
        Accepted for interface compatibility; this function does not read it.
    k : int, default 5
        Number of rows to return. Values of zero or less give an empty frame.

    Returns
    -------
    pandas.DataFrame
        Up to ``k`` rows of ``train_data`` ordered from least to most similar,
        so the closest match is the last row.
    """
    if k <= 0:
        # ``[-0:]`` would select every row, so handle this case explicitly.
        return train_data.iloc[0:0]

    train_values = train_data.to_numpy(dtype=float)
    # Line the user's values up with the training columns by label.
    user_values = (
        user_profile.iloc[0].reindex(train_data.columns).to_numpy(dtype=float)
    )

    # The metric depends only on the training data, so build it once rather
    # than once per training row. The pseudo-inverse matches the inverse for
    # an invertible covariance and also tolerates a singular one.
    inv_covariance = np.linalg.pinv(np.atleast_2d(np.cov(train_values.T)))

    diffs = user_values - train_values
    squared = np.einsum("ij,ij->i", diffs @ inv_covariance, diffs)
    # Clip round-off negatives so the square root never produces NaN.
    distances = np.sqrt(np.clip(squared, 0.0, None))

    # Similarity is exp(-distance), so ranking by descending distance is the
    # same as ranking by ascending similarity, without underflow-induced ties.
    order = np.argsort(-distances, kind="stable")
    similar_indices = order[-k:]
    return train_data.iloc[similar_indices]

def fill_test_values(test_data, train_data, train_VI):
    """Replace every test row with a weighted average of its nearest training rows.

    For each row of ``test_data`` the nearest training rows are obtained from
    ``get_recommendations``. Each neighbour is weighted by
    ``exp(-Mahalanobis distance)`` to the test row, the weights are normalized
    to sum to one, and the whole test row is overwritten with the rounded
    weighted average of the neighbours.

    Parameters
    ----------
    test_data : pandas.DataFrame
        Numeric rows to fill. Not modified; a copy is returned.
    train_data : pandas.DataFrame
        Numeric rows to draw neighbours from. Must have the same columns, in
        the same order, as ``test_data``.
    train_VI : object
        Forwarded unchanged as the third argument of ``get_recommendations``;
        not otherwise read here.
        NOTE(review): the name suggests an inverse covariance matrix, but the
        notebook passes the pairwise similarity matrix - confirm the intent.

    Returns
    -------
    pandas.DataFrame
        Copy of ``test_data`` with every row replaced by its prediction.

    Raises
    ------
    ValueError
        If the two frames do not share the same columns in the same order.

    Notes
    -----
    Rounding is applied in whatever scale the inputs are in.
    NOTE(review): if the inputs are standardized, confirm that rounding in
    that scale is intended.
    """
    filled_predictions = test_data.copy()
    if len(test_data) == 0:
        return filled_predictions

    if list(test_data.columns) != list(train_data.columns):
        raise ValueError("test_data and train_data must have identical columns")

    # The distance metric depends only on the training data: build it once.
    train_values = train_data.to_numpy(dtype=float)
    inv_covariance = np.linalg.pinv(np.atleast_2d(np.cov(train_values.T)))

    # Walk rows by position so any index (not just 0..n-1) works with .iloc.
    for position in range(len(test_data)):
        test_user = test_data.iloc[[position]]

        # Get recommendations (similar users) from training set
        similar_users = get_recommendations(test_user, train_data, train_VI)
        neighbour_values = similar_users.to_numpy(dtype=float)

        # get_recommendations returns rows only, so the distances needed for
        # the weights are recomputed here for just those few neighbours.
        diffs = neighbour_values - test_user.to_numpy(dtype=float)
        squared = np.einsum("ij,ij->i", diffs @ inv_covariance, diffs)
        distances = np.sqrt(np.clip(squared, 0.0, None))

        # Shifting by the smallest distance leaves the normalized weights
        # unchanged but guarantees they cannot all underflow to zero.
        weights = np.exp(-(distances - distances.min()))
        weights = weights / np.sum(weights)  # Normalize weights

        # Calculate weighted average
        weighted_predictions = np.average(neighbour_values, weights=weights, axis=0)

        # Update the test data with weighted predictions
        filled_predictions.iloc[position, :] = np.round(weighted_predictions)

    return filled_predictions


## Load

In [23]:
# Read the training table from disk and preview its first five rows.
train = pd.read_csv(filepath_or_buffer="../data/train/user_item.csv")
train.head(5)

Unnamed: 0,profile_id,offer_0,offer_1,offer_2,offer_3,offer_4,offer_5,offer_6,offer_7,offer_8,offer_9
0,86becbd667a94db3a3dee0854470de7c,0,0,0,4,0,0,0,3,6,2
1,676ee3fbf66b46078484ecaa99bc8d1a,4,0,0,0,0,0,0,0,3,4
2,fada060561c24d4a984cc6eba6e2a63a,0,4,0,0,0,0,0,0,0,2
3,5cd3b3a0e5284df1adf8cca2f59ed28f,0,4,12,0,0,0,0,3,0,0
4,8a6a2df8be214007991afb612eb64c1c,0,0,0,0,0,3,3,0,0,0


In [24]:
# Read the test table from disk and preview its first five rows.
test = pd.read_csv(filepath_or_buffer="../data/test/user_item.csv")
test.head(5)

Unnamed: 0,profile_id,offer_0,offer_1,offer_2,offer_3,offer_4,offer_5,offer_6,offer_7,offer_8,offer_9
0,b19c8e7ac2ff40ae92b4fcf3247f8912,0,4,4,0,3,0,0,0,0,0
1,0b680efe1a0a40788ebb6fb2c587b4a7,0,0,0,0,3,0,0,3,0,0
2,9232bc9e68744227bdcc537e44d159f7,0,4,0,4,0,0,0,3,3,2
3,6e7d42fc10ee466c80a4056b3ec0b072,0,4,0,4,0,3,0,3,0,0
4,72257b80d8c1407ead2b3af3e7891c25,0,4,0,4,0,0,3,3,3,0


## Transform

In [25]:
# Preprocess data
# The encoders and scaler fitted on the training frame are kept for later use.
train_processed, label_encoders, scaler = preprocess_data(train)

# Only the processed frame is needed for the test set. Indexing the result,
# rather than unpacking into `_`, leaves IPython's `_` (last output) intact.
# NOTE(review): this call fits a separate scaler and separate encoders on the
# test frame, so train and test rows are not on a shared scale - confirm that
# this is intended before comparing them by distance.
test_processed = preprocess_data(test)[0]

## Train

In [None]:
# Build the pairwise similarity matrix over all processed training rows.
similarity_matrix = calculate_mahalanobis_similarity(data=train_processed)

## Predict

In [None]:
# Fill every processed test row from its most similar training rows.
predictions = fill_test_values(
    test_data=test_processed,
    train_data=train_processed,
    train_VI=similarity_matrix,
)

## Save

In [None]:
# Display first few results
# Use the names this notebook actually defines: `test_processed` is the frame
# that was passed to fill_test_values and `predictions` is what it returned.
print("Original test data:")
print(test_processed.head())
print("\nFilled test data:")
print(predictions.head())