In [5]:
import numpy as np
from scipy.io import loadmat
import pandas as pd 
from time import time


### 1. Loading Engagement Matrix

This function loads the engagement matrix $L$ from a `.mat` file and marks missing entries (stored as zeros) as NaN.

In [17]:
def load_engagement_matrix(file_path):
    """
    Read the engagement matrix stored under the key 'train' in a MATLAB .mat file.

    Parameters
    ----------
    file_path : path to the .mat file, passed straight to scipy.io.loadmat.

    Returns
    -------
    A float NumPy array in which every entry equal to 0 has been replaced
    by NaN, so that zeros act as "missing" markers downstream.

    Raises
    ------
    KeyError if the file has no variable named 'train'.
    """
    mat_contents = loadmat(file_path)
    if 'train' not in mat_contents:
        raise KeyError("The matrix 'train' is not found in the .mat file.")

    # np.array makes a float copy, so the masking below never touches the
    # object returned by loadmat.
    engagement = np.array(mat_contents['train'], dtype=float)
    np.putmask(engagement, engagement == 0, np.nan)  # 0 encodes "missing"
    return engagement

In [7]:
def initialize_embeddings_xavier(num_researchers, num_papers, embedding_dim, seed=42):
    """
    Initialize researcher and paper embeddings with a Xavier-style uniform draw.

    Values are sampled from U(-limit, limit) with
    limit = sqrt(6 / (2 * embedding_dim)).

    Parameters
    ----------
    num_researchers : number of rows of the researcher embedding matrix.
    num_papers      : number of rows of the paper embedding matrix.
    embedding_dim   : number of columns of both matrices.
    seed            : seed for the random stream (default 42).

    Returns
    -------
    (researcher_vecs, papers_vecs) with shapes
    (num_researchers, embedding_dim) and (num_papers, embedding_dim).

    Notes
    -----
    A private RandomState is used instead of np.random.seed, so calling this
    function no longer resets NumPy's global random state as a side effect.
    The generated numbers are the same as with the global-seed approach.
    """
    rng = np.random.RandomState(seed)
    limit = np.sqrt(6 / (2 * embedding_dim))
    # Draw order (researchers first, then papers) determines the values.
    researcher_vecs = rng.uniform(-limit, limit, (num_researchers, embedding_dim))
    papers_vecs = rng.uniform(-limit, limit, (num_papers, embedding_dim))
    return researcher_vecs, papers_vecs


### 2. Initializing Embeddings

Randomly initializes embeddings for researchers and papers.

In [22]:
def initialize_embeddings(num_researchers, num_papers, embedding_dim, seed=42):
    """
    Initialize researcher and paper embeddings with small Gaussian noise.

    Values are sampled from a normal distribution with mean 0 and standard
    deviation 0.1.

    Parameters
    ----------
    num_researchers : number of rows of the researcher embedding matrix.
    num_papers      : number of rows of the paper embedding matrix.
    embedding_dim   : number of columns of both matrices.
    seed            : seed for the random stream (default 42).

    Returns
    -------
    (researcher_vecs, papers_vecs) with shapes
    (num_researchers, embedding_dim) and (num_papers, embedding_dim).

    Notes
    -----
    A private RandomState is used instead of np.random.seed, so calling this
    function no longer resets NumPy's global random state as a side effect.
    The generated numbers are the same as with the global-seed approach.
    """
    rng = np.random.RandomState(seed)
    # Draw order (researchers first, then papers) determines the values.
    researcher_vecs = rng.normal(scale=0.1, size=(num_researchers, embedding_dim))
    papers_vecs = rng.normal(scale=0.1, size=(num_papers, embedding_dim))
    return researcher_vecs, papers_vecs


### 3. Updating Embeddings
##### 3.1 Researcher Embeddings

In [8]:
def update_researcher_embeddings(L, researcher_vecs, papers_vecs, lambda_reg):
    """
    Refit every researcher embedding while the paper embeddings stay fixed.

    For researcher i with observed (non-NaN) entries, solves the ridge
    normal equations

        (W_obs^T W_obs + lambda_reg * I) z_i = W_obs^T l_obs

    where W_obs are the embeddings of the papers observed in row i of L.
    Rows of L with no observed entry keep their current embedding.

    researcher_vecs is overwritten in place and also returned.
    """
    n_researchers, dim = researcher_vecs.shape
    ridge = lambda_reg * np.eye(dim)  # same regulariser for every row

    for row_idx in range(n_researchers):
        ratings = L[row_idx, :]
        seen = ~np.isnan(ratings)
        if not seen.any():
            continue  # nothing observed for this researcher

        W_seen = papers_vecs[seen]
        gram = W_seen.T @ W_seen + ridge
        rhs = W_seen.T @ ratings[seen]
        researcher_vecs[row_idx] = np.linalg.solve(gram, rhs)

    return researcher_vecs

##### 3.2 Paper Embeddings

In [9]:
def update_paper_embeddings(L, researcher_vecs, papers_vecs, lambda_reg):
    """
    Refit every paper embedding while the researcher embeddings stay fixed.

    For paper j with observed (non-NaN) entries, solves the ridge normal
    equations

        (Z_obs^T Z_obs + lambda_reg * I) w_j = Z_obs^T l_obs

    where Z_obs are the embeddings of the researchers observed in column j
    of L. Columns with no observed entry keep their current embedding.

    papers_vecs is overwritten in place and also returned.
    """
    n_papers, dim = papers_vecs.shape
    ridge = lambda_reg * np.eye(dim)  # same regulariser for every column

    for col_idx in range(n_papers):
        ratings = L[:, col_idx]
        seen = ~np.isnan(ratings)
        if not seen.any():
            continue  # nothing observed for this paper

        Z_seen = researcher_vecs[seen]
        gram = Z_seen.T @ Z_seen + ridge
        rhs = Z_seen.T @ ratings[seen]
        papers_vecs[col_idx] = np.linalg.solve(gram, rhs)

    return papers_vecs


### 4. Training Loop

In [41]:
def train_matrix_factorization(L, embedding_dim, lambda_reg, num_iterations):
    """
    Fit researcher/paper embeddings to L by alternating minimization.

    Embeddings come from initialize_embeddings. Each iteration updates the
    researcher embeddings, then the paper embeddings, and prints the
    training MSE returned by get_train_mse.

    Returns the pair (researcher_vecs, papers_vecs).
    """
    # NOTE: a later cell defines train_matrix_factorization again (with
    # Xavier initialization); whichever cell ran last is the one in effect.
    n_researchers, n_papers = L.shape
    Z, W = initialize_embeddings(n_researchers, n_papers, embedding_dim)

    for iteration in range(num_iterations):
        Z = update_researcher_embeddings(L, Z, W, lambda_reg)
        W = update_paper_embeddings(L, Z, W, lambda_reg)
        mse_loss = get_train_mse(L, Z, W)
        print(f"Iteration {iteration + 1}/{num_iterations}, MSE Loss: {mse_loss:.4f}")

    return Z, W


In [92]:
def train_matrix_factorization(L, embedding_dim, lambda_reg, num_iterations):
    """
    Fit researcher/paper embeddings to L by alternating minimization,
    starting from initialize_embeddings_xavier.

    Each iteration updates the researcher embeddings, then the paper
    embeddings, and prints the training MSE returned by get_train_mse.

    Returns the pair (researcher_vecs, papers_vecs).
    """
    # NOTE: this reuses the name of the train_matrix_factorization defined in
    # an earlier cell (normal initialization) and replaces it once executed.
    rows, cols = L.shape
    embeddings = initialize_embeddings_xavier(rows, cols, embedding_dim)
    researcher_emb, paper_emb = embeddings

    for iteration in range(num_iterations):
        researcher_emb = update_researcher_embeddings(L, researcher_emb, paper_emb, lambda_reg)
        paper_emb = update_paper_embeddings(L, researcher_emb, paper_emb, lambda_reg)
        mse_loss = get_train_mse(L, researcher_emb, paper_emb)
        print(f"Iteration {iteration + 1}/{num_iterations}, MSE Loss: {mse_loss:.4f}")

    return researcher_emb, paper_emb


### 5. Evaluation Metrics
##### Mean Squared Error (MSE)

In [10]:
def get_train_mse(L, researcher_vecs, papers_vecs):
    """
    Mean squared error between predicted and observed entries of L.

    The prediction for entry (i, j) is the dot product of researcher_vecs[i]
    and papers_vecs[j]. Only non-NaN entries of L contribute.

    Parameters
    ----------
    L               : 2-D float array; NaN marks entries to ignore.
    researcher_vecs : array with at least L.shape[0] rows.
    papers_vecs     : array with at least L.shape[1] rows.

    Returns
    -------
    The mean of the squared residuals over observed entries, or NaN when L
    has no observed entry.
    """
    observed = ~np.isnan(L)
    n_observed = np.count_nonzero(observed)
    if n_observed == 0:
        # No observed entries: report NaN explicitly instead of dividing 0/0.
        return np.nan

    # One matrix product replaces the per-cell Python loop; only the rows
    # that L actually indexes are used.
    n_rows, n_cols = L.shape
    predictions = researcher_vecs[:n_rows] @ papers_vecs[:n_cols].T
    residuals = predictions[observed] - L[observed]
    return np.sum(residuals ** 2) / n_observed


##### Training Accuracy

In [11]:
def get_train_acc(L, researcher_vecs, papers_vecs):
    """
    Fraction of observed entries whose predicted sign matches the label.

    The prediction for entry (i, j) is the dot product of researcher_vecs[i]
    and papers_vecs[j]. An entry counts as correct when prediction * label is
    strictly positive, so a prediction of exactly 0 is always wrong. NaN
    entries of L are ignored.

    Parameters
    ----------
    L               : 2-D float array; NaN marks entries to ignore.
    researcher_vecs : array with at least L.shape[0] rows.
    papers_vecs     : array with at least L.shape[1] rows.

    Returns
    -------
    float in [0, 1].

    Raises
    ------
    ZeroDivisionError if L has no observed entry.
    """
    observed = ~np.isnan(L)

    # One matrix product replaces the per-cell Python loop; only the rows
    # that L actually indexes are used.
    n_rows, n_cols = L.shape
    predictions = researcher_vecs[:n_rows] @ papers_vecs[:n_cols].T
    same_sign = predictions[observed] * L[observed] > 0

    # Plain Python ints keep the original float return type and the
    # ZeroDivisionError raised when nothing is observed.
    num_correct = int(np.count_nonzero(same_sign))
    total = int(np.count_nonzero(observed))
    return num_correct / total


### Usage

In [42]:
# --- Settings ---------------------------------------------------------------
file_path = 'papers_train.mat'  # Replace with the actual file path
# Embedding dimensionality, regularization strength, number of training iterations.
embedding_dim, lambda_reg, num_iterations = 10, 0.1, 20

# --- Load the engagement matrix ---------------------------------------------
L = load_engagement_matrix(file_path)
print(f"Loaded engagement matrix L with shape {L.shape}")

# --- Fit the factorization --------------------------------------------------
researcher_vecs, papers_vecs = train_matrix_factorization(
    L,
    embedding_dim=embedding_dim,
    lambda_reg=lambda_reg,
    num_iterations=num_iterations,
)

# --- Report training metrics ------------------------------------------------
mse_loss = get_train_mse(L, researcher_vecs, papers_vecs)
train_acc = get_train_acc(L, researcher_vecs, papers_vecs)
print(f"Final Training MSE: {mse_loss:.4f}")
print(f"Final Training Accuracy: {train_acc:.4f}")

Loaded engagement matrix L with shape (24983, 100)
Iteration 1/20, MSE Loss: 18.4077
Iteration 2/20, MSE Loss: 12.7469
Iteration 3/20, MSE Loss: 11.6793
Iteration 4/20, MSE Loss: 11.1750
Iteration 5/20, MSE Loss: 10.9049
Iteration 6/20, MSE Loss: 10.7517
Iteration 7/20, MSE Loss: 10.6542
Iteration 8/20, MSE Loss: 10.5847
Iteration 9/20, MSE Loss: 10.5307
Iteration 10/20, MSE Loss: 10.4866
Iteration 11/20, MSE Loss: 10.4495
Iteration 12/20, MSE Loss: 10.4180
Iteration 13/20, MSE Loss: 10.3913
Iteration 14/20, MSE Loss: 10.3689
Iteration 15/20, MSE Loss: 10.3505
Iteration 16/20, MSE Loss: 10.3354
Iteration 17/20, MSE Loss: 10.3232
Iteration 18/20, MSE Loss: 10.3132
Iteration 19/20, MSE Loss: 10.3051
Iteration 20/20, MSE Loss: 10.2983
Final Training MSE: 10.2983
Final Training Accuracy: 0.8189


In [102]:
# Impute missing values in the engagement matrix L
def impute_matrix(L):
    """
    Return a copy of L in which each NaN is replaced by the mean of the
    non-NaN values in the same row.

    Parameters
    ----------
    L : 2-D array-like of floats; NaN marks a missing entry.

    Returns
    -------
    A new float array of the same shape. The input is left untouched, so the
    caller keeps the NaN markers and can still tell observed entries from
    imputed ones. Rows that contain no observed value have no mean to impute
    and stay all-NaN.
    """
    imputed = np.array(L, dtype=float)  # np.array copies; L is never modified
    for row in imputed:  # each row is a view, so writes land in `imputed`
        missing = np.isnan(row)
        # Skip complete rows (nothing to do) and empty rows (no mean exists).
        if missing.any() and not missing.all():
            row[missing] = np.nanmean(row)
    return imputed
# Impute on a copy so the NaN markers in L survive for cells that reuse L.
L_imputed = impute_matrix(L.copy())

### 1. Updating Embeddings
##### 1.1 Researcher Embeddings

In [13]:
def update_researcher_embeddings(L, researcher_vecs, papers_vecs, lambda_reg):
    """
    Recompute the researcher embeddings (Z) with the paper embeddings (W) held fixed.

    Each researcher with at least one non-NaN entry in their row of L gets the
    solution of (W_obs^T W_obs + lambda_reg * I) z = W_obs^T l_obs, where
    W_obs holds the embeddings of the papers observed for that researcher.
    Researchers without any observed entry are left as they are.

    The update is written into researcher_vecs, which is also returned.
    """
    count, dim = researcher_vecs.shape
    penalty = lambda_reg * np.eye(dim)

    for i in range(count):
        row = L[i, :]
        has_value = ~np.isnan(row)
        if has_value.any():
            papers_seen = papers_vecs[has_value]
            targets = row[has_value]
            lhs = papers_seen.T @ papers_seen + penalty
            researcher_vecs[i] = np.linalg.solve(lhs, papers_seen.T @ targets)

    return researcher_vecs


##### 1.2 Paper Embeddings

In [14]:
def update_paper_embeddings(L, researcher_vecs, papers_vecs, lambda_reg):
    """
    Recompute the paper embeddings (W) with the researcher embeddings (Z) held fixed.

    Each paper with at least one non-NaN entry in its column of L gets the
    solution of (Z_obs^T Z_obs + lambda_reg * I) w = Z_obs^T l_obs, where
    Z_obs holds the embeddings of the researchers observed for that paper.
    Papers without any observed entry are left as they are.

    The update is written into papers_vecs, which is also returned.
    """
    count, dim = papers_vecs.shape
    penalty = lambda_reg * np.eye(dim)

    for j in range(count):
        column = L[:, j]
        has_value = ~np.isnan(column)
        if has_value.any():
            researchers_seen = researcher_vecs[has_value]
            targets = column[has_value]
            lhs = researchers_seen.T @ researchers_seen + penalty
            papers_vecs[j] = np.linalg.solve(lhs, researchers_seen.T @ targets)

    return papers_vecs


###  2. Training Loop

In [52]:
def train_matrix_factorization2(L, embedding_dim, lambda_reg, num_iterations):
    """
    Timed variant of the alternating-minimization training loop.

    Starts from initialize_embeddings_xavier, then for each iteration updates
    the researcher embeddings, updates the paper embeddings, and prints the
    training MSE from get_train_mse. After the loop it prints the elapsed
    wall-clock time of the loop (initialization is not included).

    Returns the pair (researcher_vecs, papers_vecs).
    """
    n_rows, n_cols = L.shape
    Z, W = initialize_embeddings_xavier(n_rows, n_cols, embedding_dim)

    started_at = time()
    for iteration in range(num_iterations):
        Z = update_researcher_embeddings(L, Z, W, lambda_reg)
        W = update_paper_embeddings(L, Z, W, lambda_reg)
        mse_loss = get_train_mse(L, Z, W)
        print(f"Iteration {iteration + 1}/{num_iterations}, MSE Loss: {mse_loss:.4f}")
    total_time = time() - started_at

    print(f"Training completed in {total_time:.2f} seconds")
    return Z, W

#def train_matrix_factorization(L, embedding_dim, lambda_reg, num_iterations):
#    """
#    Train matrix factorization using alternating minimization with Xavier initialization.
#    """
#    num_researchers, num_papers = L.shape
#    # Use Xavier initialization
#    researcher_vecs, papers_vecs = initialize_embeddings(num_researchers, num_papers, embedding_dim)
#    
#    for iteration in range(num_iterations):
#        researcher_vecs = update_researcher_embeddings(L, researcher_vecs, papers_vecs, lambda_reg)
#        papers_vecs = update_paper_embeddings(L, researcher_vecs, papers_vecs, lambda_reg)
#        mse_loss = get_train_acc(L, researcher_vecs, papers_vecs)
#        print(f"Iteration {iteration + 1}/{num_iterations}, MSE Loss: {mse_loss:.4f}")
#    
#    return researcher_vecs, papers_vecs

#def get_weighted_train_mse(L, researcher_vecs, papers_vecs):
#    """
#    Calculate the Weighted Mean Squared Error (MSE) loss on the training data.
#    """
#    mse_loss, total_weight = 0, 0
#    for i in range(L.shape[0]):
#        for j in range(L.shape[1]):
#            if not np.isnan(L[i, j]):
#                weight = 1 + abs(L[i, j])  # Higher weight for larger engagement values
#                total_weight += weight
#                mse_loss += weight * (np.dot(researcher_vecs[i], papers_vecs[j]) - L[i, j])**2
#    return mse_loss / total_weight
#

#def train_matrix_factorization_with_weighted_mse(L, embedding_dim, lambda_reg, num_iterations):
#    """
#    Train matrix factorization using alternating minimization with Weighted MSE.
#    """
#    num_researchers, num_papers = L.shape
#    researcher_vecs, papers_vecs = initialize_embeddings_xavier(num_researchers, num_papers, embedding_dim)
#
#    for iteration in range(num_iterations):
#        # Update researcher and paper embeddings
#        researcher_vecs = update_researcher_embeddings(L, researcher_vecs, papers_vecs, lambda_reg)
#        papers_vecs = update_paper_embeddings(L, researcher_vecs, papers_vecs, lambda_reg)
#        
#        # Calculate weighted MSE
#        weighted_mse_loss = get_weighted_train_mse(L, researcher_vecs, papers_vecs)
#        print(f"Iteration {iteration + 1}/{num_iterations}, Weighted MSE Loss: {weighted_mse_loss:.4f}")
#    
#    return researcher_vecs, papers_vecs




In [44]:
def impute_matrix(L):
    """
    Return a copy of L in which each NaN is replaced by the mean of the
    non-NaN values in the same row.

    Parameters
    ----------
    L : 2-D array-like of floats; NaN marks a missing entry.

    Returns
    -------
    A new float array of the same shape. The input is left untouched, so the
    caller keeps the NaN markers and can still tell observed entries from
    imputed ones. Rows that contain no observed value have no mean to impute
    and stay all-NaN.
    """
    imputed = np.array(L, dtype=float)  # np.array copies; L is never modified
    for row in imputed:  # each row is a view, so writes land in `imputed`
        missing = np.isnan(row)
        # Skip complete rows (nothing to do) and empty rows (no mean exists).
        if missing.any() and not missing.all():
            row[missing] = np.nanmean(row)
    return imputed

### 3. Training and Evaluation

In [53]:
# Train and evaluate on the row-mean-imputed matrix
embedding_dim = 10
lambda_reg = 0.1
num_iterations = 20
L = load_engagement_matrix('papers_train.mat')
# Impute a copy: L keeps its NaN markers, and L1 is a separate array rather
# than a second name for the same (overwritten) data.
L1 = impute_matrix(L.copy())
researcher_vecs, papers_vecs = train_matrix_factorization2(L1, embedding_dim, lambda_reg, num_iterations)
# NOTE(review): get_train_acc only skips NaN entries, and L1 has had its NaNs
# filled with row means, so this accuracy is scored against imputed entries
# as well as observed ones. Pass L instead to score observed entries only;
# confirm which is intended.
train_acc = get_train_acc(L1, researcher_vecs, papers_vecs)
print(f"Final Training Accuracy: {train_acc:.4f}")

Iteration 1/20, MSE Loss: 7.2724
Iteration 2/20, MSE Loss: 5.9310
Iteration 3/20, MSE Loss: 5.7745
Iteration 4/20, MSE Loss: 5.6953
Iteration 5/20, MSE Loss: 5.6588
Iteration 6/20, MSE Loss: 5.6415
Iteration 7/20, MSE Loss: 5.6323
Iteration 8/20, MSE Loss: 5.6268
Iteration 9/20, MSE Loss: 5.6233
Iteration 10/20, MSE Loss: 5.6207
Iteration 11/20, MSE Loss: 5.6186
Iteration 12/20, MSE Loss: 5.6169
Iteration 13/20, MSE Loss: 5.6154
Iteration 14/20, MSE Loss: 5.6141
Iteration 15/20, MSE Loss: 5.6129
Iteration 16/20, MSE Loss: 5.6118
Iteration 17/20, MSE Loss: 5.6108
Iteration 18/20, MSE Loss: 5.6099
Iteration 19/20, MSE Loss: 5.6091
Iteration 20/20, MSE Loss: 5.6084
Training completed in 224.47 seconds
Final Training Accuracy: 0.8634


### Generating Predictions for Researcher-Paper Pairs

####  1. Loading the Test Set

In [54]:
def load_test_set_from_txt(file_path):
    """
    Read researcher-paper pairs from a header-less, comma-separated text file.

    The two columns of the file are named "Researcher" and "Paper". An "ID"
    column numbering the rows 1..n in file order is added.

    Returns a DataFrame with the columns ["ID", "Researcher", "Paper"].
    """
    column_order = ["ID", "Researcher", "Paper"]
    pairs = pd.read_csv(file_path, header=None, names=column_order[1:])
    numbered = pairs.assign(ID=range(1, len(pairs) + 1))  # 1-based row IDs
    return numbered[column_order]


#### 2. Generating Predictions

In [55]:
def generate_predictions(test_data, researcher_vecs, papers_vecs):
    """
    Predict a +1 / -1 label for every researcher-paper pair in the test set.

    Parameters
    ----------
    test_data       : DataFrame with 1-based "Researcher" and "Paper" ID columns.
    researcher_vecs : 2-D array; row k is the embedding of researcher ID k + 1.
    papers_vecs     : 2-D array; row k is the embedding of paper ID k + 1.

    Returns
    -------
    A list of Python ints, one per row of test_data in order: 1 when the dot
    product of the two embeddings is strictly positive, otherwise -1.

    Raises
    ------
    IndexError if an ID is smaller than 1 or larger than the number of
    available embeddings. IDs below 1 used to wrap around to the end of the
    embedding matrix through negative indexing and yield a prediction for
    the wrong row.
    """
    researcher_vecs = np.asarray(researcher_vecs)
    papers_vecs = np.asarray(papers_vecs)

    # Convert the 1-based IDs to 0-based row indices for the whole column at once.
    researcher_idx = test_data["Researcher"].astype(np.int64).to_numpy() - 1
    paper_idx = test_data["Paper"].astype(np.int64).to_numpy() - 1

    for label, idx, size in (
        ("Researcher", researcher_idx, len(researcher_vecs)),
        ("Paper", paper_idx, len(papers_vecs)),
    ):
        if idx.size and (idx.min() < 0 or idx.max() >= size):
            raise IndexError(f"{label} IDs must be between 1 and {size}")

    # Row-wise dot products of the gathered embeddings.
    scores = (researcher_vecs[researcher_idx] * papers_vecs[paper_idx]).sum(axis=1)
    return np.where(scores > 0, 1, -1).tolist()


#### 3. Saving Predictions

In [56]:
def save_predictions_to_csv(test_data, predictions, output_file):
    """
    Write the predictions to a CSV file with the columns "ID" and "Value".

    Parameters
    ----------
    test_data   : DataFrame with an "ID" column, one row per prediction.
    predictions : sequence of predicted values, in the row order of test_data.
    output_file : destination path; the file gets a header row and no index column.

    The submission frame is built separately, so test_data is not modified
    (no "Value" column is added to the caller's DataFrame).
    """
    submission = test_data[["ID"]].assign(Value=predictions)
    submission.to_csv(output_file, index=False, header=True)

#### Steps to Execute

In [57]:
# Input and output locations for the prediction run.
test_file = 'papers_test.txt'  # Replace with the actual test file path
output_file = 'done.csv'  # Desired output file name

# Load the researcher-paper pairs to score.
test_data = load_test_set_from_txt(test_file)

# Score every pair with the embeddings currently held in researcher_vecs / papers_vecs.
predictions = generate_predictions(test_data, researcher_vecs, papers_vecs)

# Write the ID/Value submission file and confirm.
save_predictions_to_csv(test_data, predictions, output_file)
print(f"Predictions saved to {output_file}")

Predictions saved to done.csv
