In [1]:
import os
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
import bottleneck as bn
import torch

import warnings


# 데이터 불러오기

In [2]:
# Load the two input tables from CSV files in the current working directory.
problem_data = pd.read_csv("problem_processed.csv")
# Later cells read the 'Unnamed: 0' and 'correct_problems' columns of this frame.
user_data = pd.read_csv("user_data_with_problem_for_EASE.csv")

# 모델 정의

In [12]:
class EASE:
    """
    Embarrassingly Shallow Autoencoders (EASE) model class.

    Learns a dense item-item weight matrix ``B`` in closed form from an
    interaction matrix and scores items as ``user_row @ B``.
    """

    def __init__(self, lambda_):
        """
        Create an untrained model.

        :param lambda_: value added to every diagonal entry of the Gram matrix
            before it is inverted (regularization strength).
        """
        self.B = None  # item-item weights; stays None until train() has run
        self.lambda_ = lambda_

    def train(self, interaction_matrix):
        """
        Fit the item-item weight matrix and store it in ``self.B``.

        :param interaction_matrix: 2-D array-like whose columns are the items
            the weights are learned for. Integer and boolean input is converted
            to float64; floating-point input is used as given.
        """
        X = np.asanyarray(interaction_matrix)
        if not np.issubdtype(X.dtype, np.inexact):
            # An integer Gram matrix cannot take a fractional lambda_ in place
            # (NumPy refuses the float -> int cast), and a boolean matmul would
            # be a logical OR-of-ANDs rather than a co-occurrence count.
            X = X.astype(np.float64)

        G = X.T @ X
        G[np.diag_indices_from(G)] += self.lambda_
        P = np.linalg.inv(G)

        # Off-diagonal weights are B[i, j] = -P[i, j] / P[j, j]; the division
        # broadcasts diag(P) across columns. The diagonal is then forced to
        # zero so that an item never contributes to its own score.
        B = P / -np.diag(P)
        np.fill_diagonal(B, 0)
        self.B = B

    def forward(self, user_row):
        """
        Score items for one user row or for a batch of rows.

        :param user_row: array whose last axis matches the number of rows of
            ``self.B``.
        :return: ``user_row @ self.B``.
        """
        return user_row @ self.B

In [3]:

class EASE:
    """
    Embarrassingly Shallow Autoencoders (EASE) model class.

    Learns a dense item-item weight matrix ``B`` in closed form from an
    interaction matrix and scores items by multiplying interaction rows
    with ``B``.
    """

    def __init__(self, lambda_):
        """
        Initialize EASE model.

        :param lambda_: Regularization parameter; it is added to every diagonal
            entry of the Gram matrix before inversion.
        """
        self.B = None  # item-item weights; stays None until train() has run
        self.lambda_ = lambda_

    def train(self, interaction_matrix):
        """
        Train the EASE model and store the weights in ``self.B``.

        :param interaction_matrix: 2-D array-like whose columns are the items
            the weights are learned for. Integer and boolean input is converted
            to float64; floating-point input is used as given.
        """
        X = np.asanyarray(interaction_matrix)
        if not np.issubdtype(X.dtype, np.inexact):
            # An integer Gram matrix cannot take a fractional lambda_ in place
            # (NumPy refuses the float -> int cast), and a boolean matmul would
            # be a logical OR-of-ANDs rather than a co-occurrence count.
            X = X.astype(np.float64)

        G = X.T @ X
        G[np.diag_indices_from(G)] += self.lambda_
        P = np.linalg.inv(G)

        # Off-diagonal weights are B[i, j] = -P[i, j] / P[j, j]; the division
        # broadcasts diag(P) across columns. The diagonal is then forced to
        # zero so that an item never contributes to its own score.
        B = P / -np.diag(P)
        np.fill_diagonal(B, 0)
        self.B = B

    def forward(self, user_row):
        """
        Perform forward pass for a single user or for a batch of users.

        :param user_row: Either one user's interactions (any shape holding
            exactly ``n_items`` values, e.g. a 1-D array) or a 2-D batch of
            shape ``(n_users, n_items)``.
        :return: 2-D array of scores: ``(1, n_items)`` for a single user,
            ``(n_users, n_items)`` for a batch.
        """
        rows = np.asanyarray(user_row)
        n_items = self.B.shape[0]
        if rows.ndim == 2 and rows.shape[1] == n_items:
            # Already one row per user. Flattening a multi-user batch into a
            # single row would no longer line up with B.
            batch = rows
        else:
            # Single user given as a flat or column vector: make it one row.
            batch = rows.reshape(1, -1)
        return np.dot(batch, self.B)

# 데이터 처리

In [4]:
def get_problems_to_idx(problems):
    """
    Convert one raw ``correct_problems`` cell into a list of integer ids.

    :param problems: Either the text of a Python literal sequence (for example
        ``"['1000', '1001']"``), an already parsed iterable of ids, or a
        missing value (``None`` / float NaN).
    :return: List of ``int`` ids in input order; entries that cannot be
        converted with ``int()`` are skipped. Missing or blank input yields
        an empty list.
    :raises ValueError, SyntaxError: If a text cell is not a valid Python
        literal.
    """
    import ast  # function-scope import keeps this cell self-contained

    if isinstance(problems, str):
        text = problems.strip()
        if not text:
            return []
        # literal_eval only builds Python literals, so text coming from the
        # CSV cannot run arbitrary code the way eval() would.
        problems = ast.literal_eval(text)
    elif problems is None or isinstance(problems, float):
        # A float here is taken to be a missing cell (NaN): no solved problems.
        return []

    ret = []
    for problemId in problems:
        try:
            ret.append(int(problemId))
        except (TypeError, ValueError, OverflowError):
            # Not integer-like (e.g. None, 'abc', inf): skip this entry.
            continue
    return ret

In [5]:
# Replace every raw 'correct_problems' cell by the list of integer ids
# returned by get_problems_to_idx.
user_data['correct_problems'] = user_data['correct_problems'].apply(get_problems_to_idx)

In [6]:
# Long format: one row per (user, solved problem) pair. Rows left without a
# problem after the explode (NaN) are dropped.
df_user_problems = (
    user_data[['Unnamed: 0', 'correct_problems']]
    .explode('correct_problems')
    .reset_index(drop=True)
    .dropna(axis=0)
)

In [7]:
# Give both columns their index-style names in a single rename.
df_user_problems = df_user_problems.rename(
    columns={
        "Unnamed: 0": "Id_idx",
        "correct_problems": "correct_problems_idx",
    }
)
df_user_problems

Unnamed: 0,Id_idx,correct_problems_idx
0,0,0
1,0,1
2,0,2
3,0,3
4,0,4
...,...,...
13844578,121182,3804
13844579,121182,3831
13844580,121182,4869
13844581,121182,5096


In [8]:
# Mark every (user, problem) row as solved, then pivot to a user x problem grid.
df_user_problems['solve'] = 1
pivot_table = df_user_problems.pivot_table(
    values="solve",
    index=["Id_idx"],
    columns=["correct_problems_idx"],
)
# Dense matrix for the model: pairs missing from the pivot (NaN) become 0.
X = np.nan_to_num(pivot_table.to_numpy())

In [9]:
# Display the user x problem pivot; NaN marks pairs that have no 'solve' entry.
pivot_table

correct_problems_idx,0,1,2,3,4,5,6,7,8,9,...,7620,7621,7622,7623,7624,7625,7626,7627,7628,7629
Id_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,,1.0,,,,,1.0,
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,
2,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,1.0,1.0,...,,,,,,,,1.0,,
3,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,1.0,1.0,...,,,,,1.0,,,,,
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121178,1.0,1.0,,,,,,,1.0,,...,,,,,,,,,,
121179,1.0,1.0,,,,,,,1.0,,...,,,,,,,,,,
121180,1.0,1.0,,,,,,,1.0,,...,,,,,,,,,,
121181,,,,1.0,,,,,,,...,,,,,,,,,,


In [13]:
# Fit EASE on the dense interaction matrix with regularization value 300.
ease = EASE(lambda_=300)
ease.train(interaction_matrix=X)

In [14]:
# Score every column of X for every row of X in a single call.
result = ease.forward(X)
print(result)

[[ 1.03212826e+00  1.01261054e+00  9.68985618e-01 ...  1.33986035e-01
   7.57900103e-01  6.83520038e-02]
 [ 9.94408434e-01  9.80739846e-01  9.98117603e-01 ...  6.72163317e-02
   6.71689719e-02 -6.47086218e-03]
 [ 1.01641708e+00  1.05040422e+00  1.01058182e+00 ...  6.62409061e-01
   4.41536363e-02 -1.45808688e-02]
 ...
 [ 1.06840992e+00  9.44327409e-01 -4.18531575e-02 ...  2.66484727e-03
   1.60465834e-04  2.72632640e-04]
 [ 3.80383817e-01 -1.52932402e-02  4.98642546e-01 ...  1.55543435e-03
   3.02243806e-04 -6.60166277e-05]
 [ 9.88121251e-01  1.01890706e+00  1.90507290e-02 ... -1.68613784e-03
  -1.53161106e-04  5.30582384e-04]]


In [None]:
# Entries that are non-zero in X get a score of -inf so that they can never
# rank among the highest-scoring problems.
result[np.nonzero(X)] = float("-inf")
print(result)

In [None]:
NUM_TOP_PROBLEMS = 10

# Partition the negated scores along each row and keep the first
# NUM_TOP_PROBLEMS column positions, i.e. the highest-scoring columns.
# NOTE(review): a partition is not a full sort, so the kept positions are
# presumably not ordered by score - confirm before treating them as a ranking.
top_problems_by_user = bn.argpartition(np.negative(result), NUM_TOP_PROBLEMS, axis=1)
top_problems_by_user = top_problems_by_user[:, :NUM_TOP_PROBLEMS]
print(top_problems_by_user)

In [None]:
# X was taken from pivot_table.to_numpy(), so row i / column j of the score
# matrix belong to pivot_table.index[i] / pivot_table.columns[j]. Translate the
# positional indices back to those labels: positions only equal the real ids
# when both label sets happen to be exactly 0..n-1 with nothing missing.
row_user_ids = pivot_table.index.to_numpy()
col_problem_ids = pivot_table.columns.to_numpy()

selected_positions = np.asarray(top_problems_by_user)
assert len(row_user_ids) == selected_positions.shape[0], \
    "top_problems_by_user must have one row per pivot_table row"

# One output row per (user, recommended problem) pair, user-major order.
user_result = np.repeat(row_user_ids, selected_positions.shape[1]).tolist()
problem_result = col_problem_ids[selected_positions.ravel()].tolist()

df_user_result = pd.DataFrame(user_result, columns=['user_id'])
df_problem_result = pd.DataFrame(problem_result, columns=['problem_id'])
df_result = pd.concat([df_user_result, df_problem_result], axis=1)

In [None]:
# Write the recommendations to a CSV file in the current working directory.
# No index option is passed, so pandas' default index handling applies.
df_result.to_csv("EASE_result.csv")

# 평가?