In [13]:
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix

# https://www.mikulskibartosz.name/how-to-reduce-memory-usage-in-pandas/
def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype
    if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.uint8).min and c_max < np.iinfo(np.uint8).max:
                    df[col] = df[col].astype(np.uint8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.uint16).min and c_max < np.iinfo(np.uint16).max:
                    df[col] = df[col].astype(np.uint16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.uint32).min and c_max < np.iinfo(np.uint32).max:
                    df[col] = df[col].astype(np.uint32)                    
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
                elif c_min > np.iinfo(np.uint64).min and c_max < np.iinfo(np.uint64).max:
                    df[col] = df[col].astype(np.uint64)
            elif str(col_type)[:5] == 'float':
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

df = pd.read_csv('./RAW_interactions.csv')
df = df.drop('review', axis=1)
data_df = reduce_mem_usage(df)

# https://stackoverflow.com/questions/49137031/pandas-dataframe-delete-rows-with-low-frequency
# data_df_fix = data_df[data_df.groupby('user_id')['user_id'].transform('count').ge(20)]
# data_df_fixed = data_df_fix[data_df.groupby('recipe_id')['recipe_id'].transform('count').ge(20)]

# https://stackoverflow.com/questions/32511061/remove-low-frequency-values-from-pandas-dataframe
columns = ['user_id', 'recipe_id']
threshold = 20 # Anything that occurs less than this will be removed.
for col in columns:
    value_counts = data_df[col].value_counts() # Specific column 
    to_remove = value_counts[value_counts < threshold].index
    data_df[col].replace(to_remove, np.nan, inplace=True)


# frequency count of column rating
count_u = data_df_fixed['user_id'].value_counts()
print(count_u)

# # data_df = data_df[0:40000]
print(data_df)

Memory usage of dataframe is 34.56 MB
Memory usage after optimization is: 27.00 MB
Decreased by 21.9%


MemoryError: Unable to allocate 1.08 MiB for an array with shape (1132367,) and data type bool

In [None]:
# First, generate dictionaries for mapping old id to new id for users and movies
unique_RecipeID = data_df['recipe_id'].unique()
print(len(unique_RecipeID))
unique_UserID = data_df['user_id'].unique()
print(len(unique_UserID))
j = 0
user_old2new_id_dict = dict()
for u in unique_UserID:
    user_old2new_id_dict[u] = j
    j += 1
j = 0
recipe_old2new_id_dict = dict()
for i in unique_RecipeID:
    recipe_old2new_id_dict[i] = j
    j += 1
    
# Then, use the generated dictionaries to reindex UserID and MovieID in the data_df
user_list = data_df['user_id'].values
recipe_list = data_df['recipe_id'].values
for j in range(len(data_df)):
    user_list[j] = user_old2new_id_dict[user_list[j]]
    recipe_list[j] = recipe_old2new_id_dict[recipe_list[j]]
data_df['user_id'] = user_list
data_df['recipe_id'] = recipe_list

# generate train_df with 70% samples and test_df with 30% samples, and there should have no overlap between them.
train_index = np.random.random(len(data_df)) <= 0.7
train_df = data_df[train_index]
test_df = data_df[~train_index]

# generate train_mat and test_mat
num_user = len(data_df['user_id'].unique())
num_recipe = len(data_df['recipe_id'].unique())

train_mat = coo_matrix((train_df['rating'].values, (train_df['user_id'].values, train_df['recipe_id'].values)), shape=(num_user, num_recipe)).astype(float).toarray()
test_mat = coo_matrix((test_df['rating'].values, (test_df['user_id'].values, test_df['recipe_id'].values)), shape=(num_user, num_recipe)).astype(float).toarray()

print(train_mat.shape)
print(test_mat.shape)

In [None]:
# implement your improved model and print out the RMSE
# item-item cf
# Your Code Here...

indicator_mat = (train_mat > 0).astype(float)
num_rating_items = np.sum(indicator_mat, axis=0, keepdims=True)
numer = np.matmul(indicator_mat.T, indicator_mat)  # num_item * num_item
denom = num_rating_items.T + num_rating_items - numer  # num_item * num_item
denom[denom == 0] = 1
J = numer / denom  # num_item * num_item

prediction_mat = train_mat.copy()

print(prediction_mat)

num_rating_items[num_rating_items == 0] = 1
mu_items = np.sum(train_mat, axis=0, keepdims=True) / num_rating_items  # 1 * num_item
deviation_mat = (train_mat - mu_items) * indicator_mat
for i in range(num_recipe):
    similarities = J[i, :]
    similarities[i] = -1
    N_idx = np.argpartition(similarities, -10)[-10:]
    N_sim = similarities[N_idx]
    prediction_mat[:, i] = np.sum(N_sim.reshape((1, -1)) * deviation_mat[:, N_idx], axis=1) / (np.sum(N_sim) + 1e-10)
prediction_mat += mu_items

indicator_mat = (test_mat > 0).astype(float)
test_rmse = (np.sum(((prediction_mat - test_mat) * indicator_mat) ** 2) / np.sum(indicator_mat)) ** 0.5
print(test_rmse)