In [1]:
import numpy as np 
import pandas as pd 
import random
from timeit import default_timer as timer

In [2]:
dtype_list = {'ind_cco_fin_ult1': 'uint8', 'ind_deme_fin_ult1': 'uint8',
            'ind_aval_fin_ult1': 'uint8', 'ind_valo_fin_ult1': 'uint8',
            'ind_reca_fin_ult1': 'uint8', 'ind_ctju_fin_ult1': 'uint8',
            'ind_cder_fin_ult1': 'uint8', 'ind_plan_fin_ult1': 'uint8',
            'ind_fond_fin_ult1': 'uint8', 'ind_hip_fin_ult1': 'uint8',
            'ind_pres_fin_ult1': 'uint8', 'ind_nomina_ult1': 'Int64', 
            'ind_cno_fin_ult1': 'uint8', 'ind_ctpp_fin_ult1': 'uint8',
            'ind_ahor_fin_ult1': 'uint8', 'ind_dela_fin_ult1': 'uint8',
            'ind_ecue_fin_ult1': 'uint8', 'ind_nom_pens_ult1': 'Int64',
            'ind_recibo_ult1': 'uint8', 'ind_deco_fin_ult1': 'uint8',
            'ind_tjcr_fin_ult1': 'uint8', 'ind_ctop_fin_ult1': 'uint8',
            'ind_viv_fin_ult1': 'uint8', 'ind_ctma_fin_ult1': 'uint8',
            'ncodpers' : 'uint32'} 

name_col = ['ncodpers', 'fecha_dato', 'ind_ahor_fin_ult1','ind_aval_fin_ult1','ind_cco_fin_ult1',
               'ind_cder_fin_ult1','ind_cno_fin_ult1','ind_ctju_fin_ult1',
               'ind_ctma_fin_ult1','ind_ctop_fin_ult1','ind_ctpp_fin_ult1',
               'ind_deco_fin_ult1','ind_deme_fin_ult1','ind_dela_fin_ult1',
               'ind_ecue_fin_ult1','ind_fond_fin_ult1','ind_hip_fin_ult1',
               'ind_plan_fin_ult1','ind_pres_fin_ult1','ind_reca_fin_ult1',
               'ind_tjcr_fin_ult1','ind_valo_fin_ult1','ind_viv_fin_ult1',
               'ind_nomina_ult1','ind_nom_pens_ult1','ind_recibo_ult1']

In [3]:
start = timer()
# read a large csv file using chunks with specified dtype and usecols parameters to optimize memory usage
reader = pd.read_csv('./df_train_small.csv.zip', chunksize=1e6,
                     dtype=dtype_list, usecols=name_col)

# concatenate the data from all the chunks into a single dataframe
df_train = pd.concat([chunk for chunk in reader])
df_train.shape

(13647309, 26)

In [4]:
df_train.columns

Index(['fecha_dato', 'ncodpers', 'ind_ahor_fin_ult1', 'ind_aval_fin_ult1',
       'ind_cco_fin_ult1', 'ind_cder_fin_ult1', 'ind_cno_fin_ult1',
       'ind_ctju_fin_ult1', 'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1',
       'ind_ctpp_fin_ult1', 'ind_deco_fin_ult1', 'ind_deme_fin_ult1',
       'ind_dela_fin_ult1', 'ind_ecue_fin_ult1', 'ind_fond_fin_ult1',
       'ind_hip_fin_ult1', 'ind_plan_fin_ult1', 'ind_pres_fin_ult1',
       'ind_reca_fin_ult1', 'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1',
       'ind_viv_fin_ult1', 'ind_nomina_ult1', 'ind_nom_pens_ult1',
       'ind_recibo_ult1'],
      dtype='object')

In [5]:
df_train.fecha_dato.unique()

array(['2015-01-28', '2015-02-28', '2015-03-28', '2015-04-28',
       '2015-05-28', '2015-06-28', '2015-07-28', '2015-08-28',
       '2015-09-28', '2015-10-28', '2015-11-28', '2015-12-28',
       '2016-01-28', '2016-02-28', '2016-03-28', '2016-04-28',
       '2016-05-28'], dtype=object)

In [6]:
df_ui = df_train.set_index("ncodpers").drop(columns=["fecha_dato"])


In [7]:
df_ui = df_train.set_index("ncodpers").drop(columns=["fecha_dato"]).dropna()
df_ui.shape


(13631246, 24)

In [8]:
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors

def useritem_knn(user_id, df, n_neighbors=20):
    """
    Recommends products for a user based on similar users using k-NN with cosine similarity.

    Parameters:
    - user_id (int): The ID of the user to recommend products for.
    - df (DataFrame): User-item interaction matrix (users as rows, products as columns).
    - n_neighbors (int): Number of similar users to consider.

    Returns:
    - dict: Dictionary with product probabilities (values between 0 and 1).
    """
    # Ensure no missing values
    df = df.fillna(0)

    # Convert DataFrame index to list for safe lookup
    user_list = df.index.tolist()
    
    if user_id not in user_list:
        raise ValueError("User ID not found in the dataset")

    # Fit k-NN model on user-item matrix
    knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=min(n_neighbors + 1, len(df)))
    knn.fit(df)

    # Ensure correct input format for kneighbors() to avoid feature name warning
    user_vector = df.loc[[user_id]]  # Keep it as a DataFrame to maintain feature names
    distances, indices = knn.kneighbors(user_vector)

    # Get similar users (excluding the user itself)
    similar_users = indices.flatten()[1:]  # Skip the first one (itself)

    if len(similar_users) == 0:
        print(f"⚠️ No similar users found for User {user_id}!")
        return {col: 0 for col in df.columns}  # No similar users, return zero probabilities

    # Compute mean ownership probability for each product
    return df.iloc[similar_users].mean().to_dict()



# 🔹 Example usage
recommendations = useritem_knn(1061608, df=df_ui, n_neighbors=20)
print(recommendations)


{'ind_ahor_fin_ult1': 0.0, 'ind_aval_fin_ult1': 0.0, 'ind_cco_fin_ult1': 0.9662921348314607, 'ind_cder_fin_ult1': 0.0, 'ind_cno_fin_ult1': 0.2443820224719101, 'ind_ctju_fin_ult1': 0.0, 'ind_ctma_fin_ult1': 0.0, 'ind_ctop_fin_ult1': 0.0056179775280898875, 'ind_ctpp_fin_ult1': 0.0056179775280898875, 'ind_deco_fin_ult1': 0.0, 'ind_deme_fin_ult1': 0.0, 'ind_dela_fin_ult1': 0.011235955056179775, 'ind_ecue_fin_ult1': 0.1348314606741573, 'ind_fond_fin_ult1': 0.0028089887640449437, 'ind_hip_fin_ult1': 0.0, 'ind_plan_fin_ult1': 0.0028089887640449437, 'ind_pres_fin_ult1': 0.0, 'ind_reca_fin_ult1': 0.0056179775280898875, 'ind_tjcr_fin_ult1': 0.0056179775280898875, 'ind_valo_fin_ult1': 0.011235955056179775, 'ind_viv_fin_ult1': 0.0, 'ind_nomina_ult1': 0.17696629213483145, 'ind_nom_pens_ult1': 0.1797752808988764, 'ind_recibo_ult1': 0.19101123595505617}


In [9]:
recommendations

{'ind_ahor_fin_ult1': 0.0,
 'ind_aval_fin_ult1': 0.0,
 'ind_cco_fin_ult1': 0.9662921348314607,
 'ind_cder_fin_ult1': 0.0,
 'ind_cno_fin_ult1': 0.2443820224719101,
 'ind_ctju_fin_ult1': 0.0,
 'ind_ctma_fin_ult1': 0.0,
 'ind_ctop_fin_ult1': 0.0056179775280898875,
 'ind_ctpp_fin_ult1': 0.0056179775280898875,
 'ind_deco_fin_ult1': 0.0,
 'ind_deme_fin_ult1': 0.0,
 'ind_dela_fin_ult1': 0.011235955056179775,
 'ind_ecue_fin_ult1': 0.1348314606741573,
 'ind_fond_fin_ult1': 0.0028089887640449437,
 'ind_hip_fin_ult1': 0.0,
 'ind_plan_fin_ult1': 0.0028089887640449437,
 'ind_pres_fin_ult1': 0.0,
 'ind_reca_fin_ult1': 0.0056179775280898875,
 'ind_tjcr_fin_ult1': 0.0056179775280898875,
 'ind_valo_fin_ult1': 0.011235955056179775,
 'ind_viv_fin_ult1': 0.0,
 'ind_nomina_ult1': 0.17696629213483145,
 'ind_nom_pens_ult1': 0.1797752808988764,
 'ind_recibo_ult1': 0.19101123595505617}