In [1]:
import warnings
import numpy as np
import pandas as pd
from scipy.sparse import lil_matrix

from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

In [2]:
# Fijamos el directorio de trabajo
DIRECTORY =  "/home/spperez/masiv_model/"

In [3]:
# Datos
df = pd.read_csv(DIRECTORY + "data/interim/clean_data.csv")
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,OrderValue,Month
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01,2.55,17850.0,United Kingdom,15.3,2010-12
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01,3.39,17850.0,United Kingdom,20.34,2010-12
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01,2.75,17850.0,United Kingdom,22.0,2010-12
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01,3.39,17850.0,United Kingdom,20.34,2010-12
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01,3.39,17850.0,United Kingdom,20.34,2010-12


In [4]:
# Se selecionan solo lo necesario
data = df[["CustomerID", "StockCode", "Quantity"]].drop_duplicates().reset_index(drop=True)
data

Unnamed: 0,CustomerID,StockCode,Quantity
0,17850.0,85123A,6
1,17850.0,71053,6
2,17850.0,84406B,8
3,17850.0,84029G,6
4,17850.0,84029E,6
...,...,...,...
318548,13113.0,22061,8
318549,12680.0,22631,12
318550,12680.0,22613,12
318551,12680.0,23255,4


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 318553 entries, 0 to 318552
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   CustomerID  318553 non-null  float64
 1   StockCode   318553 non-null  object 
 2   Quantity    318553 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 7.3+ MB


In [6]:
# Creando un id unico para el producto
df_plu = pd.DataFrame({"StockCode": data["StockCode"].unique(), "ProductID": range(1, 3666)})
df_plu

Unnamed: 0,StockCode,ProductID
0,85123A,1
1,71053,2
2,84406B,3
3,84029G,4
4,84029E,5
...,...,...
3660,90214U,3661
3661,90214W,3662
3662,90214Z,3663
3663,90089,3664


In [7]:
# unimos las dos bd
data = pd.merge(data, df_plu, how="left", on="StockCode")
data

Unnamed: 0,CustomerID,StockCode,Quantity,ProductID
0,17850.0,85123A,6,1
1,17850.0,71053,6,2
2,17850.0,84406B,8,3
3,17850.0,84029G,6,4
4,17850.0,84029E,6,5
...,...,...,...,...
318548,13113.0,22061,8,1287
318549,12680.0,22631,12,38
318550,12680.0,22613,12,1837
318551,12680.0,23255,4,2895


In [8]:
data_name = df[["StockCode", "Description"]].drop_duplicates().reset_index(drop=True)
data_name

Unnamed: 0,StockCode,Description
0,85123A,WHITE HANGING HEART T-LIGHT HOLDER
1,71053,WHITE METAL LANTERN
2,84406B,CREAM CUPID HEARTS COAT HANGER
3,84029G,KNITTED UNION FLAG HOT WATER BOTTLE
4,84029E,RED WOOLLY HOTTIE WHITE HEART.
...,...,...
3892,90214W,"LETTER ""W"" BLING KEY RING"
3893,90214Z,"LETTER ""Z"" BLING KEY RING"
3894,90089,PINK CRYSTAL SKULL PHONE CHARM
3895,85123A,CREAM HANGING HEART T-LIGHT HOLDER


In [9]:
def PurchaseMatrix(datos):
    """
    Crea la matriz de incidencia,
    donde 1 indica si el cliente ha comprado el producto j,
    0 en otro caso

    Parametros
    --------
    datos: Data frame
           Data frame tiene las columnas CustomerID, ProductID y Quantity
    Retorna
    ------
    Mincidencia : ndarray
                 Matriz de incidencia.
    list_ProductID : ndarray
                 Lista con los ProductID
    """
        
    CustomerID = datos[["CustomerID"]].drop_duplicates()
    CustomerID.sort_values(inplace=True, by="CustomerID")
    CustomerID.reset_index(inplace=True)
    CustomerID['index_j'] = np.arange(CustomerID.shape[0])


    ProductID = datos[["ProductID"]].drop_duplicates()
    ProductID.sort_values(inplace=True, by="ProductID")
    ProductID.reset_index(inplace=True)
    ProductID['index_i'] = np.arange(ProductID.shape[0])

    datos = datos[["CustomerID", "ProductID"]].merge(CustomerID, on="CustomerID")
    datos = datos.merge(ProductID, on="ProductID")
    Mincidencia = lil_matrix((ProductID.shape[0], CustomerID.shape[0]))

    Mincidencia[datos['index_i'], datos['index_j']] = 1
    Mincidencia_csr = Mincidencia.tocsr()
    sumCol = np.array(Mincidencia_csr.sum(axis=0) >= 2).ravel()
    Mincidencia = Mincidencia[:, sumCol]


    list_ProductID = ProductID["ProductID"].to_numpy()

    return Mincidencia, list_ProductID

In [10]:
M_incidencia, ProductID = PurchaseMatrix(data)

In [11]:
M_incidencia

<3665x4247 sparse matrix of type '<class 'numpy.float64'>'
	with 266701 stored elements in List of Lists format>

In [12]:
ProductID

array([   1,    2,    3, ..., 3663, 3664, 3665])

In [13]:
similarities = cosine_similarity(M_incidencia)*-1
similarities

array([[-1.        , -0.22413712, -0.22833992, ..., -0.        ,
        -0.02418254, -0.        ],
       [-0.22413712, -1.        , -0.15628012, ..., -0.        ,
        -0.        , -0.        ],
       [-0.22833992, -0.15628012, -1.        , ..., -0.        ,
        -0.        , -0.        ],
       ...,
       [-0.        , -0.        , -0.        , ..., -1.        ,
        -0.        , -0.        ],
       [-0.02418254, -0.        , -0.        , ..., -0.        ,
        -1.        , -0.        ],
       [-0.        , -0.        , -0.        , ..., -0.        ,
        -0.        , -1.        ]])

In [14]:
similarities.shape

(3665, 3665)

In [15]:
cols = ["ProductID"] + [f"Recomendación_{i}" for i in range(1, 5 + 1)]
cols

['ProductID',
 'Recomendación_1',
 'Recomendación_2',
 'Recomendación_3',
 'Recomendación_4',
 'Recomendación_5']

In [16]:
ProductID = np.array(ProductID)
ProductID

array([   1,    2,    3, ..., 3663, 3664, 3665])

In [17]:
ProductID.shape

(3665,)

In [18]:
top = np.argsort(similarities)[:, :5 + 1]
recomendacion = ProductID[top].squeeze()
recomendacion = pd.DataFrame(data=recomendacion, columns=cols)
recomendacion

Unnamed: 0,ProductID,Recomendación_1,Recomendación_2,Recomendación_3,Recomendación_4,Recomendación_5
0,1,59,257,127,126,55
1,2,551,1,303,123,59
2,3,632,966,2929,795,2944
3,4,5,3377,213,8,177
4,5,4,213,176,214,212
...,...,...,...,...,...,...
3660,3663,3662,3661,3660,3659,3658
3661,3663,3662,3661,3660,3659,3658
3662,3663,3662,3661,3660,3659,3658
3663,3664,3162,3635,2783,3166,3098


In [19]:
# Se fijan los elementas de la diagonal
recomendacion["ProductID"] = ProductID
recomendacion["ProductID"].nunique()

3665

In [20]:
def favorito(datos):
    """
    Halla el Producto favorito segun la cantidad de unidades compradas

    Parametros
    ----------
    datos: Data frame
           Data frame tiene las columnas CustomerID, ProductID y Quantity
    Returns
    ------
    preferido: Data frame
               Retorna un data frame con el producto preferido
               segun las unidades


    """
    # Filtra Und diferentes de cero
    datos = datos[datos["Quantity"] != 0].reset_index(drop=True)
    # Por PartyID saca la posicion del maximo numero
    indexdatos = datos.groupby(["CustomerID"]).idxmax()
    preferido = datos.iloc[indexdatos["Quantity"]]
    preferido.drop(columns=["Quantity"], inplace=True)
    
    return preferido


In [21]:
df_favo = favorito(data)

  indexdatos = datos.groupby(["CustomerID"]).idxmax()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  preferido.drop(columns=["Quantity"], inplace=True)


In [22]:
df_favo

Unnamed: 0,CustomerID,StockCode,ProductID
34788,12346.0,23166,2550
89565,12347.0,23076,2854
21749,12348.0,21981,697
290905,12349.0,21231,824
45888,12350.0,22348,1464
...,...,...,...
66067,18280.0,22084,1205
135363,18281.0,22037,962
172683,18282.0,23187,2813
156907,18283.0,23077,2853


In [23]:
# Se le pega a cada cliente las recomendaciones segun el prod. favo
recomendacion_cliente = pd.merge(df_favo[["CustomerID","ProductID"]], recomendacion, on="ProductID", how='left')
recomendacion_cliente

Unnamed: 0,CustomerID,ProductID,Recomendación_1,Recomendación_2,Recomendación_3,Recomendación_4,Recomendación_5
0,12346.0,2550,3067,3062,2699,3137,2144
1,12347.0,2854,2853,2855,2865,3164,68
2,12348.0,697,701,696,204,698,700
3,12349.0,824,208,209,822,1292,106
4,12350.0,1464,1095,1977,70,1216,450
...,...,...,...,...,...,...,...
4333,18280.0,1205,1037,96,47,530,168
4334,18281.0,962,961,310,988,546,1640
4335,18282.0,2813,2814,2404,2599,2826,3318
4336,18283.0,2853,2855,2854,3164,964,106


In [24]:
recomendacion_cliente[recomendacion_cliente["ProductID"] == 1975]

Unnamed: 0,CustomerID,ProductID,Recomendación_1,Recomendación_2,Recomendación_3,Recomendación_4,Recomendación_5
712,13295.0,1975,2106,1976,2233,673,1882
4337,18287.0,1975,2106,1976,2233,673,1882


In [25]:
# Diccionario de nombres de productos 
df_dic = pd.merge(data_name, df_plu, on="StockCode", how="inner").drop_duplicates()
df_dic.head(10)

Unnamed: 0,StockCode,Description,ProductID
0,85123A,WHITE HANGING HEART T-LIGHT HOLDER,1
1,85123A,CREAM HANGING HEART T-LIGHT HOLDER,1
2,71053,WHITE METAL LANTERN,2
3,71053,WHITE MOROCCAN METAL LANTERN,2
4,84406B,CREAM CUPID HEARTS COAT HANGER,3
5,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,4
6,84029E,RED WOOLLY HOTTIE WHITE HEART.,5
7,22752,SET 7 BABUSHKA NESTING BOXES,6
8,21730,GLASS STAR FROSTED T-LIGHT HOLDER,7
9,22633,HAND WARMER UNION JACK,8


In [26]:
# Se retiran los nombres duplicados
for i in range(1, df_dic.shape[0]):
    con_1 = df_dic["StockCode"][i] == df_dic["StockCode"][i-1]
    con_2 = df_dic["ProductID"][i] == df_dic["ProductID"][i-1]
    if con_1 & con_2:
        df_dic["Description"][i] = df_dic["Description"][i-1]
    
    i += 1

df_dic.drop_duplicates(inplace=True)
df_dic

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dic["Description"][i] = df_dic["Description"][i-1]


Unnamed: 0,StockCode,Description,ProductID
0,85123A,WHITE HANGING HEART T-LIGHT HOLDER,1
2,71053,WHITE METAL LANTERN,2
4,84406B,CREAM CUPID HEARTS COAT HANGER,3
5,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,4
6,84029E,RED WOOLLY HOTTIE WHITE HEART.,5
...,...,...,...
3892,90214U,"LETTER ""U"" BLING KEY RING",3661
3893,90214W,"LETTER ""W"" BLING KEY RING",3662
3894,90214Z,"LETTER ""Z"" BLING KEY RING",3663
3895,90089,PINK CRYSTAL SKULL PHONE CHARM,3664


In [27]:
df_dic.to_csv(DIRECTORY + "data/interim/product_names.csv", index=False)

In [28]:
recomendacion_cliente.to_csv(DIRECTORY + "data/processed/recomendation.csv", index=False)

In [29]:
recomendacion_cliente

Unnamed: 0,CustomerID,ProductID,Recomendación_1,Recomendación_2,Recomendación_3,Recomendación_4,Recomendación_5
0,12346.0,2550,3067,3062,2699,3137,2144
1,12347.0,2854,2853,2855,2865,3164,68
2,12348.0,697,701,696,204,698,700
3,12349.0,824,208,209,822,1292,106
4,12350.0,1464,1095,1977,70,1216,450
...,...,...,...,...,...,...,...
4333,18280.0,1205,1037,96,47,530,168
4334,18281.0,962,961,310,988,546,1640
4335,18282.0,2813,2814,2404,2599,2826,3318
4336,18283.0,2853,2855,2854,3164,964,106


In [30]:
for i in range(0, recomendacion_cliente.shape[0]):
    for j in range(1, recomendacion_cliente.shape[1]):
        recomendacion_cliente.iloc[i,j] = ((df_dic.where(df_dic["ProductID"]==recomendacion_cliente.iloc[i,j])).dropna(axis=0, how='all')).iloc[0,1]
        j += 1
   
    i += 1
        


In [31]:
recomendacion_cliente

Unnamed: 0,CustomerID,ProductID,Recomendación_1,Recomendación_2,Recomendación_3,Recomendación_4,Recomendación_5
0,12346.0,MEDIUM CERAMIC TOP STORAGE JAR,LARGE CERAMIC TOP STORAGE JAR,SMALL CERAMIC TOP STORAGE JAR,SET OF 4 PANTRY JELLY MOULDS,SET OF 60 PANTRY DESIGN CAKE CASES,SET OF 6 SPICE TINS PANTRY DESIGN
1,12347.0,ICE CREAM SUNDAE LIP GLOSS,DOUGHNUT LIP GLOSS,ICE CREAM PEN LIP GLOSS,KNICKERBOCKERGLORY MAGNET ASSORTED,BUBBLEGUM RING ASSORTED,STRAWBERRY LUNCH BOX WITH CUTLERY
2,12348.0,PACK OF 12 WOODLAND TISSUES,PACK OF 12 SPACEBOY TISSUES,PACK OF 12 SUKI TISSUES,PACK OF 12 RED RETROSPOT TISSUES,PACK OF 12 SKULL TISSUES,PACK OF 12 CIRCUS PARADE TISSUES
3,12349.0,SWEETHEART CERAMIC TRINKET BOX,STRAWBERRY CERAMIC TRINKET BOX,PINK DOUGHNUT TRINKET POT,MINI CAKE STAND WITH HANGING CAKES,STRAWBERRY FAIRY CAKE TEAPOT,CERAMIC STRAWBERRY CAKE MONEY BANK
4,12350.0,TEA BAG PLATE RED RETROSPOT,RED SPOTTY BISCUIT TIN,RED RETROSPOT SMALL MILK JUG,PACK OF 72 RETROSPOT CAKE CASES,RED RETROSPOT BUTTER DISH,RED RETROSPOT OVEN GLOVE
...,...,...,...,...,...,...,...
4333,18280.0,PAPER CHAIN KIT EMPIRE,WOODEN UNION JACK BUNTING,PAPER CHAIN KIT RETROSPOT,PAPER CHAIN KIT 50'S CHRISTMAS,VINTAGE UNION JACK BUNTING,PAPER CHAIN KIT VINTAGE CHRISTMAS
4334,18281.0,ROBOT BIRTHDAY CARD,SPACEBOY BIRTHDAY CARD,CARD CIRCUS PARADE,TEA PARTY BIRTHDAY CARD,CARD DOLLY GIRL,CARD SUKI BIRTHDAY
4335,18282.0,FRENCH STYLE STORAGE JAR BONBONS,FRENCH STYLE STORAGE JAR CAFE,RED POLKADOT BEAKER,TEATIME PUSH DOWN RUBBER,RED EGG SPOON,PURPLE DRESS JEWELLERY STAND
4336,18283.0,DOUGHNUT LIP GLOSS,ICE CREAM PEN LIP GLOSS,ICE CREAM SUNDAE LIP GLOSS,BUBBLEGUM RING ASSORTED,WORLD WAR 2 GLIDERS ASSTD DESIGNS,CERAMIC STRAWBERRY CAKE MONEY BANK


In [32]:
recomendacion_cliente.to_csv(DIRECTORY + "data/processed/recomendation_name.csv", index=False)