In [98]:
import warnings

import pandas as pd
from sklearn.metrics import mean_squared_error
from surprise import Dataset, Reader, KNNWithMeans

# Silence every warning for the rest of the notebook.
# NOTE(review): this also hides pandas deprecation/FutureWarnings — consider narrowing the filter.
warnings.filterwarnings("ignore")

Testing collaborative filtering with the Surprise library, following this tutorial:
https://realpython.com/build-recommendation-engine-collaborative-filtering/

In [14]:
# Load the raw training table; later cells use its customer_id and product_id columns.
data = pd.read_csv("data/train.csv")

In [15]:
# Count the rows of each (customer, product) pair, then order the result by
# customer and, within a customer, by that count.
pair_columns = ["customer_id", "product_id"]
cp_affection = (
    data[pair_columns]
    .groupby(by=pair_columns, as_index=False)
    .value_counts()
    .sort_values(by=["customer_id", "count"])
)

# Sanity check: number of pairs, then distinct customers and distinct products.
print(cp_affection.shape)
print(cp_affection["customer_id"].unique().size, cp_affection["product_id"].unique().size)

# Give the count column a descriptive name before it is used as a rating signal.
cp_affection = cp_affection.rename(columns={"count": "times_bought"})
cp_affection.head()

(47842, 3)
2000 100


Unnamed: 0,customer_id,product_id,times_bought
1,0,10,1
3,0,28,1
5,0,33,1
6,0,34,1
10,0,61,1


In [144]:
# Min-max scale each customer's purchase counts onto [0, scaling_factor] so that
# the values can be used as ratings on a common scale across customers.
#
# The scaling is computed with grouped transforms instead of concatenating one
# frame per customer inside a loop: that avoids the quadratic cost of repeated
# pd.concat and keeps the id columns at their original dtype.
scaling_factor = 10

counts_by_customer = cp_affection.groupby("customer_id")["times_bought"]
customer_min = counts_by_customer.transform("min")
customer_range = counts_by_customer.transform("max") - customer_min

# A customer whose counts are all identical has a zero range; plain min-max
# scaling would produce 0/0 = NaN ratings there. Those rows are mapped to 0.0,
# the same value every customer's minimum count receives.
times_bought_scaled = (
    ((cp_affection["times_bought"] - customer_min) / customer_range * scaling_factor)
    .where(customer_range > 0, 0.0)
    .round(2)
)

# Keep only the (user, item, rating) columns, with a fresh 0..n-1 index.
affection_scaled = (
    cp_affection[["customer_id", "product_id"]]
    .assign(times_bought_scaled=times_bought_scaled)
    .reset_index(drop=True)
)
print(affection_scaled.shape)
affection_scaled.head()

(47842, 3)


Unnamed: 0,customer_id,product_id,times_bought_scaled
0,0,10,0.0
1,0,28,0.0
2,0,33,0.0
3,0,34,0.0
4,0,61,0.0


In [145]:
# Rating scale for Surprise: matches the [0, scaling_factor] range of the scaled counts.
reader = Reader(rating_scale=(0, scaling_factor))

In [146]:
# Hold out a reproducible 20% sample for evaluation; every row that was not
# sampled goes into the training split.
df_test = affection_scaled.sample(frac=0.20, random_state=1)
is_held_out = affection_scaled.index.isin(df_test.index)
df_training = affection_scaled.loc[~is_held_out]
print(df_test.shape, df_training.shape)

(9568, 3) (38274, 3)


In [147]:
# Wrap the training split for Surprise. df_training's columns are, in order,
# customer_id, product_id, times_bought_scaled — i.e. (user, item, rating).
rec_data = Dataset.load_from_df(df_training, reader)

In [148]:
# Similarity configuration: cosine similarity computed between items
# (user_based=False) rather than between users.
sim_options = dict(name="cosine", user_based=False)
rec_model = KNNWithMeans(sim_options=sim_options)

In [149]:
# Build the trainset from all of rec_data; evaluation uses the separate df_test split.
trainingSet = rec_data.build_full_trainset()

In [150]:
# Fit the item-based KNNWithMeans model on the training set.
rec_model.fit(trainingSet)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x1423c6a30>

In [151]:
def _estimate_test_rating(pair):
    """Return the model's estimated rating for one (customer_id, product_id) row."""
    return rec_model.predict(pair.customer_id, pair.product_id).est


# Estimated rating for every held-out pair, indexed like df_test.
predictions_test = df_test[["customer_id", "product_id"]].apply(_estimate_test_rating, axis=1)

In [152]:
# Mean squared error of the model's estimates against the held-out scaled
# purchase counts (both are on the 0..scaling_factor rating scale).
# mean_squared_error is imported in the notebook's top import cell.
y_true = df_test.times_bought_scaled
y_pred = predictions_test
mean_squared_error(y_true, y_pred)

6.84421467054881

In [153]:
# Average held-out target, as a reference point for the error above.
y_true.mean()

2.0018237876254177

In [154]:
# Average predicted rating on the held-out pairs, to compare with the target mean.
y_pred.mean()

2.0472046607229877

In [155]:
CUSTOMERS = 2000
PRODUCTS = 100

# One row per (customer, product) combination, with customer_id varying slowest:
# (0, 0), (0, 1), ..., (0, PRODUCTS - 1), (1, 0), ...
skeleton_cp = pd.MultiIndex.from_product(
    [range(CUSTOMERS), range(PRODUCTS)],
    names=["customer_id", "product_id"],
).to_frame(index=False)
skeleton_cp.shape

(200000, 2)

In [158]:
# Estimated rating for every (customer, product) pair in the skeleton frame.
skeleton_cp["ratings"] = [
    rec_model.predict(pair.customer_id, pair.product_id).est
    for pair in skeleton_cp.itertuples(index=False)
]

In [159]:
# Preview the first few predicted ratings.
skeleton_cp.head()

Unnamed: 0,customer_id,product_id,ratings
0,0,0,1.538011
1,0,1,1.330337
2,0,2,1.639669
3,0,3,1.535176
4,0,4,1.890543
