### Original Notebook from cdeotte can be found [here](https://www.kaggle.com/code/cdeotte/customers-who-bought-this-frequently-buy-this).

### [Here's](https://www.kaggle.com/code/cdeotte/recommend-items-purchased-together-0-021/comments#1703595) where cdeotte gives the code for running it on the entire dataset.  
### And [here's](https://www.kaggle.com/code/cdeotte/recommend-items-purchased-together-0-021/comments#1732941) where he mentions that it could probably be rewritten to be a lot quicker.    

In [None]:
import pandas as pd
import cudf
import pickle as pkl

In [None]:
%%time
# Transactions: keep only the two id columns and collapse repeat purchases so
# each (customer_id, article_id) combination appears once.
t = cudf.read_csv(
    "../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv",
    usecols=["customer_id", "article_id"],
).drop_duplicates()
t["article_id"] = t["article_id"].astype("int32")

# Customers: swap each customer_id for the row position of that id in
# customers.csv, stored as int32.
customers = cudf.read_csv(
    "../input/h-and-m-personalized-fashion-recommendations/customers.csv",
    usecols=["customer_id"],
)
customer_position = customers.reset_index().set_index("customer_id")["index"]
t["customer_id"] = t["customer_id"].map(customer_position).astype("int32")
# the lookup frames are only needed for the remapping above
del customers, customer_position

# Independent copy of the transactions whose article column is called
# "pair_id", used as the right-hand side of the customer self-join.
pairs_t = t.rename(columns={"article_id": "pair_id"})

# Distinct article ids present in the transactions.
unique_articles = t["article_id"].unique()

In [None]:
%%time

# For every article, find the article most often bought by the same customers.
# Articles are processed `batch_size` at a time; each batch is joined against
# the full transaction table on customer_id (presumably to keep the size of
# the joined frame manageable -- the join grows with the batch).
batch_size = 5000
n_articles = len(unique_articles)

batch_pairs_dfs = []

for i in range(0, n_articles, batch_size):
    # clamp the upper bound so the final, shorter batch is reported accurately
    batch_end = min(i + batch_size, n_articles)
    print(f"processing article #{i:,} to #{batch_end:,}")

    # take batch of articles
    batch_articles = unique_articles[i:batch_end]

    # get all pairs for those articles (other articles those customers bought)
    batch_t = t[t["article_id"].isin(batch_articles)]
    batch_pairs_df = batch_t.merge(pairs_t, on="customer_id")

    # delete same-article pairs; a boolean mask selects the same rows as
    # dropping by index label, without depending on unique row labels
    is_other_article = batch_pairs_df["article_id"] != batch_pairs_df["pair_id"]
    batch_pairs_df = batch_pairs_df[is_other_article]

    # delete single customer articles (articles whose remaining pairs all come
    # from one customer)
    customers_per_article = batch_pairs_df.groupby("article_id")["customer_id"].nunique()
    c1s = customers_per_article[customers_per_article == 1].index
    batch_pairs_df = batch_pairs_df[~batch_pairs_df["article_id"].isin(c1s)]

    # get sorted counts of article-pair occurrences
    batch_pairs_df = batch_pairs_df.groupby(["article_id", "pair_id"])[["customer_id"]].count()
    batch_pairs_df.columns = ["pair_counts"]
    batch_pairs_df = batch_pairs_df.reset_index()
    # pair_id is the last sort key so that pairs with equal counts are ordered
    # deterministically and the chosen top pair is reproducible between runs
    batch_pairs_df = batch_pairs_df.sort_values(
        ["article_id", "pair_counts", "pair_id"], ascending=False
    )

    # get top one for each article (need pandas)
    batch_pairs_df = batch_pairs_df.to_pandas().groupby("article_id").head(1)
    # back to cudf, indexed by article_id with the single column pair_id
    batch_pairs_df = cudf.DataFrame(batch_pairs_df.set_index("article_id")[["pair_id"]])
    batch_pairs_dfs.append(batch_pairs_df)

# one row per article that has a top pair, across all batches
all_article_pairs_df = cudf.concat(batch_pairs_dfs)
print(len(all_article_pairs_df))

In [None]:
%%time
# Bring the article_id -> pair_id lookup to host memory as a plain dict
# (keys are the frame's index, values the "pair_id" column).
article_pairs_dict = all_article_pairs_df.to_pandas()["pair_id"].to_dict()

# Persist the lookup as a pickle in the working directory.
with open("article_pairs_dict.pkl", "wb") as pickle_file:
    pickle_file.write(pkl.dumps(article_pairs_dict))