## Imports

In [1]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

## Filtering and Combining Desired Reviews

In [2]:
# Read both inputs up front:
#   - the combined reviews file (all star ratings)
#   - the 3-star reviews that the sentiment analysis classified as positive
combined_reviews_df = pd.read_csv('processed_data/combined_reviews.csv', encoding='utf-8', engine='python')
positive_reviews_df = pd.read_csv('processed_data/positive_reviews.csv', encoding='utf-8', engine='python')

# Rows of the combined file rated 4 or 5 stars
is_high_rating = combined_reviews_df['rating'].isin([4, 5])

# Stack the 4/5-star rows on top of the positive 3-star rows under a fresh 0..n-1 index
frequent_items_reviews = pd.concat(
    [combined_reviews_df.loc[is_high_rating], positive_reviews_df],
    ignore_index=True,
)

print(f"Total reviews to be used in frequent items: {len(frequent_items_reviews)}")

Total reviews to be used in frequent items: 703118


## Frequent Items

In [3]:
# Keep only authors who reviewed at least MIN_PRODUCTS_PER_AUTHOR *distinct* products
# (repeat reviews of the same product count once).
MIN_PRODUCTS_PER_AUTHOR = 10

# Count distinct products once per author, then map that count back onto every review row.
# This replaces a Python lambda that was invoked once per author group, which is slow
# with hundreds of thousands of reviews.
products_per_author = frequent_items_reviews.groupby('author_id')['product_id'].nunique()
is_active_author = (
    frequent_items_reviews['author_id'].map(products_per_author) >= MIN_PRODUCTS_PER_AUTHOR
)
filtered_df = frequent_items_reviews[is_active_author]

# Summarise what survived the filter
print(f"Number of rows in the filtered dataframe: {len(filtered_df)}")
print(f"Number of unique authors in the filtered dataframe: {filtered_df['author_id'].nunique()}")

Number of rows in the filtered dataframe: 97626
Number of unique authors in the filtered dataframe: 4714


In [4]:
# Collapse the filtered reviews to one row per author that lists every product they reviewed
selected_columns = filtered_df[['author_id', 'product_id']]

# str.join needs strings, so cast the product ids first; each author's ids are then
# concatenated into a single space-separated string.
combined_reviews = (
    selected_columns
    .astype({'product_id': str})
    .groupby('author_id')['product_id']
    .agg(' '.join)
    .reset_index()
)

print(combined_reviews)

            author_id                                         product_id
0          1000235057  P420652 P421275 P297524 P456412 P232903 P37584...
1         10003868106  P420652 P480612 P375849 P375853 P472468 P37585...
2          1001087549  P309308 P423688 P270594 P433520 P232915 P45621...
3         10015807972  P420652 P427419 P454380 P480612 P427406 P42741...
4         10021044780  P309308 P442840 P466123 P500633 P418346 P47641...
...               ...                                                ...
4709        998162812  P269122 P429952 P471237 P443833 P422022 P37970...
4710        998179876  P309308 P429952 P270594 P442840 P466123 P44779...
4711        998853649  P456398 P461159 P461933 P456213 P429242 P41561...
4712       9990263118  P430337 P427406 P431180 P440307 P410400 P45567...
4713  orderGen1698648  P443833 P428095 P455926 P502656 P446423 P45536...

[4714 rows x 2 columns]


In [5]:
# Minimum fraction of authors (transactions) that must contain an itemset for it to be "frequent"
MIN_SUPPORT = 0.05

# Split each author's space-separated product string back into a list of product ids:
# one transaction per author.
transactions = combined_reviews['product_id'].str.split().tolist()

# One-hot encode the transactions: one row per author, one boolean column per product id
te = TransactionEncoder()
te_array = te.fit(transactions).transform(transactions)
df_encoded = pd.DataFrame(te_array, columns=te.columns_)

# Mine every itemset (of any size) whose support reaches MIN_SUPPORT
frequent_itemsets = apriori(df_encoded, min_support=MIN_SUPPORT, use_colnames=True)

# Itemsets made of exactly one product (not pairs or larger sets)
frequent_single_items = frequent_itemsets[frequent_itemsets['itemsets'].apply(len) == 1]
# Backward-compatible alias: despite the name, this has always held single-item itemsets.
frequent_pairs = frequent_single_items

# Generate association rules from all frequent itemsets.
# - A rule's support is the support of an itemset that already passed MIN_SUPPORT, so any
#   lower threshold (the previous 0.03) can never remove a rule; reusing MIN_SUPPORT keeps
#   the result identical while making the effective cut-off explicit.
# - num_itemsets is the number of transactions in the encoded input, not the number of
#   frequent itemsets that were mined.
rules = association_rules(
    frequent_itemsets,
    metric="support",
    min_threshold=MIN_SUPPORT,
    num_itemsets=len(df_encoded),
)

print(frequent_single_items)

     support   itemsets
0   0.075732  (P232915)
1   0.052397  (P269122)
2   0.258591  (P270594)
3   0.181799  (P309308)
4   0.124947  (P400259)
..       ...        ...
83  0.054731  (P505020)
84  0.093339  (P505023)
85  0.064701  (P505024)
86  0.099067  (P505031)
87  0.068519  (P505054)

[88 rows x 2 columns]


In [6]:
# Keep only the rules whose antecedent is a single product
rules_filtered = rules[rules['antecedents'].apply(len) == 1].copy()

rules_display = rules_filtered[["antecedents", "consequents", "support", "confidence", "lift"]].copy()

# Render each frozenset as a comma-separated string.
# The items are sorted because a frozenset of strings has no stable iteration order
# (string hashing is randomised per interpreter run), so an unsorted join could print
# "A, B" in one kernel session and "B, A" in the next.
for side in ("antecedents", "consequents"):
    rules_display[side] = rules_display[side].apply(lambda items: ', '.join(sorted(items)))

# Show the highest-confidence rules first
rules_display = rules_display.sort_values(by="confidence", ascending=False)
print(rules_display.to_string(index=False))

antecedents      consequents  support  confidence      lift
    P483076          P482551 0.089945    1.000000 11.117925
    P482551          P483076 0.089945    1.000000 11.117925
    P500288          P500633 0.056216    0.716216  2.577285
    P501254          P500633 0.078490    0.704762  2.536067
    P501760          P500633 0.081672    0.692446  2.491749
    P471043          P476414 0.056428    0.685567  3.254545
    P500777          P500633 0.104370    0.678621  2.441998
    P482676          P500633 0.081035    0.673721  2.424368
    P500857          P500633 0.053246    0.662269  2.383158
    P479645          P500633 0.059610    0.658080  2.368082
    P477157          P476414 0.054731    0.653165  3.100723
    P442838          P476414 0.051973    0.634715  3.013139
    P474832          P476414 0.057276    0.633803  3.008808
    P505031          P500633 0.062155    0.627409  2.257715
    P500857          P270594 0.050276    0.625330  2.418216
    P455236          P476414 0.055155   

In [7]:
# Count how many distinct labels appear on either side of the displayed rules.
# NOTE(review): a multi-item consequent (e.g. "P503936, P270594") is one label of its own
# here, so this is not necessarily the number of distinct products — confirm that is intended.
rule_sides = [rules_display[side] for side in ("antecedents", "consequents")]
unique_values = pd.concat(rule_sides).nunique()
unique_values

62

## Testing

In [8]:
# Look up the rules whose antecedent is exactly product P447596.
# rules_display only holds single-product antecedents rendered as plain strings, so an
# equality test is the right check; the previous substring test (`in`) would also match
# any longer id that merely contains the queried id.
filtered_rules = rules_display[rules_display['antecedents'] == 'P447596']
print(filtered_rules)

Empty DataFrame
Columns: [antecedents, consequents, support, confidence, lift]
Index: []


In [9]:
# Look up the rules whose antecedent is exactly product P500633.
# rules_display only holds single-product antecedents rendered as plain strings, so an
# equality test is the right check; the previous substring test (`in`) would also match
# any longer id that merely contains the queried id.
filtered_rules = rules_display[rules_display['antecedents'] == 'P500633']
print(filtered_rules)

    antecedents       consequents   support  confidence      lift
34      P500633           P270594  0.145524    0.523664  2.025064
184     P500633           P503936  0.125371    0.451145  2.154709
130     P500633           P476414  0.111795    0.402290  1.909764
174     P500633           P500777  0.104370    0.375573  2.441998
152     P500633           P479841  0.103734    0.373282  2.185905
78      P500633           P423688  0.100552    0.361832  1.394666
144     P500633           P479327  0.082520    0.296947  1.952310
180     P500633           P501760  0.081672    0.293893  2.491749
166     P500633           P482676  0.081035    0.291603  2.424368
273     P500633  P503936, P270594  0.078702    0.283206  2.362891
179     P500633           P501254  0.078490    0.282443  2.536067
56      P500633           P400259  0.072338    0.260305  2.083327
225     P500633  P270594, P476414  0.068095    0.245038  2.251676
60      P500633           P411365  0.067883    0.244275  2.205961
92      P5