#### Imports

In [10]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

## Filtering and Combining Desired Reviews

In [11]:
# Load the combined reviews data from the saved file
combined_reviews_df = pd.read_csv('processed_data/combined_reviews.csv', encoding='utf-8', engine='python')

# Filter the reviews with 4 and 5 stars
frequent_items_reviews = combined_reviews_df[combined_reviews_df['rating'].isin([4, 5])]

# Load the positive reviews from the sentiment analysis on the 3-star reviews
positive_reviews_df = pd.read_csv('processed_data/3_star_positive_reviews.csv', encoding='utf-8', engine='python')

# Combine the filtered 4 and 5-star reviews with the 3-star positive reviews
frequent_items_reviews = pd.concat([frequent_items_reviews, positive_reviews_df], ignore_index=True)

print(f"Total reviews to be used in frequent items: {len(frequent_items_reviews)}")

Total reviews to be used in frequent items: 693706


## Frequent Items

In [13]:
# Selecting users with at least 10 reviews
filtered_df = frequent_items_reviews.groupby('author_id').filter(lambda x: x['product_id'].nunique() > 9)

# Display the filtered DataFrame
print(f"Number of rows in the filtered dataframe: {len(filtered_df)}")
print(f"Number of unique authors in the filtered dataframe: {len(set(filtered_df['author_id']))}")
#set(filtered_df['author_id'])

Number of rows in the filtered dataframe: 95902
Number of unique authors in the filtered dataframe: 4613


In [14]:
# Combining all the products reviewed for each person in the dataset
selected_columns = filtered_df[['author_id', 'product_id']]

# Convert product_id to string before applying 'join'
combined_reviews = selected_columns.groupby('author_id')['product_id'].apply(lambda x: ' '.join(x.astype(str))).reset_index()

print(combined_reviews)

            author_id                                         product_id
0          1000235057  P420652 P421275 P297524 P456412 P232903 P37584...
1         10003868106  P420652 P480612 P375849 P375853 P472468 P37585...
2          1001087549  P309308 P423688 P270594 P433520 P232915 P45621...
3         10015807972  P420652 P427419 P454380 P480612 P427406 P42741...
4         10021044780  P309308 P442840 P466123 P500633 P418346 P47641...
...               ...                                                ...
4608        998162812  P269122 P429952 P471237 P443833 P422022 P37970...
4609        998179876  P309308 P429952 P270594 P442840 P466123 P44779...
4610        998853649  P456398 P461159 P461933 P456213 P429242 P41561...
4611       9990263118  P430337 P427406 P431180 P440307 P410400 P45567...
4612  orderGen1698648  P443833 P428095 P455926 P502656 P446423 P45536...

[4613 rows x 2 columns]


In [15]:
# Convert the 'reviews' into a list of transactions
transactions = combined_reviews['product_id'].str.split().tolist()

# Create a DataFrame for one-hot encoding
# Flatten all unique items (reviews) and create a unique item list
te = TransactionEncoder()
te_array = te.fit(transactions).transform(transactions)
df_encoded = pd.DataFrame(te_array, columns=te.columns_)

# Apply the Apriori algorithm
frequent_itemsets = apriori(df_encoded, min_support=0.05, use_colnames=True)

# Focus on frequent itemsets containing only a single product (not pairs or larger sets)
frequent_pairs = frequent_itemsets[frequent_itemsets['itemsets'].apply(len) == 1]

# Generate association rules, with support as minimum of 3%
rules = association_rules(frequent_itemsets, metric="support", min_threshold=0.03, num_itemsets=len(frequent_itemsets))

print(frequent_pairs)

     support   itemsets
0   0.076956  (P232915)
1   0.053111  (P269122)
2   0.263169  (P270594)
3   0.183178  (P309308)
4   0.127249  (P400259)
..       ...        ...
85  0.055929  (P505020)
86  0.094732  (P505023)
87  0.065901  (P505024)
88  0.100802  (P505031)
89  0.069586  (P505054)

[90 rows x 2 columns]


In [16]:
# Filter rules to include only those where antecedents have a single item
rules_filtered = rules[rules['antecedents'].apply(len) == 1].copy()

rules_display = rules_filtered[["antecedents", "consequents", "support", "confidence", "lift"]].copy()

# Convert frozensets to readable strings
rules_display["antecedents"] = rules_display["antecedents"].apply(lambda x: ', '.join(list(x)))
rules_display["consequents"] = rules_display["consequents"].apply(lambda x: ', '.join(list(x)))

# Sorting by confidence
rules_display = rules_display.sort_values(by="confidence", ascending=False)
print(rules_display.to_string(index=False))

antecedents      consequents  support  confidence      lift
    P483076          P482551 0.091914    1.000000 10.879717
    P482551          P483076 0.091914    1.000000 10.879717
    P505054          P505023 0.050076    0.719626  7.596420
    P500288          P500633 0.057446    0.716216  2.531728
    P501254          P500633 0.079991    0.706897  2.498785
    P501760          P500633 0.083026    0.691336  2.443779
    P471043          P476414 0.057230    0.685714  3.201619
    P500777          P500633 0.106005    0.677285  2.394113
    P482676          P500633 0.081726    0.669627  2.367042
    P479645          P500633 0.060698    0.658824  2.328853
    P500857          P500633 0.053544    0.656915  2.322106
    P477157          P476414 0.055712    0.652284  3.045534
    P474832          P476414 0.058530    0.636792  2.973202
    P442838          P476414 0.052894    0.635417  2.966778
    P505031          P500633 0.063083    0.625806  2.212142
    P500857          P270594 0.050726   

In [17]:
# let's try to quantify this
unique_values = pd.concat([rules_display['antecedents'], rules_display['consequents']]).nunique()
unique_values

65

## Testing

In [18]:
filtered_rules = rules_display[rules_display['antecedents'].apply(lambda x: 'P447596' in x)]
print(filtered_rules)

Empty DataFrame
Columns: [antecedents, consequents, support, confidence, lift]
Index: []


In [19]:
filtered_rules = rules_display[rules_display['antecedents'].apply(lambda x: 'P500633' in x)]
print(filtered_rules)

    antecedents       consequents   support  confidence      lift
36      P500633           P270594  0.148277    0.524138  1.991638
199     P500633           P503936  0.127249    0.449808  2.108706
140     P500633           P476414  0.113809    0.402299  1.878345
188     P500633           P500777  0.106005    0.374713  2.394113
164     P500633           P479841  0.105571    0.373180  2.141144
86      P500633           P423688  0.102103    0.360920  1.369179
156     P500633           P479327  0.083893    0.296552  1.918644
194     P500633           P501760  0.083026    0.293487  2.443779
180     P500633           P482676  0.081726    0.288889  2.367042
289     P500633  P503936, P270594  0.079991    0.282759  2.312705
193     P500633           P501254  0.079991    0.282759  2.498785
62      P500633           P400259  0.073705    0.260536  2.047452
241     P500633  P270594, P476414  0.069152    0.244444  2.206697
68      P500633           P411365  0.068936    0.243678  2.161706
102     P5