# Task 1: "Finding association rules and revealing hidden structures in data"

In [None]:
import pandas as pd

from mlxtend.frequent_patterns import fpgrowth
from mlxtend.preprocessing import TransactionEncoder

from mlxtend.frequent_patterns import association_rules

import networkx as nx
import matplotlib.pyplot as plt

from networkx import clustering

from sklearn.decomposition import NMF
import seaborn as sns

from minisom import MiniSom

# Normalize the purchase matrix
from sklearn.preprocessing import MinMaxScaler

import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

## Subtask 1: Load the Data, Determine Unique Counts

Load the file "TRANSACTION.csv". 
It has three columns: 
- Customer - customer ID, 
- Product - purchase,
- Time - timestamp (not needed for the task).

Determine (by writing the corresponding code) how many distinct values
the variables Product and Customer take.

In [None]:
# Read the transaction file into a DataFrame.
df = pd.read_csv("../data/TRANSACTION.csv")

# Count the distinct values of both identifier columns.
unique_customers, unique_products = (
    df[column_name].nunique() for column_name in ("CUSTOMER", "PRODUCT")
)

# Show a preview of the table followed by the two counts.
display(df.head())
print(f"Number of unique customers: {unique_customers}")
print(f"Number of unique products: {unique_products}")

## Subtask 2: Find Frequent Episodes using FPTree Algorithm

Find frequent episodes (itemsets) containing at most 4 items using the FP-Growth
(FP-tree) algorithm with a minimum support threshold of 5%.

In [None]:
# Collect each customer's products into one basket (a list) for the encoder.
baskets_by_customer = df.groupby("CUSTOMER")["PRODUCT"].apply(list)
transactions = baskets_by_customer.tolist()

# Encode the baskets as a table with one column per product.
te = TransactionEncoder()
te.fit(transactions)
te_ary = te.transform(transactions)
df_encoded = pd.DataFrame(te_ary, columns=te.columns_)

# Mine itemsets of at most 4 items that reach 5% support.
fpgrowth_options = {"min_support": 0.05, "use_colnames": True, "max_len": 4}
frequent_itemsets: pd.DataFrame = fpgrowth(df_encoded, **fpgrowth_options)

print(frequent_itemsets)

## Subtask 3: Find the Largest Frequent Episode containing "peppers"

Find the largest (most elements) frequent episode containing the product 
"peppers". What support does it have?

In [None]:
# Keep only the frequent itemsets that include 'peppers'.
peppers_itemsets = frequent_itemsets[
    frequent_itemsets["itemsets"].apply(lambda x: "peppers" in x)
]

if peppers_itemsets.empty:
    # idxmax() raises on an empty selection, so report the situation instead
    # of failing with an opaque error.
    largest_peppers_itemset = None
    print("No frequent episode containing 'peppers' was found.")
else:
    # Several itemsets can share the maximum size. Taking the first one would
    # depend on the (arbitrary) output order of FP-Growth, so among the
    # largest itemsets pick the one with the highest support.
    itemset_sizes = peppers_itemsets["itemsets"].apply(len)
    largest_candidates = peppers_itemsets[itemset_sizes == itemset_sizes.max()]
    largest_peppers_itemset = largest_candidates.loc[
        largest_candidates["support"].idxmax()
    ]
    print(
        f"Largest frequent episode containing 'peppers': {largest_peppers_itemset['itemsets']}"
    )
    print(f"Support: {largest_peppers_itemset['support']}")

## Subtask 4: Construct Association Rules with Reliability Threshold of 30%

Based on the frequent episodes found, construct association rules with a 
reliability threshold of 30%. 

Find the rule with the maximum lift, containing the product "peppers" in the 
left part of the rule. 

Give it a written verbal interpretation, specify and explain its numerical 
indicators: support, reliability and lift.

In [None]:
# Derive rules from the frequent itemsets, keeping those with confidence >= 0.3.
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.3)

# Boolean mask: rules whose left-hand side (antecedent) contains 'peppers'.
has_peppers_on_left = rules["antecedents"].apply(lambda items: "peppers" in items)
peppers_rules = rules[has_peppers_on_left]

# Pick the row label of the highest-lift rule, then fetch that row.
best_rule_label = peppers_rules["lift"].idxmax()
max_lift_rule = peppers_rules.loc[best_rule_label]

print("Rule with maximum lift containing 'peppers' in the antecedent:")
print(f"{max_lift_rule['antecedents']} -> {max_lift_rule['consequents']}")
print(f"Support: {max_lift_rule['support']}")
print(f"Confidence: {max_lift_rule['confidence']}")
print(f"Lift: {max_lift_rule['lift']}")

### Verbal interpretation of the rule:
The rule **{'peppers', 'avocado'} -> {'sardines', 'apples'}** means that if a
transaction (purchase) includes the products **peppers** and **avocado**, then
there is a high probability that **sardines** and **apples** will also be
present in the same transaction. This rule describes the associative connection
between these products.

### Description of numerical indicators:

1. **Support: 8.99%**
    - This is the proportion of transactions in which all products from the rule are simultaneously present: **peppers, avocado, sardines, apples**.
    - In this case, 8.99% of all transactions contain all four products.
    - **Conclusion:** This is not very high support, which suggests that this combination of products is relatively rare. However, it can be useful for identifying niche but meaningful associations.

2. **Confidence: 71.43%**
    - This is the probability that a transaction that contains **peppers** and **avocado** will also contain **sardines** and **apples**.
    - In this case, 71.43% of transactions with **peppers** and **avocado** also contain **sardines** and **apples**.
    - **Conclusion:** High confidence indicates a strong association between these products. This rule can be considered reliable for prediction.

3. **Lift: 5.67**
    - The lift shows how many times more often the left part (**peppers, avocado**) and the right part (**sardines, apples**) occur together than would be expected if the two parts of the rule were statistically independent.
    - A lift value greater than 1 (in this case 5.6746) indicates a positive association between the two parts of the rule.
    - **Conclusion:** The lift is significantly greater than 1, indicating a strong association between **peppers, avocado** and **sardines, apples**. This means that a purchase containing **peppers** and **avocado** is about 5.67 times more likely to also contain **sardines** and **apples** than an arbitrary purchase.

### Conclusions based on the values:
- **The rule has high reliability (71.43%)**, making it useful for forecasting.
- **Lift (5.6746)** indicates a strong association between the products, making this rule meaningful to analyze.
- **Support (8.99%)** is not very high, but this may be due to the fact that this combination of products is niche.

## Subtask 5: Construct Directed Graph from Two-Place Rules

Using only two-place rules, construct a directed graph where:
- the **vertices are the elements of the rule**, their color (or size) is the 
item support, 
- the **arcs are the implications** (oriented in the direction from the 
condition to the consequence), and the arc weights are the reliability.

In [None]:
# Keep only rules of the form {A} -> {B}: one item on each side.
two_place_rules = rules[
    (rules["antecedents"].apply(len) == 1) & (rules["consequents"].apply(len) == 1)
]

G = nx.DiGraph()

# Item support is the share of customer baskets that contain the product.
# Counting raw rows (value_counts) would instead give the share of purchase
# lines and would count repeat purchases by the same customer more than once,
# so count distinct customers per product.
n_baskets = df["CUSTOMER"].nunique()
supports = df.groupby("PRODUCT")["CUSTOMER"].nunique() / n_baskets
for product, support in supports.items():
    G.add_node(product, size=support)

# One arc per rule, pointing from the condition to the consequence and
# weighted by the rule's confidence.
for _, row in two_place_rules.iterrows():
    antecedent = next(iter(row["antecedents"]))
    consequent = next(iter(row["consequents"]))
    G.add_edge(antecedent, consequent, weight=row["confidence"])

# Draw the graph. The layout is seeded so the figure is reproducible between runs.
plt.figure(figsize=(12, 8))
pos = nx.spring_layout(G, k=0.5, seed=42)
sizes = [G.nodes[node]["size"] * 1000 for node in G.nodes()]
weights = [G[u][v]["weight"] for u, v in G.edges()]
nx.draw_networkx_nodes(G, pos, node_size=sizes, node_color="skyblue")
nx.draw_networkx_edges(G, pos, width=weights, edge_color="gray", arrows=True)
nx.draw_networkx_labels(
    G,
    pos,
    font_size=8,
    verticalalignment="top",
)
plt.title("Directed Graph of Two-Place Association Rules")
plt.axis("off")
plt.show()

## Subtask 6: Calculate Centrality Measures and Clustering Coefficient

For this graph, calculate the clustering coefficient ("Clust. coef") of every 
node, find the element with the highest value, and report the value obtained 
for the product "peppers".


In [None]:
# Per-node clustering coefficients of the rule graph, keyed by node.
clust_coef = clustering(G)

# Node with the largest coefficient (the first one encountered wins a tie).
max_clust_element, _ = max(clust_coef.items(), key=lambda pair: pair[1])
print(
    f"Element with highest clustering coefficient: {max_clust_element} ({clust_coef[max_clust_element]})"
)

# Coefficient of 'peppers'; falls back to 0 when the node is not in the graph.
peppers_clust = clust_coef["peppers"] if "peppers" in clust_coef else 0
print(f"Clustering coefficient of 'peppers': {peppers_clust}")

## Subtask 7: Build Numeric Purchase Matrix

Build a numeric matrix with purchase counters in cells, customers in rows, and 
products in columns.

In [None]:
# Customers in rows, products in columns, each cell holding the number of
# matching rows in df; combinations that never occur are filled with 0.
pivot_options = {
    "index": "CUSTOMER",
    "columns": "PRODUCT",
    "aggfunc": "size",
    "fill_value": 0,
}
purchase_matrix = pd.pivot_table(df, **pivot_options)
display(purchase_matrix.head())

## Subtask 8: NMF Linear Projection

Using the "NMF" method, plot a linear projection of the data set onto a plane 
(2 components) and color code the transactions containing the product "peppers".

In [None]:
# Project the purchase matrix onto two non-negative components.
nmf = NMF(n_components=2, random_state=42)
nmf_features = nmf.fit_transform(purchase_matrix)

# Build the plotting frame on the SAME index as purchase_matrix. Column
# assignment in pandas aligns on index labels, so with the default RangeIndex
# the peppers flag would be matched against customer IDs and end up shifted
# or NaN whenever the IDs are not exactly 0..n-1.
nmf_df = pd.DataFrame(
    nmf_features,
    columns=["Component 1", "Component 2"],
    index=purchase_matrix.index,
)
nmf_df["Contains_peppers"] = purchase_matrix["peppers"] > 0

# Plot the projection. The palette is keyed by hue value so the colours stay
# fixed (and valid) even if only one of the two classes is present.
plt.figure(figsize=(10, 7))
sns.scatterplot(
    x="Component 1",
    y="Component 2",
    hue="Contains_peppers",
    data=nmf_df,
    palette={False: "blue", True: "red"},
)
plt.title("NMF Linear Projection with 'peppers' Highlighted")
plt.show()

## Subtask 9: SOM Nonlinear Projection

Using the "SOM" method, construct a nonlinear projection of the data set onto a 
plane; color-code the transactions containing the product "peppers". 
Parameters not specified in the task (for example, the lattice size for the SOM
or the number of layers in the autoencoder) can be chosen at your discretion to
obtain the most convenient visualization. Provide a written comment on how, from
your point of view, a nonlinear projection is better or worse for your example.

In [None]:
# Scale every product column to [0, 1] so no single product dominates the
# distance computations inside the SOM.
scaler = MinMaxScaler()
scaled_matrix = scaler.fit_transform(purchase_matrix)

# Initialize and train a 10x10 SOM; the seed makes the map reproducible.
som = MiniSom(
    x=10,
    y=10,
    input_len=scaled_matrix.shape[1],
    sigma=1.0,
    learning_rate=0.5,
    random_seed=42,
)
som.random_weights_init(scaled_matrix)
som.train_random(scaled_matrix, 1000)

# Map every customer vector to the grid coordinates of its winning neuron.
mapped = np.array([som.winner(x) for x in scaled_matrix])

# Plotting frame: grid position plus the peppers flag. to_numpy() attaches the
# flag by position, because som_df uses a fresh RangeIndex rather than the
# customer index of purchase_matrix.
som_df = pd.DataFrame(mapped, columns=["x", "y"])
som_df["Contains_peppers"] = (purchase_matrix["peppers"] > 0).to_numpy()

# Many customers land on the same neuron; a small seeded jitter around the
# cell centre (+0.5) keeps the individual points visible.
jitter = np.random.default_rng(42).uniform(-0.3, 0.3, size=mapped.shape)
som_df["plot_x"] = som_df["x"] + 0.5 + jitter[:, 0]
som_df["plot_y"] = som_df["y"] + 0.5 + jitter[:, 1]

# Distance map as background, customers overlaid and colour-coded by whether
# they bought 'peppers'.
plt.figure(figsize=(10, 10))
plt.pcolor(som.distance_map().T, cmap="Blues")  # Distance map as background
plt.colorbar()
for has_peppers, color in ((False, "blue"), (True, "red")):
    subset = som_df[som_df["Contains_peppers"] == has_peppers]
    plt.scatter(
        subset["plot_x"],
        subset["plot_y"],
        s=12,
        c=color,
        alpha=0.6,
        label=f"Contains peppers: {has_peppers}",
    )
plt.legend(loc="upper right")

plt.title("Self-Organizing Map")
plt.show()

# Comment on SOM projection
print(
    """
    The SOM nonlinear projection allows capturing complex relationships
    between products, potentially revealing clusters that linear methods like
    NMF might miss. However, it may require careful parameter tuning to 
    achieve meaningful visualizations."""
)

## Subtask 10: Select 6 Independent Variables using Stepwise Selection

From the original matrix (from Subtask 7), select 6 independent variables using 
any one of the following methods: VarClus, Glasso or stepwise selection.

In [None]:
# Select 6 product columns by recursive feature elimination, using
# "customer bought peppers" as the target of a logistic regression.

# Build the target as a standalone Series. Writing it into purchase_matrix
# would add a non-product column to the shared matrix, which the NMF and SOM
# cells would then pick up as a feature if they were re-run afterwards.
y = (purchase_matrix["peppers"] > 0).rename("peppers_flag")

# Features: every product except the target's own column. errors="ignore"
# also drops a 'peppers_flag' column left behind by an earlier run that did
# mutate the matrix, and is a no-op otherwise.
X = purchase_matrix.drop(columns=["peppers", "peppers_flag"], errors="ignore")

# Initialize model
model = LogisticRegression(max_iter=1000)

# Initialize RFE to keep exactly 6 features
rfe = RFE(model, n_features_to_select=6)
fit = rfe.fit(X, y)

# Selected features: columns where the RFE support mask is True
selected_features = X.columns[fit.support_]
print("Selected independent variables:", list(selected_features))