

*  NAME : Mayuresh Shailesh Rane
*  PRN : 123B1B241
*  BATCH : D2
*  **Assignment 4**



In [None]:
pip install pandas mlxtend



In [None]:
# Import necessary libraries
import pandas as pd
from mlxtend.frequent_patterns import fpgrowth, association_rules
from mlxtend.preprocessing import TransactionEncoder
import time
import warnings

# Suppress warnings for a cleaner output.
# NOTE(review): called with only the 'ignore' action (no category or module),
# this filter silences every warning raised after this point, including
# deprecation notices from pandas and mlxtend. Consider narrowing it to the
# specific categories that are known to be noise.
warnings.filterwarnings('ignore')

# =============================================================================
# Part 1: Data Loading
# =============================================================================
# Path to the single, combined dataset file; adjust it if the CSV lives elsewhere.
DATA_PATH = '/content/online_retail_II.csv'

try:
    # The file is decoded as 'ISO-8859-1'.
    retail_df = pd.read_csv(DATA_PATH, encoding='ISO-8859-1')
except FileNotFoundError as e:
    # Re-raise with an actionable message instead of calling exit(): inside a
    # notebook exit() asks the kernel to shut down rather than cleanly halting
    # the cell, and the message now names the path that was actually tried.
    raise FileNotFoundError(
        f"Dataset not found at '{DATA_PATH}'. "
        "Upload 'online_retail_II.csv' to that location or update DATA_PATH."
    ) from e

print("Dataset loaded successfully.")

# =============================================================================
# Part 2: Data Cleaning and Preprocessing
# =============================================================================
print("\n--- Starting Data Cleaning ---")

# Descriptions (upper-cased) that are treated as non-product entries.
non_product_items = ['POSTAGE', 'MANUAL', 'DOTCOM POSTAGE', 'BANK CHARGES', 'AMAZON FEE', 'CRUK COMMISSION', 'SAMPLES', 'CARRIAGE']

# The cleaning steps run as a single pipeline, in this order:
#   1. drop rows with a missing 'Invoice' or 'Description'
#   2. cast 'Invoice' to string
#   3. drop canceled transactions (invoice starts with 'C')
#   4. drop fully duplicated rows
#   5. keep only rows with a strictly positive 'Price'
#   6. strip surrounding whitespace from 'Description'
#   7. drop rows whose description is a non-product entry
retail_df = (
    retail_df
    .dropna(axis=0, subset=['Invoice', 'Description'])
    .assign(Invoice=lambda frame: frame['Invoice'].astype('str'))
    .loc[lambda frame: ~frame['Invoice'].str.startswith('C')]
    .drop_duplicates()
    .loc[lambda frame: frame['Price'] > 0]
    .assign(Description=lambda frame: frame['Description'].str.strip())
    .loc[lambda frame: ~frame['Description'].str.upper().isin(non_product_items)]
)

print("--- Data Cleaning Complete ---")

# =============================================================================
# Part 3: Data Scoping and Basket Creation
# =============================================================================
print("\nCreating transaction baskets for the UK...")
country_filter = 'United Kingdom'

# Restrict the cleaned data to rows whose 'Country' equals the chosen country.
is_selected_country = retail_df['Country'] == country_filter
basket_uk = retail_df[is_selected_country]

# Build one list of item descriptions per invoice; invoices are visited in the
# groupby's (sorted) key order.
descriptions_by_invoice = basket_uk.groupby('Invoice')['Description']
transactions = [list(items) for invoice_id, items in descriptions_by_invoice]
print("Basket creation complete.")

# =============================================================================
# Part 4: Data Transformation for FP-Growth (One-Hot Encoding)
# =============================================================================
print("\nTransforming data into one-hot encoded format...")

# Fit the encoder on the baskets, then encode those same baskets.
te = TransactionEncoder()
te.fit(transactions)
te_ary = te.transform(transactions)

# Wrap the encoded array in a DataFrame labelled with the encoder's columns.
df_encoded = pd.DataFrame(data=te_ary, columns=te.columns_)
print("Data transformation complete.")

# =============================================================================
# Part 5: Finding Frequent Itemsets with FP-Growth
# =============================================================================
print("\nStarting frequent itemset mining with FP-Growth... (This is the main calculation step)")

# Minimum support passed to FP-Growth.
# NOTE: the summary printout in Part 7 hard-codes this value; keep them in sync.
min_support = 0.01

# perf_counter() is a monotonic, high-resolution clock, so the measured duration
# cannot be distorted by system clock adjustments the way time.time() can.
start_time = time.perf_counter()  # Start timer

# Use fpgrowth instead of apriori for much better performance
frequent_itemsets = fpgrowth(df_encoded, min_support=min_support, use_colnames=True)

end_time = time.perf_counter()  # End timer
print(f"Mining finished in {end_time - start_time:.2f} seconds.")

# Order the itemsets from highest to lowest support.
frequent_itemsets = frequent_itemsets.sort_values(by='support', ascending=False)

# =============================================================================
# Part 6: Generating and Displaying Association Rules
# =============================================================================
print("\nGenerating association rules...")
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)

# Filter for rules with higher lift and confidence for stronger associations
# (both comparisons are strict).
min_confidence = 0.2
min_lift = 4
rules = rules[(rules['confidence'] > min_confidence) & (rules['lift'] > min_lift)]

# Sort the rules by lift and confidence to see the strongest relationships first
rules = rules.sort_values(by=['lift', 'confidence'], ascending=[False, False])

# Clean up the output for better presentation by converting frozensets to strings.
# Items are sorted before joining: set iteration order depends on string hashing,
# which can differ between kernel sessions, so an unsorted join would make
# multi-item labels non-reproducible from run to run.
rules['antecedents'] = rules['antecedents'].apply(lambda items: ', '.join(sorted(items)))
rules['consequents'] = rules['consequents'].apply(lambda items: ', '.join(sorted(items)))

# =============================================================================
# Part 7: Final Output
# =============================================================================
# Columns shown in the final report, and the first 20 rules re-indexed from 0.
report_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
top_rules = rules[report_columns].head(20).reset_index(drop=True)

# Summary of the run: scope, basket count, itemset count and rule count.
summary_lines = [
    f"\n--- Analysis for: {country_filter} ---",
    f"Total Transactions Analyzed: {len(transactions)}",
    f"Found {len(frequent_itemsets)} frequent itemsets (support > 0.01).",
    f"Found {len(rules)} strong association rules (lift > {min_lift}, confidence > {min_confidence}).",
]
print("\n".join(summary_lines))

print("\n--- Top 20 Strongest Association Rules ---")
print(top_rules)

Dataset loaded successfully.

--- Starting Data Cleaning ---
--- Data Cleaning Complete ---

Creating transaction baskets for the UK...
Basket creation complete.

Transforming data into one-hot encoded format...
Data transformation complete.

Starting frequent itemset mining with FP-Growth... (This is the main calculation step)
Mining finished in 67.39 seconds.

Generating association rules...

--- Analysis for: United Kingdom ---
Total Transactions Analyzed: 36215
Found 1102 frequent itemsets (support > 0.01).
Found 695 strong association rules (lift > 4, confidence > 0.2).

--- Top 20 Strongest Association Rules ---
                            antecedents                          consequents  \
0             POPPY'S PLAYHOUSE BEDROOM         POPPY'S PLAYHOUSE LIVINGROOM   
1          POPPY'S PLAYHOUSE LIVINGROOM            POPPY'S PLAYHOUSE BEDROOM   
2             POPPY'S PLAYHOUSE KITCHEN         POPPY'S PLAYHOUSE LIVINGROOM   
3          POPPY'S PLAYHOUSE LIVINGROOM            POP