<a href="https://colab.research.google.com/github/rahitya-123/Classification-and-Regression-Trees---Statistics/blob/main/Association_Rule_Mining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Association Rule Mining for Auto-MPG Dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# For association rule mining
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# Set plotting style
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("Set2")

#----------------------------------------------------------------
# Data Loading and Preprocessing
#----------------------------------------------------------------
print("=== DATA LOADING AND PREPROCESSING ===")

# Define column names based on the dataset description
columns = [
    'mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
    'acceleration', 'model_year', 'origin', 'car_name'
]

# Read the data directly from the auto-mpg.data file
try:
    df = pd.read_csv(
        'auto-mpg.data',
        delim_whitespace=True,
        names=columns,
        na_values='?',
        quotechar='"',
        comment='\t',
        skipinitialspace=True
    )
    print("Loaded auto-mpg.data file successfully")
except:
    print("Error: Could not find auto-mpg.data file")
    print("Please make sure the dataset file is in the current directory.")
    exit()

# Convert horsepower to numeric if needed
if df['horsepower'].dtype == object:
    df['horsepower'] = pd.to_numeric(df['horsepower'], errors='coerce')

# Handle missing values
df['horsepower'].fillna(df['horsepower'].mean(), inplace=True)

# Add region names based on origin codes
origin_names = {1: 'American', 2: 'European', 3: 'Japanese'}
df['region'] = df['origin'].map(origin_names)

print("Dataset shape:", df.shape)
print("\nFirst few rows:")
print(df.head())

# Basic data exploration
print("\nSummary statistics:")
print(df.describe())

print("\nMissing values:")
print(df.isna().sum())

# Visualize relationships between key features
plt.figure(figsize=(15, 10))

# Plot 1: MPG distribution by region
plt.subplot(2, 2, 1)
sns.boxplot(x='region', y='mpg', data=df)
plt.title('MPG Distribution by Region')
plt.xlabel('Region')
plt.ylabel('MPG')

# Plot 2: MPG vs Weight
plt.subplot(2, 2, 2)
sns.scatterplot(x='weight', y='mpg', hue='region', data=df, alpha=0.7)
plt.title('MPG vs Weight by Region')
plt.xlabel('Weight')
plt.ylabel('MPG')

# Plot 3: MPG vs Horsepower
plt.subplot(2, 2, 3)
sns.scatterplot(x='horsepower', y='mpg', hue='region', data=df, alpha=0.7)
plt.title('MPG vs Horsepower by Region')
plt.xlabel('Horsepower')
plt.ylabel('MPG')

# Plot 4: MPG over the years
plt.subplot(2, 2, 4)
year_avg = df.groupby('model_year')['mpg'].mean().reset_index()
sns.lineplot(x='model_year', y='mpg', data=year_avg, marker='o')
plt.title('Average MPG by Model Year')
plt.xlabel('Model Year')
plt.ylabel('Average MPG')

plt.tight_layout()
plt.savefig('auto_mpg_eda.png')
plt.close()

#----------------------------------------------------------------
# ASSOCIATION RULE MINING
#----------------------------------------------------------------
print("\n=== ASSOCIATION RULE MINING ===")

# Create meaningful categorical variables for association rule mining
print("Creating categorical variables for association rule mining...")

# Discretize mpg into categories
df['mpg_category'] = pd.cut(
    df['mpg'],
    bins=[0, 15, 20, 25, 30, 50],
    labels=['very_low_mpg', 'low_mpg', 'medium_mpg', 'high_mpg', 'very_high_mpg']
)

# Discretize horsepower
df['hp_category'] = pd.cut(
    df['horsepower'],
    bins=[0, 75, 100, 150, 250],
    labels=['low_hp', 'medium_hp', 'high_hp', 'very_high_hp']
)

# Discretize weight
df['weight_category'] = pd.cut(
    df['weight'],
    bins=[0, 2000, 3000, 4000, 6000],
    labels=['light', 'medium', 'heavy', 'very_heavy']
)

# Discretize cylinders
df['cylinders_category'] = df['cylinders'].apply(lambda x: f"{x}_cylinders")

# Create decade category based on model year
df['decade'] = df['model_year'].apply(lambda x: f"{'70s' if x < 80 else '80s'}")

# Combine the categorical columns we want to analyze for association rules
categorical_columns = ['mpg_category', 'hp_category', 'weight_category', 'cylinders_category', 'region', 'decade']

# Preview the categorical data
print("Preview of categorical data:")
print(df[categorical_columns].head())

# Distribution of each categorical variable
plt.figure(figsize=(15, 15))
for i, col in enumerate(categorical_columns):
    plt.subplot(3, 2, i+1)
    sns.countplot(y=col, data=df, order=df[col].value_counts().index)
    plt.title(f'Distribution of {col}')
    plt.tight_layout()
plt.savefig('categorical_distributions.png')
plt.close()

# Create a list of transactions
transactions = []
for _, row in df[categorical_columns].iterrows():
    transaction = [f"{col}_{val}" for col, val in row.items() if pd.notna(val)]
    transactions.append(transaction)

# Use TransactionEncoder to convert the transactions to a binary matrix
te = TransactionEncoder()
te_ary = te.fit_transform(transactions)
df_encoded = pd.DataFrame(te_ary, columns=te.columns_)

# Generate frequent itemsets
print("\nGenerating frequent itemsets...")
frequent_itemsets = apriori(df_encoded, min_support=0.1, use_colnames=True)
print(f"Found {len(frequent_itemsets)} frequent itemsets with min_support=0.1")

# Display top itemsets by support
print("\nTop 10 frequent itemsets by support:")
print(frequent_itemsets.sort_values('support', ascending=False).head(10))

# Generate association rules
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)
print(f"Found {len(rules)} association rules with min_confidence=0.7")

# Display top rules by lift
print("\nTop 10 association rules by lift:")
pd.set_option('display.max_colwidth', None)  # To display full antecedents and consequents
print(rules.sort_values('lift', ascending=False).head(10)[['antecedents', 'consequents', 'support', 'confidence', 'lift']])

# Plot distribution of supports and confidences
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.hist(rules['support'], bins=20, edgecolor='black')
plt.title('Distribution of Support')
plt.xlabel('Support')
plt.ylabel('Count')

plt.subplot(1, 2, 2)
plt.hist(rules['confidence'], bins=20, edgecolor='black')
plt.title('Distribution of Confidence')
plt.xlabel('Confidence')
plt.ylabel('Count')

plt.tight_layout()
plt.savefig('association_rule_metrics.png')
plt.close()

# Scatter plot of support vs confidence, colored by lift
plt.figure(figsize=(10, 6))
scatter = plt.scatter(rules['support'], rules['confidence'], c=rules['lift'],
                      cmap='viridis', alpha=0.6, s=rules['lift']*20)
plt.colorbar(scatter, label='Lift')
plt.title('Support vs Confidence (colored by lift)')
plt.xlabel('Support')
plt.ylabel('Confidence')
plt.grid(True, alpha=0.3)
plt.savefig('support_vs_confidence.png')
plt.close()

# Find specific rules related to fuel efficiency
mpg_rules = rules[rules['consequents'].apply(lambda x: any('mpg_category' in item for item in x))]
print(f"\nFound {len(mpg_rules)} rules that predict mpg categories")

# Display top mpg rules by lift
print("\nTop 5 rules for predicting MPG by lift:")
print(mpg_rules.sort_values('lift', ascending=False).head(5)[['antecedents', 'consequents', 'support', 'confidence', 'lift']])

# Find rules related to car origin/region
region_rules = rules[rules['consequents'].apply(lambda x: any('region' in item for item in x))]
print(f"\nFound {len(region_rules)} rules that predict region")

# Display top region rules by confidence
print("\nTop 5 rules for predicting region by confidence:")
print(region_rules.sort_values('confidence', ascending=False).head(5)[['antecedents', 'consequents', 'support', 'confidence', 'lift']])

# Examine rules for specific combinations
# For example, rules that lead to high MPG
high_mpg_rules = rules[rules['consequents'].apply(lambda x: any(item in ['mpg_category_high_mpg', 'mpg_category_very_high_mpg'] for item in x))]
print(f"\nFound {len(high_mpg_rules)} rules that predict high or very high MPG")

if len(high_mpg_rules) > 0:
    print("\nTop 5 rules for predicting high MPG by confidence:")
    print(high_mpg_rules.sort_values('confidence', ascending=False).head(5)[['antecedents', 'consequents', 'support', 'confidence', 'lift']])

# Create a network visualization of the strongest rules
# For simplicity, we'll just take top rules by lift
top_rules = rules.sort_values('lift', ascending=False).head(15)

# Convert frozensets to strings for visualization
def format_fs(fs):
    return ', '.join(list(fs))

# Create a new DataFrame for visualization
viz_rules = top_rules.copy()
viz_rules['antecedents_str'] = viz_rules['antecedents'].apply(format_fs)
viz_rules['consequents_str'] = viz_rules['consequents'].apply(format_fs)

print("\nTop 15 strongest association rules (by lift):")
for i, row in viz_rules.iterrows():
    print(f"{row['antecedents_str']} => {row['consequents_str']} "
          f"(Support: {row['support']:.3f}, Confidence: {row['confidence']:.3f}, Lift: {row['lift']:.3f})")

print("\nAssociation rule mining complete. Output files saved.")

=== DATA LOADING AND PREPROCESSING ===
Loaded auto-mpg.data file successfully
Dataset shape: (398, 10)

First few rows:
    mpg  cylinders  displacement  horsepower  weight  acceleration  \
0  18.0          8         307.0       130.0  3504.0          12.0   
1  15.0          8         350.0       165.0  3693.0          11.5   
2  18.0          8         318.0       150.0  3436.0          11.0   
3  16.0          8         304.0       150.0  3433.0          12.0   
4  17.0          8         302.0       140.0  3449.0          10.5   

   model_year  origin                   car_name    region  
0          70       1  chevrolet chevelle malibu  American  
1          70       1          buick skylark 320  American  
2          70       1         plymouth satellite  American  
3          70       1              amc rebel sst  American  
4          70       1                ford torino  American  

Summary statistics:
              mpg   cylinders  displacement  horsepower       weight  \


  df = pd.read_csv(
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['horsepower'].fillna(df['horsepower'].mean(), inplace=True)



=== ASSOCIATION RULE MINING ===
Creating categorical variables for association rule mining...
Preview of categorical data:
   mpg_category   hp_category weight_category cylinders_category    region  \
0       low_mpg       high_hp           heavy        8_cylinders  American   
1  very_low_mpg  very_high_hp           heavy        8_cylinders  American   
2       low_mpg       high_hp           heavy        8_cylinders  American   
3       low_mpg       high_hp           heavy        8_cylinders  American   
4       low_mpg       high_hp           heavy        8_cylinders  American   

  decade  
0    70s  
1    70s  
2    70s  
3    70s  
4    70s  

Generating frequent itemsets...
Found 166 frequent itemsets with min_support=0.1

Top 10 frequent itemsets by support:
     support                                           itemsets
3   0.776382                                       (decade_70s)
14  0.625628                                  (region_American)
52  0.525126                 