# Load Data and Feature Engineering

In [None]:
import pandas as pd

# Bank data set: a semicolon-separated file; column names are taken from
# the file itself (pandas' default header handling).
bank_df = pd.read_csv('bank.csv', sep=';')

# Adult data set: column names are supplied explicitly via `names=`.
# "?" entries are read as missing values, and the blank that follows each
# comma is skipped.
columns = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'income',
]
file_path = 'adult.data'
adult_df = pd.read_csv(
    file_path,
    names=columns,
    na_values="?",
    skipinitialspace=True,
)

# Binary indicators: 1 when the record has a strictly positive capital gain /
# capital loss, 0 otherwise.
adult_df['positive_capital_gain'] = adult_df['capital-gain'].gt(0).astype(int)
adult_df['positive_capital_loss'] = adult_df['capital-loss'].gt(0).astype(int)

# Interaction feature: product of age and education-num.
adult_df['age_education_interaction'] = adult_df['age'] * adult_df['education-num']

# Keep the five most frequent categories and fold everything else into
# 'Other'. Missing values are not among the top categories, so they are
# mapped to 'Other' as well (same outcome as the previous per-row lambda).
top_occupations = adult_df['occupation'].value_counts().nlargest(5).index
adult_df['occupation_aggregated'] = adult_df['occupation'].where(
    adult_df['occupation'].isin(top_occupations), 'Other'
)

top_countries = adult_df['native-country'].value_counts().nlargest(5).index
adult_df['native_country_aggregated'] = adult_df['native-country'].where(
    adult_df['native-country'].isin(top_countries), 'Other'
)

# Binning age and hours-per-week.
# The labels name inclusive upper bounds ('0-25', '26-35', ...), so the
# intervals have to be closed on the right. With right=False a value of 25
# was filed under '26-35' and a 40-hour week under '41-50'.
# include_lowest=True keeps a value of exactly 0 in the first bin, and the
# infinite last edge makes the final '...+' bin genuinely open-ended instead
# of turning values >= 100 into NaN.
age_bins = [0, 25, 35, 45, 55, 65, float('inf')]
age_labels = ['0-25', '26-35', '36-45', '46-55', '56-65', '66+']
adult_df['age_binned'] = pd.cut(
    adult_df['age'],
    bins=age_bins,
    labels=age_labels,
    right=True,
    include_lowest=True,
)

hours_bins = [0, 20, 30, 40, 50, float('inf')]
hours_labels = ['0-20', '21-30', '31-40', '41-50', '51+']
adult_df['hours_per_week_binned'] = pd.cut(
    adult_df['hours-per-week'],
    bins=hours_bins,
    labels=hours_labels,
    right=True,
    include_lowest=True,
)


# Keep the five most frequent job categories and fold the rest into 'Other'.
top_jobs = bank_df['job'].value_counts().nlargest(5).index
bank_df['job_aggregated'] = bank_df['job'].where(bank_df['job'].isin(top_jobs), 'Other')

# Binning age (this data set has no hours-per-week column to bin).
# The labels name inclusive upper bounds ('0-25', '26-35', ...), so the
# intervals are closed on the right; with right=False a 25-year-old was
# filed under '26-35'. With right-closed bins '75+' covers ages strictly
# above 75 (75 itself belongs to '66-75'). The infinite last edge keeps the
# top bin open-ended instead of turning ages >= 100 into NaN.
age_bins = [0, 25, 35, 45, 55, 65, 75, float('inf')]
age_labels = ['0-25', '26-35', '36-45', '46-55', '56-65', '66-75', '75+']
bank_df['age_binned'] = pd.cut(
    bank_df['age'],
    bins=age_bins,
    labels=age_labels,
    right=True,
    include_lowest=True,
)

# The raw columns are replaced by their aggregated / binned counterparts.
bank_df = bank_df.drop(columns=['age', 'job'])

# Preprocess

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth, association_rules

### Preprocessing Function ###
def preprocess(df):
    """Drop incomplete rows, one-hot encode categoricals and standardize numerics.

    Args:
        df: Input DataFrame. It is not modified.

    Returns:
        A new DataFrame in which rows containing missing values are removed,
        every object/category column is replaced by dummy indicator columns
        (first level dropped), and every numeric column is standardized with
        ``StandardScaler``.
    """
    df_clean = df.dropna()

    # Decide which columns to scale *before* encoding, so that the dummy
    # indicators produced below can never be standardized regardless of the
    # dtype pandas gives them, and so that numeric columns of any width
    # (not only int64/float64) are covered. Boolean columns are not numeric
    # for select_dtypes and are left untouched.
    num_cols = df_clean.select_dtypes(include='number').columns
    cat_cols = df_clean.select_dtypes(include=['object', 'category']).columns

    df_clean = pd.get_dummies(df_clean, columns=cat_cols, drop_first=True)

    # StandardScaler rejects input with zero rows or zero columns; in those
    # cases there is nothing to scale, so skip the step instead of crashing.
    if len(num_cols) > 0 and len(df_clean) > 0:
        df_clean[num_cols] = StandardScaler().fit_transform(df_clean[num_cols])

    return df_clean

# Transaction Conversion

In [None]:
### Transaction Conversion Function ###
def dataframe_to_transactions(df):
    """Turn each row of ``df`` into the list of column names whose value equals 1.

    Args:
        df: DataFrame whose cells are compared against 1 (``True`` compares
            equal to 1, so boolean indicator columns count as well).

    Returns:
        A list with one entry per row, in row order; each entry is the list
        of matching column names in column order (possibly empty).
    """
    column_names = list(df.columns)
    hits = df.eq(1)
    return [
        [name for name, flag in zip(column_names, row_flags) if flag]
        for row_flags in hits.itertuples(index=False, name=None)
    ]

# DBSCAN

In [None]:
def estimate_dbscan_params(df, eps_values, min_samples_values):
    """Grid-search DBSCAN's ``eps`` / ``min_samples`` and keep the best labelling.

    A configuration is considered only if it produces more than one cluster
    and labels at most 10% of the rows as noise. Candidates are ranked by
    ``2 * silhouette + log1p(n_clusters) - log1p(noise_points)``, where the
    silhouette is computed on the non-noise rows.

    Args:
        df: Feature matrix (DataFrame or 2-D array) passed to DBSCAN.
        eps_values: Iterable of ``eps`` candidates.
        min_samples_values: Iterable of ``min_samples`` candidates.

    Returns:
        ``(best_params, best_labels)``. ``best_params`` is a dict with keys
        ``'eps'`` and ``'min_samples'``; ``best_labels`` is the label array
        of the winning configuration. When no configuration qualifies, both
        dict values are ``None`` and ``best_labels`` is an empty list.
    """
    # The composite score can be far below -1 (the noise penalty is
    # unbounded), so start from -inf; a finite start value would silently
    # discard valid configurations.
    best_score = -np.inf
    best_params = {'eps': None, 'min_samples': None}
    best_labels = None
    best_cluster_count = 0
    total_points = df.shape[0]
    max_noise_points = total_points * 0.10  # 10% of total points as maximum allowed noise

    for eps in eps_values:
        for min_samples in min_samples_values:
            labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(df)
            is_noise = labels == -1
            noise_points = int(np.count_nonzero(is_noise))
            # The noise label (-1) is not a cluster.
            n_clusters = len(set(labels)) - (1 if noise_points else 0)

            if n_clusters <= 1 or noise_points > max_noise_points:
                continue

            if noise_points:
                # Noise rows are excluded from the silhouette computation.
                silhouette = silhouette_score(df[~is_noise], labels[~is_noise])
            else:
                silhouette = silhouette_score(df, labels)

            # Score adjusted to prioritize configurations with less noise
            score = 2 * silhouette + np.log1p(n_clusters) - np.log1p(noise_points)
            print(f"Testing eps={eps}, min_samples={min_samples}: silhouette={silhouette:.4f}, clusters={n_clusters}, noise={noise_points}")

            if score > best_score:
                best_score = score
                best_params = {'eps': eps, 'min_samples': min_samples}
                best_labels = labels.copy()
                best_cluster_count = n_clusters

    # Compare against None explicitly rather than relying on truthiness.
    if best_params['eps'] is not None:
        print(f"Best Parameters: eps={best_params['eps']}, min_samples={best_params['min_samples']}, score={best_score:.4f}")
        print(f"Clusters found: {best_cluster_count}, Noise: {np.count_nonzero(best_labels == -1)}")
    else:
        print("No valid clustering configuration found.")
        best_labels = []

    return best_params, best_labels


# Analyze Clusters & FP-GROWTH

In [None]:
import seaborn as sns

### Cluster Analysis Function ###
def analyze_clusters(df, labels):
    """Store the labels on ``df`` and show how many samples each cluster has.

    Args:
        df: DataFrame to annotate. NOTE: it is modified in place; a
            'Cluster' column is added (or overwritten) with ``labels``.
        labels: One cluster label per row of ``df``.

    Returns:
        None. A count plot of the labels is displayed.
    """
    df['Cluster'] = labels

    # Bar chart with one bar per distinct label.
    _, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(x=labels, palette='viridis', ax=ax)
    ax.set_title('Distribution of Clusters')
    ax.set_xlabel('Cluster Label')
    ax.set_ylabel('Number of Samples')
    plt.show()


### FP-Growth Association Rule Mining Function ###
def apply_fp_growth(transactions, min_support=0.1, min_confidence=0.3, min_lift=3):
    """Mine association rules from ``transactions`` with FP-Growth.

    Args:
        transactions: Sequence of transactions, each a list of item names.
        min_support: Minimum support for a frequent itemset.
        min_confidence: Minimum confidence for a rule.
        min_lift: Rules with a lift below this value are discarded.

    Returns:
        DataFrame of rules (as produced by ``association_rules``) restricted
        to ``lift >= min_lift``. When there are no transactions, no items or
        no frequent itemsets, an empty DataFrame carrying the core rule
        columns is returned instead of raising.
    """
    # Placeholder result for the "nothing to mine" cases, so callers can
    # still access the usual rule columns.
    no_rules = pd.DataFrame(columns=[
        'antecedents', 'consequents', 'antecedent support',
        'consequent support', 'support', 'confidence', 'lift',
    ])

    if len(transactions) == 0:
        return no_rules

    te = TransactionEncoder()
    te_ary = te.fit(transactions).transform(transactions)
    onehot = pd.DataFrame(te_ary, columns=te.columns_)
    if onehot.shape[1] == 0:
        # Every transaction was empty: there are no items at all.
        return no_rules

    frequent_itemsets = fpgrowth(onehot, min_support=min_support, use_colnames=True)
    if frequent_itemsets.empty:
        # association_rules raises on an empty itemset table.
        return no_rules

    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)
    return rules[rules['lift'] >= min_lift]

### Rule Comparison and Filtering Function ###
def filter_redundant_rules(global_rules, cluster_rules):
    """Remove cluster rules that already exist among the global rules.

    A cluster rule is redundant only when a global rule has the same
    antecedents *and* the same consequents. Matching the two columns
    independently (as was done before) also discarded rules that merely
    shared an antecedent or a consequent with some unrelated global rule.

    Args:
        global_rules: DataFrame with 'antecedents' and 'consequents' columns.
        cluster_rules: DataFrame with the same two columns.

    Returns:
        The rows of ``cluster_rules`` whose (antecedents, consequents) pair
        does not occur in ``global_rules``; original index is preserved.
    """
    global_pairs = set(zip(global_rules['antecedents'], global_rules['consequents']))
    keep = pd.Series(
        [
            pair not in global_pairs
            for pair in zip(cluster_rules['antecedents'], cluster_rules['consequents'])
        ],
        dtype=bool,
    ).to_numpy()
    # Positional boolean mask: works for empty input and any index.
    cluster_rules_filtered = cluster_rules.loc[keep]
    return cluster_rules_filtered

# Plot Clusters

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

def plot_clusters(df, labels):
    """Scatter-plot the data in its first two principal components, coloured by label.

    Args:
        df: Feature DataFrame. A 'Cluster' column, if present, is excluded
            from the PCA so the labels do not influence the projection.
        labels: One value per row of ``df``; used for the point colours.

    Returns:
        None. The figure is displayed.
    """
    # errors='ignore' lets the function work whether or not the caller has
    # already attached a 'Cluster' column.
    features = df.drop(columns=['Cluster'], errors='ignore')
    principal_components = PCA(n_components=2).fit_transform(features)

    plt.figure(figsize=(10, 8))
    # No `label=` here: a legend label must be a string, and the colour bar
    # below already explains the colours.
    scatter = plt.scatter(
        principal_components[:, 0],
        principal_components[:, 1],
        c=labels,
        cmap='viridis',
    )
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.title('2D PCA of Dataset')
    plt.colorbar(scatter)
    plt.show()




In [None]:
from sklearn.cluster import OPTICS
def opticsPlot(df, min_samples):
    """Fit OPTICS on ``df`` and display its reachability plot.

    Args:
        df: Feature matrix passed to ``OPTICS.fit``.
        min_samples: ``min_samples`` argument for OPTICS.

    Returns:
        None. The figure is displayed.
    """
    optics_model = OPTICS(min_samples=min_samples, xi=0.05, min_cluster_size=0.05)
    optics_model.fit(df)

    # Reachability distances arranged in the cluster ordering computed by OPTICS.
    # (The previously computed, never-used reordered labels were dropped.)
    reachability = optics_model.reachability_[optics_model.ordering_]

    plt.figure(figsize=(10, 7))
    plt.bar(range(len(reachability)), reachability)
    plt.title('Reachability Plot')
    plt.xlabel('Ordered Points')
    plt.ylabel('Reachability Distance')
    plt.show()


# Automated Pipeline

In [None]:
def automated_clustering_and_rule_mining(df, eps_values, min_samples_values, min_support, min_confidence, min_lift):
    """Preprocess, cluster with DBSCAN, then mine global and per-cluster rules.

    Args:
        df: Raw input DataFrame.
        eps_values: DBSCAN ``eps`` candidates.
        min_samples_values: DBSCAN ``min_samples`` candidates.
        min_support: FP-Growth minimum support.
        min_confidence: Minimum rule confidence.
        min_lift: Minimum rule lift.

    Returns:
        Always a 3-tuple ``(initial_rules, cluster_specific_rules, best_labels)``:
        the rules mined on all rows, a dict mapping each non-noise cluster
        label to its rules that are not already among the global ones, and
        the cluster labels. When no usable clustering exists the first two
        elements are empty dicts.
    """
    features_df = preprocess(df)
    best_params, best_labels = estimate_dbscan_params(features_df, eps_values, min_samples_values)

    # No configuration qualified (the estimator has already reported it):
    # there are no labels to attach and no min_samples to hand to OPTICS.
    if best_params['eps'] is None or len(best_labels) == 0:
        return {}, {}, best_labels

    best_labels = np.asarray(best_labels)

    # Keep the labels on a separate copy so that the 'Cluster' column never
    # leaks into the OPTICS input or into the mined transactions.
    clustered_df = features_df.copy()
    clustered_df['Cluster'] = best_labels

    # Plotting the clusters
    plot_clusters(clustered_df, clustered_df['Cluster'])
    opticsPlot(features_df, best_params['min_samples'])

    # Check if all labels are -1 (indicating all data points are considered noise)
    if (best_labels == -1).all():
        print("No valid clusters formed, all points are labeled as noise.")
        # Same arity as the normal return so callers can always unpack three values.
        return {}, {}, best_labels

    analyze_clusters(clustered_df, best_labels)

    transactions = dataframe_to_transactions(features_df)
    initial_rules = apply_fp_growth(transactions, min_support, min_confidence, min_lift)

    cluster_specific_rules = {}
    for cluster_label in np.unique(best_labels):
        if cluster_label == -1:  # Skip the noise cluster
            continue
        # Transactions are built row by row, so the per-cluster subset can be
        # taken from the list computed above instead of converting again.
        cluster_transactions = [
            items for items, label in zip(transactions, best_labels) if label == cluster_label
        ]
        cluster_rules = apply_fp_growth(cluster_transactions, min_support, min_confidence, min_lift)
        cluster_specific_rules[cluster_label] = filter_redundant_rules(initial_rules, cluster_rules)

    return initial_rules, cluster_specific_rules, best_labels


# Example Usage

In [None]:
# DBSCAN search grid: np.linspace(1, 2, 2) gives the eps values 1.0 and 2.0,
# and range(5, 10, 5) gives the single min_samples value 5.
# Commented-out alternative grid: eps in np.linspace(3, 5, 5),
# min_samples in range(110, 150, 4).
eps_values = np.linspace(1, 2, 2)
min_samples_values = range(5, 10, 5)

# FP-Growth thresholds
min_support, min_confidence, min_lift = 0.1, 0.3, 3

# Run the whole pipeline on the bank data set.
initial_rules, cluster_specific_rules, best_labels = automated_clustering_and_rule_mining(
    df=bank_df,
    eps_values=eps_values,
    min_samples_values=min_samples_values,
    min_support=min_support,
    min_confidence=min_confidence,
    min_lift=min_lift,
)

# Report the global rules first, then the rules specific to each cluster.
print("Initial Rules Found:", initial_rules, sep="\n")

print("\nCluster Specific Rules:")
for cluster, rules in cluster_specific_rules.items():
    # sep="\n" puts each part on its own line; the trailing "\n" item plus
    # print's own newline leave the blank lines between clusters.
    print(f"Rules for Cluster {cluster}:", rules, "\n", sep="\n")

In [None]:
from sklearn.neighbors import NearestNeighbors
import numpy as np
import matplotlib.pyplot as plt

def plot_k_distance_graph(data, k):
    """Plot, in ascending order, each point's distance to its k-th returned neighbour.

    Args:
        data: Feature matrix the nearest-neighbour model is fitted on and
            queried with.
        k: Number of neighbours requested; column ``k - 1`` of the returned
            distance matrix is plotted.

    Returns:
        None. The figure is displayed.
    """
    nn_model = NearestNeighbors(n_neighbors=k).fit(data)
    # Only the distances are needed; the neighbour indices are discarded.
    neighbor_distances, _ = nn_model.kneighbors(data)

    kth_distances = neighbor_distances[:, k - 1]
    ordered = np.sort(kth_distances, axis=0)

    plt.figure(figsize=(12, 6))
    plt.plot(ordered)
    plt.title("K-distance Graph")
    plt.xlabel("Points sorted by distance")
    plt.ylabel(f"Distance to {k}-th nearest neighbor")
    plt.show()

# Example usage: standardize and encode the adult data with preprocess(),
# then draw its k-distance graph. k is typically set to the min_samples
# value under consideration for DBSCAN.
processed_data = preprocess(adult_df)
plot_k_distance_graph(data=processed_data, k=43)
