In [69]:
import pandas as pd

# Read the prepared Adult dataset from disk and preview its first record.
# (Judging by the displayed columns, the file holds scaled numeric features
# plus one-hot / binned indicator columns -- confirm against the cell that
# produced 'adult_prepared.csv'.)
adult_df = pd.read_csv(filepath_or_buffer='adult_prepared.csv')
adult_df.head(n=1)

Unnamed: 0,age,fnlwgt,education-num,hours-per-week,positive_capital_gain,positive_capital_loss,age_education_interaction,workclass_Federal-gov,workclass_Local-gov,workclass_Private,...,age_binned_26-35,age_binned_36-45,age_binned_46-55,age_binned_56-65,age_binned_66+,hours_per_week_binned_0-20,hours_per_week_binned_21-30,hours_per_week_binned_31-40,hours_per_week_binned_41-50,hours_per_week_binned_51+
0,0.30137,0.043338,0.8,0.397959,1.0,0.0,0.366642,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [65]:
# Raw Adult census file: it is read with an explicit header because the
# column names are supplied here via `names=`.
file_path = 'adult.data'
columns = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'income',
]

# "?" cells are parsed as NaN, and whitespace following each delimiter is
# skipped so values do not carry a leading blank.
data = pd.read_csv(
    file_path,
    names=columns,
    na_values="?",
    skipinitialspace=True,
)

data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [63]:
def map_one_hot_to_transactions(row):
    """Turn one encoded row into a list of transaction items.

    The row's own labels are inspected (rather than a global DataFrame), so
    the function works for any row-like object exposing ``.items()`` -- for
    example the Series passed by ``DataFrame.apply(..., axis=1)``.

    Rules, applied to every ``(column, value)`` pair whose value equals 1:

    * Engineered columns -- names containing ``'_aggregated_'`` or
      ``'_binned_'``, names starting with ``'positive_'``, or the column
      ``'age_education_interaction'`` -- are emitted under their full name.
    * Any other column containing an underscore is treated as a plain
      one-hot column and emitted as ``"<feature>=<value>"``, splitting on the
      last underscore.
    * Columns without an underscore are ignored.

    Parameters
    ----------
    row : pandas.Series (or any mapping with ``.items()``)
        One record; keys are column names, values are the encoded cells.

    Returns
    -------
    list of str
        The items present in this record, in column order.
    """
    transaction = []
    for col, value in row.items():
        # Only cells that are exactly 1 mark an item as present; 0, NaN and
        # fractional values are skipped.
        if value != 1:
            continue

        is_engineered = (
            '_aggregated_' in col
            or '_binned_' in col
            or col.startswith('positive_')
            or col == 'age_education_interaction'
        )
        # NOTE(review): 'age_education_interaction' looks like a continuous,
        # scaled feature in the preview output; it therefore only becomes an
        # item when it is exactly 1. Confirm this is intended.
        if is_engineered:
            # The column name is already meaningful on its own.
            transaction.append(col)
        elif '_' in col:
            # Standard one-hot column: rebuild the "feature=value" form.
            feature, category = col.rsplit('_', 1)
            transaction.append(f"{feature}={category}")
        # Columns without an underscore (plain numeric features) are
        # deliberately left out of the transaction.
    return transaction

# Build one transaction (list of items) per record by running the mapper
# across the rows of the prepared frame.
transactions_adult = adult_df.apply(
    func=map_one_hot_to_transactions,
    axis='columns',
)


In [64]:
from mlxtend.frequent_patterns import fpgrowth, association_rules
from mlxtend.preprocessing import TransactionEncoder


# Learn the item vocabulary from the transactions, then encode every
# transaction as one row of per-item flags.
encoder = TransactionEncoder()
encoder.fit(transactions_adult)
encoded_transactions = encoder.transform(transactions_adult)

# Wrap the encoded matrix in a DataFrame whose columns are the item names.
transactions_df = pd.DataFrame(
    data=encoded_transactions,
    columns=encoder.columns_,
)

# Mine itemsets with FP-Growth, keeping those whose support is at least 0.05.
frequent_itemsets = fpgrowth(
    transactions_df,
    min_support=0.05,
    use_colnames=True,
)

# Derive association rules, filtered on a confidence threshold of 0.3.
rules = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=0.3,
)

# Show the key columns of every rule.
print(rules.loc[:, ['antecedents', 'consequents', 'support', 'confidence', 'lift']])


                                             antecedents  \
0                                           (race=White)   
1              (native_country_aggregated_United-States)   
2                                             (sex=Male)   
3              (native_country_aggregated_United-States)   
4                                           (race=White)   
...                                                  ...   
17887     (occupation_aggregated_Craft-repair, sex=Male)   
17888  (marital-status=Married-civ-spouse, occupation...   
17889  (occupation_aggregated_Craft-repair, relations...   
17890  (occupation_aggregated_Craft-repair, native_co...   
17891               (occupation_aggregated_Craft-repair)   

                                             consequents   support  \
0              (native_country_aggregated_United-States)  0.802931   
1                                           (race=White)  0.802931   
2              (native_country_aggregated_United-States)  0.615742   

In [71]:
# Load the semicolon-delimited bank marketing data and drop rows with missing
# values. `dropna()` returns a new frame, so its result must be assigned --
# calling it on its own line only displays the cleaned copy and leaves
# `bank_df` unchanged.
bank_df = pd.read_csv('bank.csv', delimiter=';').dropna()
bank_df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4516,33,services,married,secondary,no,-333,yes,no,cellular,30,jul,329,5,-1,0,unknown,no
4517,57,self-employed,married,tertiary,yes,-3313,yes,yes,unknown,9,may,153,1,-1,0,unknown,no
4518,57,technician,married,secondary,no,295,no,no,cellular,19,aug,151,11,-1,0,unknown,no
4519,28,blue-collar,married,secondary,no,1137,no,no,cellular,6,feb,129,4,211,3,other,no


In [79]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np

# `bank_df` is the bank marketing frame loaded in an earlier cell.

# Step 2: Aggregate categories (Example)
# Jobs whose frequency is above the 25th percentile of the per-job counts are
# kept as-is; every other job is collapsed into the single label 'Other'.
job_counts = bank_df['job'].value_counts()
top_jobs = job_counts[job_counts > job_counts.quantile(0.25)].index
bank_df['job_grouped'] = bank_df['job'].where(bank_df['job'].isin(top_jobs), 'Other')

# Step 3: Binning (Example)
# NOTE(review): 'age_binned' is created here but is not listed in either
# feature list below, so it does not reach the preprocessor -- confirm.
bank_df['age_binned'] = pd.cut(bank_df['age'], bins=[0, 30, 40, 50, 60, np.inf], labels=['0-30', '31-40', '41-50', '51-60', '60+'])

# DBSCAN Preparation: Standardize numerical and one-hot encode categorical
# NOTE(review): 'day' appears in neither list, so the ColumnTransformer's
# default remainder handling drops it -- confirm this is intended.
numerical_features = ['age', 'balance', 'duration', 'campaign', 'pdays', 'previous']
categorical_features = ['job_grouped', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ],
    # OneHotEncoder yields sparse output, and with the default threshold the
    # stacked result switches between dense and sparse depending on overall
    # density. Force a dense array so downstream `pd.DataFrame(X_dbscan, ...)`
    # and row indexing always work.
    sparse_threshold=0,
)


# Fit the preprocessor and transform every column except the target 'y'
X_dbscan = preprocessor.fit_transform(bank_df.drop(['y'], axis=1))

# Retrieve the column names
def get_feature_names(column_transformer):
    """Generate output feature names from a fitted ColumnTransformer.

    Walks ``column_transformer.transformers_`` in order and collects one name
    per output column:

    * transformers exposing ``categories_`` (e.g. a fitted OneHotEncoder)
      contribute ``"<column>=<category>"`` for every category of every input
      column;
    * any other transformer contributes its input column names unchanged;
    * the entry named ``"remainder"`` and entries whose transformer is
      ``'drop'`` contribute nothing.

    If a transformer is a Pipeline (has ``named_steps``), its last step is
    inspected instead.

    Parameters
    ----------
    column_transformer : object
        A fitted ColumnTransformer-like object whose ``transformers_``
        attribute yields ``(name, transformer, columns)`` triples.

    Returns
    -------
    list
        Feature names in output-column order.
    """
    output_features = []

    for name, transformer, orig_features in column_transformer.transformers_:
        # Skip 'remainder' transformer, if present
        if name == "remainder":
            continue

        if transformer == 'drop':
            continue

        # A column selector given as a single bare string must count as ONE
        # column; without this, extend()/zip() below would iterate over its
        # characters and emit one "feature" per letter.
        if isinstance(orig_features, str):
            orig_features = [orig_features]

        # Handle transformers within a Pipeline: the last step is the one
        # that determines the output columns.
        if hasattr(transformer, 'named_steps'):
            transformer = list(transformer.named_steps.values())[-1]

        if hasattr(transformer, 'categories_'):
            # One output column per (input column, category) pair.
            for orig_feature, cats in zip(orig_features, transformer.categories_):
                output_features.extend(f"{orig_feature}={cat!s}" for cat in cats)
        else:
            # Column-preserving transformer: names pass through unchanged.
            output_features.extend(orig_features)

    return output_features


# Derive a readable name for every column produced by the fitted preprocessor
new_columns = get_feature_names(preprocessor)

# Label the transformed matrix with those names (X_dbscan is the array
# returned by the preprocessor for the bank data)
X_dbscan_df = pd.DataFrame(data=X_dbscan, columns=new_columns)


       age   balance  duration  campaign     pdays  previous  \
0 -1.05627  0.121072 -0.711861 -0.576829 -0.407218 -0.320413   

   job_grouped=Other  job_grouped=admin.  job_grouped=blue-collar  \
0                0.0                 0.0                      0.0   

   job_grouped=entrepreneur  ...  month=jun  month=mar  month=may  month=nov  \
0                       0.0  ...        0.0        0.0        0.0        0.0   

   month=oct  month=sep  poutcome=failure  poutcome=other  poutcome=success  \
0        1.0        0.0               0.0             0.0               0.0   

   poutcome=unknown  
0               1.0  

[1 rows x 48 columns]


In [75]:
# Inspect the first row of the preprocessed matrix as a raw array.
X_dbscan[0]

array([-1.05626965,  0.12107186, -0.7118608 , -0.57682947, -0.4072183 ,
       -0.32041282,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        1.        ,  0.        ,  1.        ,  0.        ,  1.        ,
        0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
        1.        ,  0.        ,  1.        ,  0.        ,  1.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  1.        ])