In [3]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity

# --- Inputs ---------------------------------------------------------------
# Path of the customer table and the positional pair of rows that the
# similarity metrics further down compare.
CSV_PATH = "AWCustomers.csv"
PAIR_IDX = (0, 1)

df_raw = pd.read_csv(CSV_PATH, low_memory=False)

# Columns removed from the raw table whenever they are present. Judging by
# their names these are identifiers, contact details, dates and label
# duplicates rather than modelling features.
drop_if_present = [
    'CustomerID',
    'Title',
    'FirstName',
    'MiddleName',
    'LastName',
    'Suffix',
    'AddressLine1',
    'AddressLine2',
    'City',
    'StateProvinceName',
    'StateProvince',
    'CountryRegionName',
    'CountryRegion',
    'PostalCode',
    'PhoneNumber',
    'EmailAddress',
    'BirthDate',
    'DateFirstPurchase',
    'CompanyName',
    'GeographyKey',
    'RegionKey',
    'MaritalStatusLabel',
    'GenderLabel',
    'EducationLevelLabel',
    'OccupationLabel',
]

# Columns kept for analysis when the file provides them. Several entries are
# alternative spellings of one concept (e.g. HomeOwnerFlag / HouseOwnerFlag),
# so only the subset found in the CSV is used.
candidate_features = [
    'Age',
    'YearlyIncome',
    'AveMonthSpend',
    'NumberCarsOwned',
    'TotalChildren',
    'NumberChildrenAtHome',
    'Gender',
    'MaritalStatus',
    'HomeOwnerFlag',
    'HouseOwnerFlag',
    'Occupation',
    'EnglishOccupation',
    'Education',
    'EnglishEducation',
    'Region',
    'CommuteDistance',
    'BikeBuyer',
]

# Candidate features that the loaded file actually contains.
present_candidates = [col for col in candidate_features if col in df_raw.columns]

# Strip the unwanted columns first, then narrow to the candidates. When no
# candidate is present the frame keeps every column that survived the drop.
df = df_raw.drop(
    columns=[col for col in drop_if_present if col in df_raw.columns],
    errors='ignore',
)
if present_candidates:
    df = df[[col for col in present_candidates if col in df.columns]]

# Separate the target (when available) from the feature matrix.
if 'BikeBuyer' in df.columns:
    target_col = 'BikeBuyer'
    y = df[target_col].copy()
    X = df.drop(columns=[target_col])
else:
    target_col = None
    y = None
    X = df.copy()

# Partition the features by dtype: numeric columns vs. everything else.
numeric_cols = [col for col in X.columns if pd.api.types.is_numeric_dtype(X[col])]
categorical_cols = [col for col in X.columns if col not in numeric_cols]

# Median for numeric gaps, most frequent level for categorical gaps.
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

if numeric_cols:
    X_num = pd.DataFrame(
        num_imputer.fit_transform(X[numeric_cols]),
        columns=numeric_cols,
        index=X.index,
    )
else:
    # Keep an empty frame on the same index so the concat below still aligns.
    X_num = pd.DataFrame(index=X.index)

if categorical_cols:
    X_cat = pd.DataFrame(
        cat_imputer.fit_transform(X[categorical_cols]),
        columns=categorical_cols,
        index=X.index,
    )
else:
    X_cat = pd.DataFrame(index=X.index)

# Numeric columns first, categorical columns after.
X_imputed = pd.concat([X_num, X_cat], axis=1)

# Two scaled views of the imputed numeric features: min-max and z-score.
# Output columns carry a suffix naming the scaler that produced them.
minmax_scaler = MinMaxScaler()
std_scaler = StandardScaler()

# Start from empty frames on the feature index; fill them only when there is
# at least one numeric column to scale.
X_num_minmax = pd.DataFrame(index=X.index)
X_num_std = pd.DataFrame(index=X.index)
if numeric_cols:
    X_num_minmax = pd.DataFrame(
        minmax_scaler.fit_transform(X_imputed[numeric_cols]),
        columns=[f"{col}__minmax" for col in numeric_cols],
        index=X.index,
    )
    X_num_std = pd.DataFrame(
        std_scaler.fit_transform(X_imputed[numeric_cols]),
        columns=[f"{col}__std" for col in numeric_cols],
        index=X.index,
    )

# Holder for the discretised (binned) features added below.
X_disc = pd.DataFrame(index=X.index)

def safe_qcut(s, q=4, labels=None):
    """Bin ``s`` into quantile-based buckets, tolerating tied quantile edges.

    ``pd.qcut(..., duplicates='drop')`` raises when explicit ``labels`` are
    given and tied quantiles collapse some edges, because the label count no
    longer matches the bin count. This helper computes the quantile edges
    itself, removes duplicates, and trims ``labels`` to the number of bins
    that remain, so the result stays quantile-based instead of silently
    switching to equal-width bins.

    Parameters
    ----------
    s : pandas.Series
        Numeric values to discretise. Missing values are ignored when the
        edges are computed.
    q : int or sequence of float, default 4
        Number of quantile buckets, or the explicit quantiles in ``[0, 1]``.
    labels : sequence, False or None, default None
        Bucket labels in ascending order. When fewer buckets survive
        de-duplication, only the leading labels are used. ``None`` and
        ``False`` are passed to ``pd.cut`` unchanged.

    Returns
    -------
    pandas.Series
        The binned values produced by ``pd.cut``, or a constant ``'bin_0'``
        series when fewer than two distinct edges exist (e.g. constant or
        empty input).
    """
    # Same quantile grid pd.qcut uses: q + 1 evenly spaced points for a
    # scalar q, otherwise the quantiles supplied by the caller.
    if np.ndim(q) == 0:
        probs = np.linspace(0, 1, q + 1)
    else:
        probs = np.asarray(q, dtype=float)

    edges = np.asarray(s.quantile(probs), dtype=float)
    # Drop NaN edges (empty / all-missing input) and collapse tied edges.
    edges = np.unique(edges[~np.isnan(edges)])

    if len(edges) < 2:
        # No interval can be formed; put everything in one placeholder bin.
        return pd.Series(['bin_0'] * len(s), index=s.index)

    n_bins = len(edges) - 1
    if labels is None or labels is False:
        bin_labels = labels
    else:
        bin_labels = list(labels)[:n_bins]

    # include_lowest=True keeps the minimum inside the first bin, as qcut does.
    return pd.cut(s, bins=edges, labels=bin_labels, include_lowest=True)

# Quantile-binned features: (source column, output column, buckets, labels).
# A column is binned only when it was detected as numeric.
for _src, _dst, _buckets, _names in (
    ('Age', 'Age_bin', 4, ['Q1','Q2','Q3','Q4']),
    ('YearlyIncome', 'YearlyIncome_bin', 5, ['P20','P40','P60','P80','P100']),
):
    if _src in numeric_cols:
        X_disc[_dst] = safe_qcut(X_imputed[_src], q=_buckets, labels=_names)

# Commute distance gets five equal-width bins, and only if it is numeric.
if 'CommuteDistance' in numeric_cols:
    X_disc['CommuteDistance_bin'] = pd.cut(
        X_imputed['CommuteDistance'],
        bins=5,
        labels=['B1','B2','B3','B4','B5'],
        include_lowest=True,
    )

# Everything that must be one-hot encoded: the raw categorical columns plus
# the binned columns created above.
categoricals_to_encode = categorical_cols + [col for col in X_disc.columns if col.endswith('_bin')]
if not categoricals_to_encode:
    X_ohe = pd.DataFrame(index=X.index)
else:
    def cap_rare_levels(series, min_count=20):
        """Replace levels seen fewer than ``min_count`` times with 'Other'."""
        counts = series.value_counts(dropna=False)
        rare_levels = counts.loc[counts < min_count].index
        return series.mask(series.isin(rare_levels), 'Other')

    # Only the raw categorical columns are capped; binned columns are used as-is.
    if categorical_cols:
        X_cat_for_ohe = X_imputed[categorical_cols].copy()
    else:
        X_cat_for_ohe = pd.DataFrame(index=X.index)
    for col in X_cat_for_ohe.columns:
        X_cat_for_ohe[col] = cap_rare_levels(X_cat_for_ohe[col])

    X_disc_for_ohe = X_disc.copy()

    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    encoded = ohe.fit_transform(pd.concat([X_cat_for_ohe, X_disc_for_ohe], axis=1))
    encoded_names = ohe.get_feature_names_out(
        list(X_cat_for_ohe.columns) + list(X_disc_for_ohe.columns)
    )
    X_ohe = pd.DataFrame(encoded, columns=encoded_names, index=X.index)

# Final feature sets: min-max numerics with raw categoricals, and
# standardised numerics with the one-hot columns.
X_minmax_plus_cat = pd.concat([X_num_minmax, X_imputed[categorical_cols]], axis=1)
X_std_plus_ohe = pd.concat([X_num_std, X_ohe], axis=1)

# --- Pairwise similarity between two customers ----------------------------
# Only rows without missing values can be compared. All three metrics below
# index into this same filtered frame so they always describe the same pair.
X_sim = X_std_plus_ohe.dropna(axis=0, how='any')
if len(X_sim) < 2:
    raise ValueError("Need at least two complete rows to compute pairwise similarity.")

# PAIR_IDX is positional within X_sim; fall back to the first two rows when
# either position is out of range.
i, j = PAIR_IDX
if j >= len(X_sim) or i >= len(X_sim):
    i, j = 0, 1

row_i = X_sim.iloc[[i]].to_numpy()
row_j = X_sim.iloc[[j]].to_numpy()

cos_sim = float(cosine_similarity(row_i, row_j)[0,0])

# Binary view for SMC / Jaccard: a standardised numeric becomes 1 when it is
# above the column mean (z-score > 0); one-hot columns are already 0/1.
# Built from X_sim, not X_std_plus_ohe, so positions i and j refer to the same
# rows that the cosine similarity used even if dropna removed rows.
X_bin = X_sim.copy()
for c in X_num_std.columns:
    X_bin[c] = (X_bin[c] > 0).astype(int)

bi = X_bin.iloc[i].to_numpy().astype(int)
bj = X_bin.iloc[j].to_numpy().astype(int)

# SMC counts agreements on both 0 and 1; Jaccard only counts shared 1s.
matches = np.sum(bi == bj)
smc = matches / bi.size
intersection = np.sum((bi == 1) & (bj == 1))
union = np.sum((bi == 1) | (bj == 1))
jaccard = (intersection / union) if union != 0 else 0.0

print(f"Simple Matching Coefficient (SMC): {smc:.4f}")
print(f"Jaccard Similarity: {jaccard:.4f}")
print(f"Cosine Similarity: {cos_sim:.4f}")

def map_commute_distance_to_miles(series):
    """Convert commute-distance values to a numeric distance in miles.

    Numeric input is returned as float unchanged. Text input is first looked
    up in a table of known range labels; anything not in the table is parsed
    after removing the unit ("mile"/"miles", any letter case) and a trailing
    "+": an "a-b" range becomes its midpoint, a single number is used as is,
    and everything else becomes NaN.

    Parameters
    ----------
    series : pandas.Series
        Numeric distances or text labels such as ``'2-5 Miles'``.

    Returns
    -------
    pandas.Series
        Float distances on the same index as ``series``; NaN where a value
        could not be interpreted.
    """
    mapping = {
        '0-1 Miles': 0.5,'1-2 Miles': 1.5,'2-5 Miles': 3.5,'5-10 Miles': 7.5,'10+ Miles': 15.0,
        '0-5 Miles': 2.5,'1-5 Miles': 3.0
    }
    if pd.api.types.is_numeric_dtype(series):
        return series.astype(float)

    def _parse_distance(text):
        # "a-b" -> midpoint, "a" -> a; any other shape is unparseable.
        parts = text.split('-')
        if len(parts) > 2:
            return np.nan
        try:
            numbers = [float(p) for p in parts]
        except ValueError:
            return np.nan
        return sum(numbers) / len(numbers)

    s = series.astype(str).str.strip()
    out = s.map(mapping)
    mask_na = out.isna()
    if mask_na.any():
        # Strip the unit case-insensitively, including the plural "s", so
        # labels like "1-2 miles" do not leave stray characters behind.
        tmp = (
            s[mask_na]
            .str.replace(r'\s*miles?', '', case=False, regex=True)
            .str.replace('+', '', regex=False)
            .str.strip()
        )
        # Assign positionally (list) so duplicate index labels cannot misalign.
        out.loc[mask_na] = [_parse_distance(val) for val in tmp]
    return pd.to_numeric(out, errors='coerce')

# --- Correlation between commute distance and income ----------------------
# Pick the first column whose name suggests each quantity (None when absent).
commute_col = next((c for c in df.columns if "Commute" in c or "Distance" in c), None)
income_col = next((c for c in df.columns if "Income" in c), None)

if commute_col and income_col:
    # Prefer the imputed values; fall back to the raw frame otherwise.
    commute_series = X_imputed[commute_col] if commute_col in X_imputed.columns else df[commute_col]
    income_series = X_imputed[income_col] if income_col in X_imputed.columns else df[income_col]

    commute_numeric = map_commute_distance_to_miles(commute_series)
    income_numeric = pd.to_numeric(income_series, errors='coerce')

    # Keep only rows where both quantities are available.
    corr_df = pd.DataFrame({'CommuteMiles': commute_numeric, 'YearlyIncome': income_numeric}).dropna()
    if len(corr_df) >= 2:
        pearson_corr = corr_df['CommuteMiles'].corr(corr_df['YearlyIncome'], method='pearson')
        spearman_corr = corr_df['CommuteMiles'].corr(corr_df['YearlyIncome'], method='spearman')
        print(f"Pearson correlation: {pearson_corr:.4f}")
        print(f"Spearman correlation: {spearman_corr:.4f}")
    else:
        print("Not enough valid rows to compute correlation.")
elif not commute_col:
    print("Commute distance column not found in dataset. Correlation cannot be computed.")
else:
    # Report the column that is actually missing rather than blaming commute.
    print("Income column not found in dataset. Correlation cannot be computed.")

# Persist both processed feature sets, and the target when one exists.
# The row index is not written to any of the files.
for _frame, _out_path in (
    (X_minmax_plus_cat, "processed_minmax_plus_cat.csv"),
    (X_std_plus_ohe, "processed_std_plus_ohe.csv"),
):
    _frame.to_csv(_out_path, index=False)

if target_col is not None:
    y.to_csv("target_BikeBuyer.csv", index=False)

Simple Matching Coefficient (SMC): 0.8750
Jaccard Similarity: 0.7273
Cosine Similarity: 0.6065
Commute distance column not found in dataset. Correlation cannot be computed.
