# In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr

# Load the customer table from the CSV file at this fixed path into `df`.
df = pd.read_csv('/content/AWCustomers.csv')

# Columns kept by select_features when the caller does not supply its own list.
DEFAULT_FEATURES = (
    'MaritalStatus', 'Gender', 'YearlyIncome', 'TotalChildren',
    'NumberChildrenAtHome', 'Education', 'Occupation', 'HomeOwnerFlag',
    'NumberCarsOwned', 'CommuteDistance', 'BikeBuyer',
)


def select_features(df, features=None):
    """Return a copy of *df* restricted to the requested feature columns.

    Args:
        df: Source DataFrame.
        features: Optional iterable of column names to keep. When omitted,
            ``DEFAULT_FEATURES`` is used.

    Returns:
        A new DataFrame containing only the requested columns that exist in
        *df*, in the order they were requested. Requested names that are not
        present in *df* are skipped without raising.
    """
    wanted = DEFAULT_FEATURES if features is None else features
    available_columns = df.columns.tolist()
    selected_features = [col for col in wanted if col in available_columns]
    # Return an explicit copy so that later column assignments on the result
    # are unambiguous and never trigger pandas' SettingWithCopyWarning.
    return df[selected_features].copy()

# Restrict the loaded table to the feature columns chosen by select_features.
df_selected = select_features(df)

def preprocess_data(df):
    """Impute, bin, scale and one-hot encode a copy of *df*.

    Steps, each applied only to the columns that are present:
      1. Missing values are replaced by the first mode of their column.
      2. ``YearlyIncome`` is cut into five equal-width bins labelled
         'Low' through 'High'.
      3. The count columns are min-max scaled and then standardised; the
         standardised values overwrite the min-max-scaled ones.
      4. The categorical columns (including the binned income) are expanded
         into indicator columns with ``pd.get_dummies``.

    Args:
        df: Input DataFrame. It is not modified.

    Returns:
        A new, processed DataFrame.
    """
    income_labels = ['Low', 'Medium-Low', 'Medium', 'Medium-High', 'High']
    count_candidates = ['TotalChildren', 'NumberChildrenAtHome', 'NumberCarsOwned']
    category_candidates = [
        'MaritalStatus', 'Gender', 'Education', 'Occupation',
        'CommuteDistance', 'YearlyIncome',
    ]

    frame = df.copy()

    # First row of mode() holds the most frequent value of every column.
    most_frequent = frame.mode().iloc[0]
    frame.fillna(most_frequent, inplace=True)

    if 'YearlyIncome' in frame.columns:
        frame['YearlyIncome'] = pd.cut(frame['YearlyIncome'], bins=5, labels=income_labels)

    to_scale = [name for name in count_candidates if name in frame.columns]
    if to_scale:
        frame[to_scale] = MinMaxScaler().fit_transform(frame[to_scale])
        frame[to_scale] = StandardScaler().fit_transform(frame[to_scale])

    to_encode = [name for name in category_candidates if name in frame.columns]
    if to_encode:
        frame = pd.get_dummies(frame, columns=to_encode)

    return frame

# Run the preprocessing pipeline on the selected feature columns.
df_processed = preprocess_data(df_selected)

def calculate_similarity(df, idx1, idx2):
    """Compute three similarity measures between two rows of *df*.

    Args:
        df: DataFrame whose columns are all numeric or boolean.
        idx1: Positional index of the first row.
        idx2: Positional index of the second row.

    Returns:
        A tuple ``(simple_matching, jaccard, cosine)``:
          * simple_matching: fraction of columns whose values are equal.
          * jaccard: sum of element-wise minima divided by the sum of
            element-wise maxima; NaN when that denominator is zero.
          * cosine: cosine similarity of the two rows.

    Raises:
        ValueError: If a row contains values that cannot be converted to float.

    NOTE: the min/max ratio is only a similarity in [0, 1] when every value
    is non-negative; rows that contain negative values (for example
    standardised columns) can yield results outside that range.
    """
    # A row drawn from columns of different dtypes comes back as an object
    # array; cast to float so the arithmetic below is plain numeric.
    row_a = df.iloc[idx1].to_numpy(dtype=float)
    row_b = df.iloc[idx2].to_numpy(dtype=float)

    simple_matching = np.mean(row_a == row_b)

    denominator = np.sum(np.maximum(row_a, row_b))
    if denominator == 0:
        # The ratio is undefined here; report NaN instead of dividing by zero.
        jaccard = float('nan')
    else:
        jaccard = np.sum(np.minimum(row_a, row_b)) / denominator

    cosine = cosine_similarity(row_a.reshape(1, -1), row_b.reshape(1, -1))[0][0]

    return simple_matching, jaccard, cosine

def calculate_correlation(df):
    """Pearson correlation between label-encoded CommuteDistance and YearlyIncome.

    Both columns are converted to integer codes with ``LabelEncoder`` and the
    Pearson coefficient of the two code vectors is returned. The codes follow
    the encoder's own class ordering, so the coefficient reflects that
    ordering rather than any domain ordering of the categories.

    Args:
        df: DataFrame expected to contain 'CommuteDistance' and
            'YearlyIncome'. It is not modified.

    Returns:
        The correlation coefficient, or None when either column is missing.
    """
    if 'CommuteDistance' not in df.columns or 'YearlyIncome' not in df.columns:
        return None

    # Encode into local arrays instead of writing "*_encoded" columns back
    # into the caller's frame, and use one encoder per column so neither fit
    # overwrites the other's fitted classes.
    commute_codes = LabelEncoder().fit_transform(df['CommuteDistance'])
    income_codes = LabelEncoder().fit_transform(df['YearlyIncome'])

    corr, _ = pearsonr(commute_codes, income_codes)
    return corr

# Similarity of the first two processed rows; None when fewer than two rows exist.
if len(df_processed) > 1:
    similarity_results = calculate_similarity(df_processed, 0, 1)
else:
    similarity_results = None

# Correlation is computed on the selected frame as it was before preprocessing;
# None unless both columns are present.
if 'CommuteDistance' in df_selected.columns and 'YearlyIncome' in df_selected.columns:
    correlation_result = calculate_correlation(df_selected)
else:
    correlation_result = None