In [None]:
import numpy as np
import pandas as pd 
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Load the dataset from the Kaggle input directory into VIP_data.
VIP_data = pd.read_csv("/kaggle/input/santandar-processed-data/fdata.csv")

Keep a random sample of 5,000,000 rows of the VIP data (drawn with `random_state=42`), not the first 5,000,000 rows.

In [None]:
import pandas as pd
# Draw a reproducible random sample of at most 5,000,000 rows.
# Capping n at the frame length avoids the ValueError that sampling without
# replacement raises when the file has fewer rows than requested.
VIP_data = VIP_data.sample(n=min(5_000_000, len(VIP_data)), random_state=42)

In [None]:
from sklearn.preprocessing import LabelEncoder
# Columns holding dates; each one is parsed into a pandas datetime column.
date_columns = ['data_date', 'registration_date']
for date_col in date_columns:
    print(date_col)
    parsed = pd.to_datetime(VIP_data[date_col])
    VIP_data[date_col] = parsed

In [None]:
# Replace each parsed date column by the whole number of seconds elapsed
# since 1970-01-01.
unix_epoch = pd.Timestamp("1970-01-01")
one_second = pd.Timedelta('1s')
for date_col in date_columns:
    print(date_col)
    VIP_data[date_col] = (VIP_data[date_col] - unix_epoch) // one_second

In [None]:
# Categorical columns that are converted to integer codes.
categorical_columns = [
    'customer_relation_type_beginning_month',
    'residence_index',
    'foreigner_index',
    'channel_used',
    'province_name',
    'segmentation',
]

# One fitted encoder is kept per column so codes can be mapped back later.
label_encoders = {name: LabelEncoder() for name in categorical_columns}
for name, encoder in label_encoders.items():
    print(name)
    # Values are cast to str before encoding, so every value is encoded as text.
    VIP_data[name] = encoder.fit_transform(VIP_data[name].astype(str))

In [None]:
# Columns that are not used in the rest of the analysis.
columns_to_remove = [
    'employee_index', 'country_residence', 'new_customer_index',
    'primary_customer', 'customer_type_beginning_month',
    'customer_relation_type_beginning_month', 'residence_index',
    'foreigner_index', 'registration_date'
]
# Reassign instead of dropping in place, and ignore names that are already
# gone, so re-running this cell does not raise KeyError.
VIP_data = VIP_data.drop(columns=columns_to_remove, errors="ignore")

In [None]:
VIP_data.isnull().sum()

In [None]:
# Keep only rows without any missing value; VIP_data itself is left unchanged.
# NOTE(review): the label-encoded columns were cast to str before encoding, so
# missing values there presumably became an ordinary 'nan' class and no longer
# cause a row to be dropped here — confirm this is intended.
VIP_data_cleaned = VIP_data.dropna()

In [None]:
VIP_data_cleaned.isnull().sum()

In [None]:
# Every candidate feature column, before exclusions.
columns_for_clustering = [
    'data_date', 'employee_index', 'country_residence', 'gender',
    'registration_date', 'new_customer_index', 'seniority',
    'primary_customer', 'customer_type_beginning_month',
    'customer_relation_type_beginning_month', 'residence_index',
    'foreigner_index', 'channel_used', 'province_code', 'province_name',
    'activity_index', 'gross_income_household', 'segmentation', 'age_group',
]
print(f"no of cols for clustering: {len(columns_for_clustering)}")

# Columns excluded from clustering.
cols_to_remove = [
    'employee_index', 'country_residence', 'new_customer_index',
    'primary_customer', 'customer_type_beginning_month',
    'customer_relation_type_beginning_month', 'residence_index',
    'foreigner_index', 'registration_date',
]
print(f"no of cols to remove: {len(cols_to_remove)}")

# Filter out the excluded names while preserving the original order.
kept_columns = []
for candidate in columns_for_clustering:
    if candidate in cols_to_remove:
        continue
    kept_columns.append(candidate)
columns_for_clustering = kept_columns

# Display the updated list
print(f"cols for clustering: {columns_for_clustering}")
print(f"no of cols for clustering: {len(columns_for_clustering)}")

In [None]:
from sklearn.cluster import KMeans

# Product columns; later selected from the cleaned frame as `y`.
additional_labels = [
    'prod_savings_account',
    'prod_guarantees',
    'prod_current_accounts',
    'prod_derivative_account',
    'prod_payroll_account',
    'prod_junior_account',
    'prod_mas_particular_account',
    'prod_particular_account',
    'prod_particular_plus_account',
    'prod_short_term_deposits',
    'prod_medium_term_deposits',
    'prod_long_term_deposits',
    'prod_e_account',
    'prod_funds',
    'prod_mortgage',
    'prod_pensions1',
    'prod_loans',
    'prod_taxes',
    'prod_credit_card',
    'prod_securities',
    'prod_home_account',
    'prod_payroll',
    'prod_pensions2',
    'prod_direct_debit',
]
print(f"number of products: {len(additional_labels)}")

In [None]:
# X is a second name for the cleaned frame (plain assignment, no copy is made).
X = VIP_data_cleaned

In [None]:
# X_ holds the clustering feature columns, y the product columns.
X_ = X[columns_for_clustering]
y = X[additional_labels]

In [None]:
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows; random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_, y, test_size=0.2, random_state=42)

In [None]:
# If y_train is a Series, promote it to a one-column DataFrame before joining.
y_train = y_train.to_frame() if isinstance(y_train, pd.Series) else y_train

# Training frame used for clustering: feature columns followed by product columns.
X1 = pd.concat([X_train, y_train], axis=1)

In [None]:
X1.head()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import warnings
# Silence library warnings while the candidate models are fitted.
warnings.filterwarnings("ignore")

# Fit one KMeans model per candidate cluster count (1..10) and record its
# inertia, i.e. the within-cluster sum of squares.
k_range = range(1, 11)
wcss = []
for k in k_range:
    print(k, end="")
    kmeans = KMeans(n_clusters=k, random_state=42).fit(X1)
    wcss.append(kmeans.inertia_)

# Elbow plot: WCSS against the number of clusters.
plt.plot(k_range, wcss, marker='o')
plt.xticks(np.arange(1, 11, 1))
plt.xlabel('Number of Clusters')
plt.ylabel('Within-Cluster Sum of Squares (WCSS)')
plt.title('Elbow Method')
plt.grid(True)
plt.show()

In [None]:
# Fit the final KMeans model with 3 clusters on the training frame X1
# (clustering features plus product columns).
# NOTE(review): no feature scaling is applied anywhere above, so columns with
# large magnitudes (e.g. data_date, stored as epoch seconds) will likely
# dominate the distances KMeans uses — confirm this is intended.
n_clusters = 3
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(X1)

In [None]:
# Assign every training row to a cluster and store the label on X_train.
# X1 was built by concatenating X_train and y_train column-wise, so the
# predicted labels line up with X_train's rows.
cluster_labels = kmeans.predict(X1)
X_train['cluster_labels'] = cluster_labels

In [None]:
X_train['cluster_labels'].value_counts()

In [None]:
import pandas as pd
# Show one randomly chosen row of VIP_data as "column: value" lines.
random_row_dict = VIP_data.sample(n=1).to_dict(orient='records')
sampled_record = random_row_dict[0]
for field in sampled_record:
    print(f"{field}: {sampled_record[field]}")

In [None]:
# Held-out frame: test features followed by the test product columns.
X2 = pd.concat([X_test, y_test], axis=1)

In [None]:
X2.head()

In [None]:
# columns_to_sum = ['prod_savings_account', 'prod_guarantees', 'prod_current_accounts', 'prod_derivative_account',
#                   'prod_payroll_account', 'prod_junior_account', 'prod_mas_particular_account',
#                   'prod_particular_account', 'prod_particular_plus_account', 'prod_short_term_deposits',
#                   'prod_medium_term_deposits', 'prod_long_term_deposits', 'prod_e_account', 'prod_funds',
#                   'prod_mortgage', 'prod_pensions1', 'prod_loans', 'prod_taxes', 'prod_credit_card',
#                   'prod_securities', 'prod_home_account', 'prod_payroll', 'prod_pensions2', 'prod_direct_debit']


# for index, row in X2.iterrows():
#     row_sum = row[columns_to_sum].sum()
#     if row_sum > 4:
#         print("Row at index {} satisfies the condition:".format(index))
#         print(row)
#         break
# else:
#     print("No row found satisfying the condition.")


In [None]:
# import pandas as pd
# from tqdm import tqdm
# columns_to_sum = ['prod_savings_account', 'prod_guarantees', 'prod_current_accounts', 'prod_derivative_account',
#                   'prod_payroll_account', 'prod_junior_account', 'prod_mas_particular_account',
#                   'prod_particular_account', 'prod_particular_plus_account', 'prod_short_term_deposits',
#                   'prod_medium_term_deposits', 'prod_long_term_deposits', 'prod_e_account', 'prod_funds',
#                   'prod_mortgage', 'prod_pensions1', 'prod_loans', 'prod_taxes', 'prod_credit_card',
#                   'prod_securities', 'prod_home_account', 'prod_payroll', 'prod_pensions2', 'prod_direct_debit']

# satisfying_rows = []

# for index, row in tqdm(X2.iterrows(), total=len(X2)):
#     row_sum = row[columns_to_sum].sum()
#     if row_sum > 5:
#         satisfying_rows.append(row)
#     if index == 300000:
#         break
# df_satisfying = pd.DataFrame(satisfying_rows)

In [None]:
# Load a previously saved df_satisfying frame from the Kaggle input directory.
# NOTE(review): presumably produced by the commented-out filtering loop over X2
# in the preceding cell — confirm the file's columns match the training frame.
df_satisfying = pd.read_csv('/kaggle/input/df-satisfying/df_satisfying.csv')

In [None]:
# Predict on exactly the columns the model was fitted on (X1), in training
# order. This ignores any extra column in the CSV and keeps the cell
# re-runnable after 'cluster_labels' has been added to df_satisfying.
cluster_labels = kmeans.predict(df_satisfying[X1.columns])
df_satisfying['cluster_labels'] = cluster_labels

In [None]:
df_satisfying.head()

In [None]:
df_satisfying['cluster_labels'].value_counts()

In [None]:
# Feature values of the single new user, one scalar per clustering column.
feature_values = {
    'data_date': 1430179200.0,
    'gender': 0.0,
    'seniority': 200.0,
    'channel_used': 22.0,
    'province_code': 28.0,
    'province_name': 30.0,
    'activity_index': 1.0,
    'gross_income_household': 116833.62,
    'segmentation': 2.0,
    'age_group': 2.0,
}

# Wrap each scalar in a one-element list so the frame has exactly one row.
data = {name: [value] for name, value in feature_values.items()}

new_user = pd.DataFrame(data)

In [None]:
# rows_from_cluster_0 = df_satisfying[df_satisfying['cluster_labels'] == 0]

In [None]:
# row_from_cluster_0 = rows_from_cluster_0.sample(n=1)
# row_from_cluster_0.columns

In [None]:
# columns_to_keep = [
#     'data_date',
#     'gender',
#     'seniority',
#     'channel_used',
#     'province_code',
#     'province_name',
#     'activity_index',
#     'gross_income_household',
#     'segmentation',
#     'age_group'
# ]

In [None]:
# new_user = row_from_cluster_0[columns_to_keep]
# new_user

In [None]:
# Print the new user's single row as "column: value" lines.
for col in new_user:
    first_value = new_user[col].iloc[0]
    print(f"{col}: {first_value}")

In [None]:
# Products the new user already holds; each is flagged with 1.
new_user_already_used = dict.fromkeys(
    ["prod_payroll_account", "prod_pensions2", "prod_payroll", "prod_pensions1"],
    1,
)

In [None]:
# One entry per product column: the stored flag for products the user already
# holds, 0 for every other product.
new_user_product_values = {}
for label in additional_labels:
    new_user_product_values[label] = new_user_already_used.get(label, 0)

In [None]:
import warnings
# Suppress every warning for the rest of the session.
warnings.filterwarnings("ignore")

In [None]:
# Append one column per product to new_user, in the order of new_user_product_values.
for product_name in new_user_product_values:
    new_user[product_name] = new_user_product_values[product_name]

In [None]:
new_user.head()

In [None]:
new_user.columns

In [None]:
# Cluster of the new user. new_user lists the clustering features first and the
# product columns after them, the same layout X1 was built with.
cluster_labels = kmeans.predict(new_user)

In [None]:
# Training rows that fall in the same cluster as the new user.
similar_users = X_train[X_train['cluster_labels'] == cluster_labels[0]]

In [None]:
similar_users.head()

In [None]:
similar_users["cluster_labels"].value_counts()

In [None]:
# Look the same rows up in X (the cleaned frame) by index label, so the result
# carries X's columns in X's column order and has no cluster_labels column.
similar_users_X = X.loc[similar_users.index]

In [None]:
similar_users_X.columns

In [None]:
# Target column order: nine feature columns, the 24 product columns, and
# 'age_group' last.
# NOTE(review): presumably copied from similar_users_X.columns shown in the
# previous cell — verify it still matches.
leading_feature_columns = [
    'data_date', 'gender', 'seniority', 'channel_used', 'province_code',
    'province_name', 'activity_index', 'gross_income_household',
    'segmentation',
]
product_block_columns = [
    'prod_savings_account', 'prod_guarantees',
    'prod_current_accounts', 'prod_derivative_account',
    'prod_payroll_account', 'prod_junior_account',
    'prod_mas_particular_account', 'prod_particular_account',
    'prod_particular_plus_account', 'prod_short_term_deposits',
    'prod_medium_term_deposits', 'prod_long_term_deposits',
    'prod_e_account', 'prod_funds', 'prod_mortgage', 'prod_pensions1',
    'prod_loans', 'prod_taxes', 'prod_credit_card', 'prod_securities',
    'prod_home_account', 'prod_payroll', 'prod_pensions2',
    'prod_direct_debit',
]
similar_user_columns = leading_feature_columns + product_block_columns + ['age_group']

# The new user's row with its columns rearranged into that order.
new_user_reordered = new_user.loc[:, similar_user_columns]

In [None]:
new_user_reordered.columns

# Cosine Similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute cosine similarity between the new user and each row of similar_users_X.
# new_user was built with 'age_group' before the product columns, while the
# column list used for reordering places 'age_group' last. Comparing the raw
# .values would therefore pair up unrelated columns, so the new user is first
# re-indexed to similar_users_X's own column order.
# NOTE(review): features are unscaled, so large-magnitude columns likely
# dominate the similarity — confirm this is intended.
new_user_aligned = new_user[similar_users_X.columns]
cosine_similarities = cosine_similarity(new_user_aligned.values.reshape(1, -1), similar_users_X.values)

# Flatten the (1, n_rows) result into a 1-D array of scores
cosine_similarities_flat = cosine_similarities.flatten()

# Positions of rows in similar_users_X, most similar first
sorted_indices = cosine_similarities_flat.argsort()[::-1]

# Scores in the same (descending) order
sorted_cosine_similarities = cosine_similarities_flat[sorted_indices]

# Keep the 100 most similar rows (positional indices) and their scores
top_similar_users_indices = sorted_indices[:100]
top_cosine_similarity_scores = sorted_cosine_similarities[:100]


for i, (index, cosine_sim) in enumerate(zip(top_similar_users_indices, top_cosine_similarity_scores), 1):
    print(f"Top {i}: Row Index = {index}, Cosine Similarity Score = {cosine_sim}")


In [None]:
# First 100 positions of the current ranking (sorted_indices), then the
# matching rows of similar_users_X selected by position.
top_similar_users_indices = sorted_indices[:100]

top_similar_users = similar_users_X.iloc[top_similar_users_indices]

In [None]:
top_similar_users

In [None]:
# Product columns that receive a recommendation score.
product_columns = [
    'prod_savings_account', 'prod_guarantees', 'prod_current_accounts',
    'prod_derivative_account', 'prod_payroll_account', 'prod_junior_account',
    'prod_mas_particular_account', 'prod_particular_account',
    'prod_particular_plus_account', 'prod_short_term_deposits',
    'prod_medium_term_deposits', 'prod_long_term_deposits',
    'prod_e_account', 'prod_funds', 'prod_mortgage', 'prod_pensions1',
    'prod_loans', 'prod_taxes', 'prod_credit_card', 'prod_securities',
    'prod_home_account', 'prod_payroll', 'prod_pensions2', 'prod_direct_debit',
]

# For every product, add up each top neighbour's value in that column weighted
# by the neighbour's cosine similarity to the new user.
column_scores = dict.fromkeys(product_columns, 0)
for position, weight in zip(top_similar_users_indices, top_cosine_similarity_scores):
    neighbour = similar_users_X.iloc[position]
    for col in product_columns:
        column_scores[col] = column_scores[col] + neighbour[col] * weight

for col in column_scores:
    print(f"Column: {col}, Score: {column_scores[col]}")

In [None]:
# Rank products by cosine-weighted score and print those among the top 15 that
# the new user does not already hold, skipping scores that are <= 0.
items = sorted(column_scores.items(), key=lambda pair: pair[1], reverse=True)
for col, score in items[:15]:
    if not (col in new_user_already_used or score <= 0):
        print(f"Column: {col}, Score: {score}")

# Pairwise distance

In [None]:
from sklearn.metrics import pairwise_distances

# Compute pairwise distances between the new user and each row of similar_users_X.
# new_user was built with 'age_group' before the product columns, while the
# column list used for reordering places 'age_group' last. The new user is
# therefore re-indexed to similar_users_X's column order before the raw
# .values are compared; otherwise unrelated columns are subtracted.
new_user_aligned = new_user[similar_users_X.columns]
distances = pairwise_distances(new_user_aligned.values.reshape(1, -1), similar_users_X.values, metric='euclidean')

# Flatten the (1, n_rows) result into a 1-D array of distances
distances_flat = distances.flatten()

# Positions of rows in similar_users_X, nearest first
sorted_indices = distances_flat.argsort()

# Distances in the same (ascending) order
sorted_distances = distances_flat[sorted_indices]

# Keep the 100 nearest rows (positional indices) and their distances
top_similar_users_indices = sorted_indices[:100]
top_distances = sorted_distances[:100]

column_scores_d = {col: 0 for col in product_columns}


# Score each product by the neighbours' values in that column, weighted by
# inverse distance; 1e-6 avoids division by zero for an identical row.
for index, distance in zip(top_similar_users_indices, top_distances):
    row = similar_users_X.iloc[index]
    for col in product_columns:
        score = row[col] / (distance + 1e-6)
        column_scores_d[col] += score

for col, score in column_scores_d.items():
    print(f"Column: {col}, Score: {score}")


In [None]:
# Print the scores for each column
# Rank products by distance-weighted score and print those among the top 15
# that the new user does not already hold, skipping scores that are <= 0.
items = sorted(column_scores_d.items(), key=lambda pair: pair[1], reverse=True)
for col, score in items[:15]:
    if not (col in new_user_already_used or score <= 0):
        print(f"Column: {col}, Score: {score}")

# Category Based Recommendation

In [None]:
# Mapping from a product category name to the product columns that belong to it.
# Each product column listed here appears in exactly one category.
bank_products = {
    "Accounts and Deposits": [
        "prod_savings_account",
        "prod_junior_account",
        "prod_mas_particular_account",
        "prod_particular_account",
        "prod_particular_plus_account",
        "prod_home_account",
        "prod_e_account",
        "prod_current_accounts",
        "prod_payroll_account"
    ],
    "Investment Products": [
        "prod_funds",
        "prod_securities",
        "prod_derivative_account",
        "prod_short_term_deposits",
        "prod_medium_term_deposits",
        "prod_long_term_deposits"
    ],
    "Loans and Financing": [
        "prod_mortgage",
        "prod_loans",
        "prod_credit_card"
    ],
    "Pensions and Retirement": [
        "prod_pensions1",
        "prod_pensions2"
    ],
    "Payment Services": [
        "prod_payroll",
        "prod_direct_debit"
    ],
    "Other Financial Products": [
        "prod_guarantees",
        "prod_taxes"
    ]
}

In [None]:
def get_product_genres(products, product_categories=None):
    """Return the set of category names containing at least one of `products`.

    Parameters
    ----------
    products : iterable of str
        Product column names to look up. Consumed once, so a generator is fine.
    product_categories : mapping of str -> collection of str, optional
        Category name to member product names. Defaults to the module-level
        `bank_products` mapping, which keeps existing one-argument calls
        working unchanged.

    Returns
    -------
    set of str
        Names of every category with at least one matching product; empty when
        nothing matches.
    """
    if product_categories is None:
        product_categories = bank_products
    # Materialise once: the products are scanned again for every category.
    wanted = list(products)
    return {
        genre
        for genre, products_list in product_categories.items()
        if any(product in products_list for product in wanted)
    }

In [None]:
# Categories covered by the products the new user already holds.
products = list(new_user_already_used)
genres = get_product_genres(products)
print(genres)

In [None]:
# Categories of the distance-based recommendations: products among the 15
# highest scores that the user does not hold and whose score is positive.
items = sorted(column_scores_d.items(), key=lambda pair: pair[1], reverse=True)
selected_cols = [
    col
    for col, score in items[:15]
    if col not in new_user_already_used and score > 0
]
genres_rec_pd = get_product_genres(selected_cols)
print(genres_rec_pd)

In [None]:
# Categories of the cosine-based recommendations: products among the 15
# highest scores that the user does not hold and whose score is positive.
items = sorted(column_scores.items(), key=lambda pair: pair[1], reverse=True)
selected_cols = [
    col
    for col, score in items[:15]
    if col not in new_user_already_used and score > 0
]
genres_rec_cs = get_product_genres(selected_cols)
print(genres_rec_cs)

# Category Jaccard Similarity

## Actual vs PairWise Distance

In [None]:
# Jaccard similarity between the categories of the products the user already
# holds and the categories recommended via pairwise distance.
set1 = genres
set2 = genres_rec_pd
intersection = len(set1.intersection(set2))
union = len(set1.union(set2))
# Two empty sets have an empty union; report 0.0 instead of raising ZeroDivisionError.
jaccard_similarity = intersection / union if union else 0.0
print("Jaccard Similarity Score:", jaccard_similarity)

## Actual vs Cosine Similarity

In [None]:
# Jaccard similarity between the categories of the products the user already
# holds and the categories recommended via cosine similarity.
set1 = genres
set2 = genres_rec_cs
intersection = len(set1.intersection(set2))
union = len(set1.union(set2))
# Two empty sets have an empty union; report 0.0 instead of raising ZeroDivisionError.
jaccard_similarity = intersection / union if union else 0.0
print("Jaccard Similarity Score:", jaccard_similarity)

# Association Rule Mining

In [None]:
X1.columns

In [None]:
X1.info()

In [None]:
# Product columns used as the items for association rule mining.
bank_service_cols = [
    'prod_savings_account',
    'prod_guarantees',
    'prod_current_accounts',
    'prod_derivative_account',
    'prod_payroll_account',
    'prod_junior_account',
    'prod_mas_particular_account',
    'prod_particular_account',
    'prod_particular_plus_account',
    'prod_short_term_deposits',
    'prod_medium_term_deposits',
    'prod_long_term_deposits',
    'prod_e_account',
    'prod_funds',
    'prod_mortgage',
    'prod_pensions1',
    'prod_loans',
    'prod_taxes',
    'prod_credit_card',
    'prod_securities',
    'prod_home_account',
    'prod_payroll',
    'prod_pensions2',
    'prod_direct_debit',
]

In [None]:
# Cast the product columns of X1 to bool for the frequent-itemset step below.
# This overwrites the columns in X1 itself, the same frame KMeans was fitted on.
X1[bank_service_cols] = X1[bank_service_cols].astype(bool)

In [None]:
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
# Frequent product combinations with a minimum support of 0.01;
# use_colnames=True keeps the product column names in the itemsets.
frequent_itemsets = apriori(X1[bank_service_cols], min_support=0.01, use_colnames=True)

In [None]:
# Derive association rules from the frequent itemsets, keeping rules whose
# confidence is at least 0.1.
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.1)

# Display the association rules
rules

In [None]:
def search_association_rules(association_rules_df, input_products, min_support=0.1, min_confidence=0.5, min_lift=1.0):
    """Return the rules triggered by any of `input_products` that pass all thresholds.

    A rule is kept when at least one input product is contained in its
    'antecedents' and its 'support', 'confidence' and 'lift' are each greater
    than or equal to the corresponding minimum.

    Parameters
    ----------
    association_rules_df : pandas.DataFrame
        Rules table with 'antecedents', 'support', 'confidence' and 'lift' columns.
    input_products : iterable of str
        Product names to look for in the antecedents.
    min_support, min_confidence, min_lift : float
        Inclusive lower bounds for the matching rule metrics.

    Returns
    -------
    pandas.DataFrame
        The matching rows with their original index, columns and dtypes.
        When nothing matches, the result is empty but keeps every column, so
        callers can still sort or select by column name.
    """
    # Nothing to filter (this also covers a frame without any columns).
    if association_rules_df.empty:
        return association_rules_df.copy()

    # Materialise once: the products are scanned again for every rule.
    wanted = list(input_products)

    mentions_input = association_rules_df['antecedents'].map(
        lambda antecedents: any(product in antecedents for product in wanted)
    ).astype(bool)
    passes_thresholds = (
        (association_rules_df['support'] >= min_support)
        & (association_rules_df['confidence'] >= min_confidence)
        & (association_rules_df['lift'] >= min_lift)
    )

    # Boolean-mask selection preserves the column layout even with zero matches.
    return association_rules_df[mentions_input & passes_thresholds].copy()

In [None]:
# Rules whose antecedents mention one of these four products, filtered by
# minimum support, confidence and lift.
owned_products = ["prod_payroll_account", "prod_pensions2", "prod_payroll", "prod_pensions1"]
consequents = search_association_rules(
    rules,
    owned_products,
    min_support=0.02,
    min_confidence=0.1,
    min_lift=5.0,
)
consequents

In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

def plot_association_rules_graph(association_rules_df):
    """Draw the ten highest-confidence association rules as a directed graph.

    Every (antecedent item, consequent item) pair of a selected rule becomes an
    edge labelled with that rule's support rounded to three decimals.

    Parameters
    ----------
    association_rules_df : pandas.DataFrame
        Rules table with 'antecedents', 'consequents', 'support' and
        'confidence' columns; the first two hold iterables of item names.

    Returns
    -------
    None
        The figure is shown with matplotlib. When the frame is empty, a message
        is printed and nothing is drawn.
    """
    # An empty search result may not even carry a 'confidence' column, which
    # would make the sort below raise KeyError.
    if association_rules_df.empty:
        print("No association rules to plot.")
        return

    # Sort by confidence in decreasing order and keep the top 10 rules
    top_10_rules_df = association_rules_df.sort_values(by='confidence', ascending=False).head(10)

    # Create a directed graph
    G = nx.DiGraph()

    # One edge per antecedent/consequent item pair. If two rules produce the
    # same pair, the rule processed later overwrites the edge weight.
    for antecedents, consequents, rule_support in zip(
        top_10_rules_df['antecedents'],
        top_10_rules_df['consequents'],
        top_10_rules_df['support'],
    ):
        support = round(rule_support, 3)  # Round off support to 3 decimal precision
        for antecedent in antecedents:
            for consequent in consequents:
                # Add edge with support as weight
                G.add_edge(antecedent, consequent, weight=support)

    # Plot the graph
    pos = nx.spring_layout(G)
    nx.draw(G, pos, with_labels=True, node_size=2000, node_color='skyblue', font_size=7, font_weight='bold')
    # Draw edge labels
    edge_labels = nx.get_edge_attributes(G, 'weight')
    nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)
    plt.title('Association Rules Graph')
    plt.show()


In [None]:
plot_association_rules_graph(consequents)