# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

# Load dataset
df = pd.read_csv('CLEAN_IMPACT OF ONLINE GAMBLING ON THE SOCIO-ECONOMIC AND PSYCHOLOGICAL WELL-BEING OF YOUNG ADULTS (Responses) (1).xlsx - Sheet1.csv')

# Encoding Mappings
# Each map turns an ordinal survey answer into an integer rank (0 = lowest).
freq_map = {'Never': 0, 'Rarely': 1, 'Sometimes': 2, 'Often': 3, 'Always': 4}
conf_map = {'Not at all': 0, 'Slightly Confident': 1, 'Somewhat Confident': 2, 'Confident': 3, 'Very Confident': 4}
impact_map = {'Not at all': 0, 'Slightly': 1, 'Moderate': 2, 'Moderately': 2, 'Significantly': 3, 'Very Significantly': 4}
gambling_freq_map = {'Occasionally': 0, 'Monthly': 1, 'Weekly': 2, 'Daily': 3}
spend_map = {'Below Rs. 5,000': 0, 'Rs. 5,000 - Rs. 10,000': 1, 'Rs. 10,000 - Rs. 20,000': 2, 'Above Rs. 20,000': 3}

# Columns that use their own answer scale. They must NOT go through the
# generic Likert encoding below: their labels are absent from those maps, so
# every answer would become NaN -> 0 and the dedicated mapping would then be
# applied to already-zeroed data.
dedicated_maps = {
    "How often do you gamble?": gambling_freq_map,
    "How much money do you typically spend on gambling every month ?": spend_map,
}

# Generic Likert scales, in priority order (freq_map is the fallback).
likert_maps = (freq_map, conf_map, impact_map)


def _pick_likert_map(answers):
    """Return the Likert map whose labels cover the most distinct answers.

    ``answers`` is the raw (un-encoded) column. ``max`` keeps the first
    maximal candidate, so ``freq_map`` is used on ties and when no label
    matches at all, which is what the previous code always applied.
    """
    labels = set(answers.dropna().unique())
    return max(likert_maps, key=lambda scale: len(labels.intersection(scale)))


# Apply Mappings
df_proc = df.copy()
numeric_cols = df.columns[7:] # Survey questions
for col in numeric_cols:
    if col in dedicated_maps:
        continue  # encoded separately below with its own scale
    col_lower = col.lower()
    if "often" in col_lower or "how" in col_lower:
        # Always map from the raw answers in `df`, never from a column that
        # has already been encoded. Unanswered / unrecognised labels are
        # coded 0, i.e. the same as the lowest rank.
        df_proc[col] = df[col].map(_pick_likert_map(df[col])).fillna(0)

for col, scale in dedicated_maps.items():
    df_proc[col] = df[col].map(scale).fillna(0)

# Clustering
# Standardise every numeric column of df_proc, then fit a 4-cluster K-Means.
numeric_features = df_proc.select_dtypes(include=[np.number])
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numeric_features)

kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
kmeans.fit(scaled_data)
df_proc['Cluster'] = kmeans.labels_

# Map Cluster
# NOTE(review): K-Means label ids carry no inherent meaning; these names
# presumably come from inspecting one particular run. Re-check them if the
# data or the encoding changes.
cluster_map = {
    0: 'Occasional Gamblers',
    1: 'Financially Affected',
    2: 'High Risk Gamblers',
    3: 'Older Controlled',
}
df_proc['Cluster_Name'] = df_proc['Cluster'].map(cluster_map)

# Sweep K = 2..10, recording inertia (elbow method) and silhouette score.
k_values = range(2, 11)
inertia, sil = [], []
for k in k_values:
    km = KMeans(n_clusters=k, random_state=42, n_init=10).fit(scaled_data)
    inertia.append(km.inertia_)
    sil.append(silhouette_score(scaled_data, km.labels_))

# One panel per diagnostic, side by side.
fig, ax = plt.subplots(1, 2, figsize=(12, 4))
panels = [
    (inertia, 'bo-', 'Elbow Method'),
    (sil, 'ro-', 'Silhouette Score'),
]
for axis, (values, line_fmt, title) in zip(ax, panels):
    axis.plot(k_values, values, line_fmt)
    axis.set_title(title)
    axis.set_xlabel('K')
plt.show()

# Project the scaled features onto two principal components and colour the
# points by their named cluster.
reducer = PCA(n_components=2)
pca = reducer.fit_transform(scaled_data)
pc1, pc2 = pca[:, 0], pca[:, 1]

plt.figure(figsize=(10, 6))
sns.scatterplot(
    x=pc1,
    y=pc2,
    hue=df_proc['Cluster_Name'],
    palette='tab10',
    s=100,
)
plt.title('Fig 4: PCA Plot - Cluster Separation')
plt.legend(bbox_to_anchor=(1, 1))
plt.show()

# Select key indicators for the heatmap
summary_cols = [
    'How often do you gamble?',
    'How much money do you typically spend on gambling every month ?',
    'How often do you experience anxiety or depression related to your online gambling?',
]

# Mean encoded answer per named cluster: one row per cluster, one column per indicator.
per_cluster = df_proc.groupby('Cluster_Name')
heatmap_data = per_cluster[summary_cols].mean()

plt.figure(figsize=(10, 6))
sns.heatmap(heatmap_data, annot=True, cmap='YlOrRd')
plt.title('Fig 6: Heatmap of Cluster Behaviors')
plt.show()