In [1]:
# -----------------------------
# 1. Import Libraries
# -----------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

# -----------------------------
# 2. Load Dataset
# -----------------------------
df = pd.read_csv("03_Clustering_Marketing.csv")

print("Dataset shape:", df.shape)
print(df.head())

# -----------------------------
# 3. Select Required Features
# -----------------------------
selected_features = [
    'gradyear', 'gender', 'NumberOffriends',
    'basketball', 'football', 'soccer', 'softball',
    'volleyball', 'swimming'
]

X = df[selected_features].copy()

# -----------------------------
# 4. Convert Gender (M/F → 0/1)
# -----------------------------
X['gender'] = X['gender'].map({'M': 0, 'F': 1})
X['gender'] = X['gender'].fillna(X['gender'].mode().iloc[0])

# -----------------------------
# 5. Handle Missing Values
# -----------------------------
num_cols = X.select_dtypes(include=np.number).columns
X[num_cols] = X[num_cols].fillna(X[num_cols].mean())

# -----------------------------
# 6. Scale Numeric Features Only
# -----------------------------
scaler = StandardScaler()

numeric_features = X.drop(columns=['gender'])
numeric_scaled = scaler.fit_transform(numeric_features)

# Combine scaled numeric + gender
X_scaled = np.column_stack([numeric_scaled, X['gender']])
X_scaled_df = pd.DataFrame(
    X_scaled,
    columns=numeric_features.columns.tolist() + ['gender']
)

print("Data size for clustering:", X_scaled_df.shape)

# -----------------------------
# 7. Elbow Method (Visualization Only)
# -----------------------------
wcss = []
K = range(1, 11)

for k in K:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_scaled_df)
    wcss.append(kmeans.inertia_)

plt.plot(K, wcss, marker='o')
plt.xlabel("Number of clusters (K)")
plt.ylabel("WCSS")
plt.title("Elbow Method")
plt.show()

# -----------------------------
# 8. Set Best K Manually
# -----------------------------
best_k = 2
print("Selected Best K:", best_k)

# -----------------------------
# 9. Apply K-Means with K = 6
# -----------------------------
kmeans = KMeans(n_clusters=best_k, random_state=42, n_init=10)
clusters = kmeans.fit_predict(X_scaled_df)

# -----------------------------
# 10. Attach Cluster Labels
# -----------------------------
X_clustered = X.copy()
X_clustered['Cluster'] = clusters

# -----------------------------
# 11. Cluster Analysis
# -----------------------------
print("\nCluster-wise Mean Values:")
print(X_clustered.groupby('Cluster').mean())

# -----------------------------
# 12. Silhouette Score
# -----------------------------
sil_score = silhouette_score(X_scaled_df, clusters)
print(f"Silhouette Score for K={best_k}: {sil_score:.4f}")

# -----------------------------
# 13. PCA Visualization (2D)
# -----------------------------
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled_df)

plt.figure(figsize=(8, 6))
plt.scatter(
    X_pca[:, 0],
    X_pca[:, 1],
    c=clusters,
    cmap='viridis',
    s=50,
    alpha=0.6
)
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.title("K-Means Clustering (K=2|)")
plt.colorbar(label="Cluster")
plt.show()


PermissionError: [Errno 13] Permission denied: '03_Clustering_Marketing.csv'

In [3]:
# -----------------------------
# 1. Import Libraries
# -----------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score

# -----------------------------
# 2. Load Dataset
# -----------------------------
df = pd.read_csv("/content/03_Clustering_Marketing.csv")
print("Dataset shape:", df.shape)
print(df.head())

# -----------------------------
# 3. Select Required Features
# -----------------------------
selected_features = [
    'gradyear', 'gender', 'NumberOffriends',
    'basketball', 'football', 'soccer', 'softball',
    'volleyball', 'swimming'
]

X = df[selected_features].copy()

# -----------------------------
# 4. Convert Gender (F/M → 1/0)
# -----------------------------
X['gender'] = X['gender'].map({'M': 0, 'F': 1})
X['gender'] = X['gender'].fillna(X['gender'].mode().iloc[0])

# -----------------------------
# 5. Clean and Handle Missing Values
# -----------------------------
num_cols = X.select_dtypes(include=np.number).columns
X[num_cols] = X[num_cols].fillna(X[num_cols].mean())

# -----------------------------
# 6. Scale the Data
# -----------------------------
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

# -----------------------------
# 7. Apply PCA (for clustering)
# -----------------------------
pca = PCA(n_components=5, random_state=42)
X_pca = pca.fit_transform(X_scaled_df)
print(X_pca.explained_variance_ratio_)
print("Total variance explained:", pca.explained_variance_ratio_.sum())

# -----------------------------
# 8. Apply K-Means
# -----------------------------
best_k = 4
kmeans = KMeans(n_clusters=best_k, random_state=42, n_init=10)
clusters = kmeans.fit_predict(X_pca)

# -----------------------------
# 9. Attach Cluster Labels
# -----------------------------
X_clustered = X.copy()
X_clustered['Cluster'] = clusters

# -----------------------------
# 10. Silhouette Score
# -----------------------------
sil_score = silhouette_score(X_pca, clusters)
print(f"Silhouette Score for K={best_k}: {sil_score:.4f}")

# -----------------------------
# 11. Analyze Clusters
# -----------------------------
print("\nCluster-wise Mean Values:")
print(X_clustered.groupby('Cluster').mean())

# -----------------------------
# 12. t-SNE Visualization
# -----------------------------
tsne = TSNE(n_components=2, random_state=42, perplexity=50)
X_tsne = tsne.fit_transform(X_scaled_df)

plt.figure(figsize=(8,6))
plt.scatter(
    X_tsne[:,0], X_tsne[:,1],
    c=clusters,
    cmap='viridis',
    s=50,
    alpha=0.6
)
plt.title(f"K-Means Clusters (K={best_k}) with t-SNE")
plt.xlabel("t-SNE 1")
plt.ylabel("t-SNE 2")
plt.colorbar(label='Cluster')
plt.show()


FileNotFoundError: [Errno 2] No such file or directory: '/content/03_Clustering_Marketing.csv'

NameError: name 'pca' is not defined