# Week 9 - Facebook Live Selling Analysis
## Clustering Analysis using KMeans and Agglomerative Clustering

In [1]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

## Step 1: Load the Dataset

In [2]:
# Load the dataset from the UCI archive URL
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00488/Live_20210128.csv"
df = pd.read_csv(url)

# Display basic information
print("Dataset loaded successfully!")
print(f"\nQ1: Shape of the data: {df.shape}")
print(f"\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()

Dataset loaded successfully!

Q1: Shape of the data: (7050, 16)

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7050 entries, 0 to 7049
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   status_id         7050 non-null   int64  
 1   status_type       7050 non-null   object 
 2   status_published  7050 non-null   object 
 3   num_reactions     7050 non-null   int64  
 4   num_comments      7050 non-null   int64  
 5   num_shares        7050 non-null   int64  
 6   num_likes         7050 non-null   int64  
 7   num_loves         7050 non-null   int64  
 8   num_wows          7050 non-null   int64  
 9   num_hahas         7050 non-null   int64  
 10  num_sads          7050 non-null   int64  
 11  num_angrys        7050 non-null   int64  
 12  Column1           0 non-null      float64
 13  Column2           0 non-null      float64
 14  Column3           0 non-null      float64
 15  Column4   

## Step 2: Check for Null Values

In [3]:
# Tally missing entries per column and keep only the columns that have any
null_counts = df.isna().sum()
has_missing = null_counts.gt(0)
features_with_nulls = null_counts.loc[has_missing]

print("Q2: Features with Null values:")
print(features_with_nulls)
print(f"\nNumber of features containing Null values: {len(features_with_nulls)}")

Q2: Features with Null values:
Column1    7050
Column2    7050
Column3    7050
Column4    7050
dtype: int64

Number of features containing Null values: 4


## Step 3: Check Unique Values in status_type

In [4]:
# Inspect the distinct values stored in the 'status_type' column
status_type_column = df['status_type']
unique_status_types = status_type_column.nunique()
print(f"Q3: Number of unique values in 'status_type': {unique_status_types}")
print(f"\nUnique status types: {status_type_column.unique()}")

Q3: Number of unique values in 'status_type': 4

Unique status types: ['video' 'photo' 'link' 'status']


## Step 4: Data Preprocessing

In [5]:
# Remove every column that contains at least one missing value
df_cleaned = df.dropna(axis="columns")
print(f"Shape after dropping null columns: {df_cleaned.shape}")

# Keep an independent copy of 'status_type' to use as the target variable
y = df_cleaned['status_type'].copy()

# Everything except the id, the target and the publish timestamp is a feature;
# columns missing from the frame are skipped rather than raising.
columns_to_drop = ["status_id", "status_type", "status_published"]
X = df_cleaned.drop(columns_to_drop, axis="columns", errors='ignore')

print(f"\nFeatures shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nFeatures: {X.columns.tolist()}")

Shape after dropping null columns: (7050, 12)

Features shape: (7050, 9)
Target shape: (7050,)

Features: ['num_reactions', 'num_comments', 'num_shares', 'num_likes', 'num_loves', 'num_wows', 'num_hahas', 'num_sads', 'num_angrys']


In [6]:
# Fit a LabelEncoder on the target, then map each class name to its integer code
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

print("Label Encoding:")
print(f"Original classes: {le.classes_}")
print(f"Encoded target shape: {y_encoded.shape}")
print(f"Unique encoded values: {np.unique(y_encoded)}")

Label Encoding:
Original classes: ['link' 'photo' 'status' 'video']
Encoded target shape: (7050,)
Unique encoded values: [0 1 2 3]


In [7]:
# Standardise the features: fit the scaler on X, then transform X with it
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Sanity check over the whole scaled matrix (all features pooled together)
overall_mean = X_scaled.mean()
overall_std = X_scaled.std()

print(f"Scaled features shape: {X_scaled.shape}")
print(f"Mean after scaling (should be ~0): {overall_mean:.6f}")
print(f"Std after scaling (should be ~1): {overall_std:.6f}")

Scaled features shape: (7050, 9)
Mean after scaling (should be ~0): 0.000000
Std after scaling (should be ~1): 1.000000


## Step 5: KMeans Clustering (Q4 & Q5)

In [8]:
# Q4: two-cluster KMeans; fit() returns the fitted estimator, so it can be chained
kmeans_k2 = KMeans(n_clusters=2, random_state=10).fit(X_scaled)
inertia_k2 = kmeans_k2.inertia_
print(f"Q4: Inertia score at k=2: {inertia_k2:.2f}")

# Four-cluster KMeans with the same seed; cluster ids for the training rows
# are obtained by predicting on the data the model was fitted on.
kmeans_k4 = KMeans(n_clusters=4, random_state=10).fit(X_scaled)
y_pred_kmeans = kmeans_k4.predict(X_scaled)
inertia_k4 = kmeans_k4.inertia_
print(f"\nInertia score at k=4: {inertia_k4:.2f}")

Q4: Inertia score at k=2: 48802.48

Inertia score at k=4: 32469.79


In [9]:
# Q5: Check accuracy for KMeans k=4
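# Note: cluster ids are arbitrary (cluster 0 need not correspond to encoded class 0),
# so the predicted labels cannot be compared with the true labels directly.
# We'll calculate accuracy by finding the best one-to-one mapping from cluster ids to class labels.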

from scipy.optimize import linear_sum_assignment

def cluster_accuracy(y_true, y_pred):
    """Score a clustering against true classes under the best label matching.

    Cluster ids carry no inherent meaning, so a contingency table of
    (cluster, class) counts is built and ``linear_sum_assignment`` selects the
    one-to-one cluster-to-class pairing with the largest number of agreeing
    samples.

    Parameters
    ----------
    y_true : array-like of int
        Class index of each sample.
    y_pred : array-like of int
        Cluster index of each sample; must hold as many elements as ``y_true``.

    Returns
    -------
    accuracy : float
        Fraction of samples whose cluster is paired with their true class.
    mapping : dict
        Cluster index -> class index for the chosen pairing. It has one entry
        for every index in ``range(D)``, where ``D`` is one more than the
        largest label seen in either input.

    Raises
    ------
    AssertionError
        If the two inputs differ in size.
    ValueError
        If the inputs are empty.
    """
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    assert y_pred.size == y_true.size
    if y_pred.size == 0:
        # Without this guard the failure surfaces as numpy's opaque
        # "zero-size array to reduction operation" error from .max().
        raise ValueError("cluster_accuracy requires at least one sample")

    # Square table large enough to index every label from either input
    D = max(y_pred.max(), y_true.max()) + 1
    w = np.zeros((D, D), dtype=np.int64)

    # Unbuffered scatter-add: repeated (cluster, class) pairs each contribute 1,
    # which a plain fancy-indexed ``w[y_pred, y_true] += 1`` would not do.
    np.add.at(w, (y_pred, y_true), 1)

    # linear_sum_assignment minimises cost, so turn counts into costs
    row_ind, col_ind = linear_sum_assignment(w.max() - w)

    # Matched counts divided by the number of samples
    accuracy = w[row_ind, col_ind].sum() / y_pred.size

    return accuracy, dict(zip(row_ind, col_ind))

# Score the k=4 KMeans clusters against the encoded status types
accuracy_kmeans, mapping = cluster_accuracy(y_encoded, y_pred_kmeans)
print(f"Q5: KMeans k=4 Accuracy (with optimal mapping): {accuracy_kmeans:.4f}")
# Round to nearest instead of truncating: accuracy * n is a float product that
# can land just below the true integer count (e.g. 29 / 100 * 100 == 28.99...),
# in which case a bare int() would report one match too few.
print(f"Correctly predicted labels: {int(round(accuracy_kmeans * len(y_encoded)))} out of {len(y_encoded)}")

Q5: KMeans k=4 Accuracy (with optimal mapping): 0.6350
Correctly predicted labels: 4477 out of 7050


## Step 6: Agglomerative Clustering (Q6, Q7, Q8)

In [11]:
# Agglomerative clustering settings for this exercise.
# Note: newer scikit-learn versions call the distance argument 'metric'
# (it was previously named 'affinity').
agg_params = {
    "n_clusters": 4,
    "metric": 'euclidean',
    "linkage": 'ward',
}
agg_clustering = AgglomerativeClustering(**agg_params)

# Fit the model and collect the cluster id assigned to every row
y_pred_agg = agg_clustering.fit_predict(X_scaled)

print("Agglomerative Clustering trained successfully!")
print(f"Number of clusters: {agg_clustering.n_clusters}")
print(f"Number of leaves: {agg_clustering.n_leaves_}")
print(f"Predicted labels: {np.unique(y_pred_agg)}")

Agglomerative Clustering trained successfully!
Number of clusters: 4
Number of leaves: 7050
Predicted labels: [0 1 2 3]


In [12]:
# Q6: Label predicted for first row
first_row_label = y_pred_agg[0]
print(f"Q6: Label predicted for first row of samples: {first_row_label}")

# Q7: Number of leaves in the hierarchical tree
num_leaves = agg_clustering.n_leaves_
print(f"\nQ7: Number of leaves in the hierarchical tree: {num_leaves}")

# Q8: Calculate accuracy of Agglomerative Clustering
accuracy_agg, mapping_agg = cluster_accuracy(y_encoded, y_pred_agg)
accuracy_percentage = accuracy_agg * 100

print(f"\nQ8: Accuracy of Agglomerative Clustering model: {accuracy_percentage:.2f}%")
# Round to nearest instead of truncating: accuracy * n is a float product that
# can fall just below the true integer count, and int() alone would then
# report one match too few.
print(f"Correctly predicted labels: {int(round(accuracy_agg * len(y_encoded)))} out of {len(y_encoded)}")

Q6: Label predicted for first row of samples: 3

Q7: Number of leaves in the hierarchical tree: 7050

Q8: Accuracy of Agglomerative Clustering model: 62.58%
Correctly predicted labels: 4412 out of 7050


## Summary of Answers

In [13]:
# Recap of every answer computed in the cells above
print("=" * 60)
print("SUMMARY OF ALL ANSWERS")
print("=" * 60)
print(f"\nQ1: Shape of the data: {df.shape}")
print(f"Q2: Number of features containing Null values: {len(features_with_nulls)}")
print(f"Q3: Number of unique values in 'status_type': {unique_status_types}")
print(f"Q4: Inertia score at k=2: {inertia_k2:.2f}")
# Round to nearest instead of truncating: accuracy * n is a float product that
# can fall just below the true integer count, and int() alone would then
# report one match too few.
print(f"Q5: Labels predicted accurately at k=4: {int(round(accuracy_kmeans * len(y_encoded)))}")
print(f"Q6: Label predicted for first row: {first_row_label}")
print(f"Q7: Number of leaves in hierarchical tree: {num_leaves}")
print(f"Q8: Accuracy of the model (in percentage): {accuracy_percentage:.2f}%")
print("=" * 60)

SUMMARY OF ALL ANSWERS

Q1: Shape of the data: (7050, 16)
Q2: Number of features containing Null values: 4
Q3: Number of unique values in 'status_type': 4
Q4: Inertia score at k=2: 48802.48
Q5: Labels predicted accurately at k=4: 4477
Q6: Label predicted for first row: 3
Q7: Number of leaves in hierarchical tree: 7050
Q8: Accuracy of the model (in percentage): 62.58%
