In [1]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
zabihullah18_students_social_network_profile_clustering_path = kagglehub.dataset_download('zabihullah18/students-social-network-profile-clustering')

print('Data source import complete.')


Downloading from https://www.kaggle.com/api/v1/datasets/download/zabihullah18/students-social-network-profile-clustering?dataset_version_number=1...


100%|██████████| 187k/187k [00:00<00:00, 5.89MB/s]

Extracting files...
Data source import complete.





<center><a href="https://www.buymeacoffee.com/zabih"><img src="https://www.codehim.com/wp-content/uploads/2022/09/bmc-button-640x180.png.webp" alt="Buy Me A Coffee" style="height: 80px; width: 300px; box-shadow: 0px 3px 2px 0px rgba(190, 190, 190, 0.5) !important;-webkit-box-shadow: 0px 3px 2px 0px rgba(190, 190, 190, 0.5) !important;" ></a></center>

# 📌Loading Required Packages

In [2]:
#Important packages

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

# 📌Check Out the Data

In [3]:
#Load Dataset
pd.set_option('display.max_columns', None)
data = pd.read_csv("/kaggle/input/students-social-network-profile-clustering/03_Clustering_Marketing.csv")

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/students-social-network-profile-clustering/03_Clustering_Marketing.csv'

In [None]:
#Running head command to see first 6 observations
data.head()

# 📌Summary Statistics of Numerical Variables

In [None]:
data.describe()

# 📌Treating Missing Values

In [None]:
data.isnull().sum()

**A total of 2496 records have missing ages. Also concerning is the fact that the minimum and maximum values seem to be unreasonable; it is unlikely that a 3 year old or a 106 year old is attending high school.**

# 📌Let's have a look at the number of male, female and missing values

In [None]:
data['gender'].value_counts(dropna = False) #display NaN values also

**There are 11057 female, 2606 male teen students and 1337 missing values**

# 📌Now fill all the null values in gender column with “Not disclosed”

In [None]:
data['gender'].fillna('not disclosed', inplace = True)


In [None]:
data['gender'].isnull().sum()

In [None]:
data['gender'].unique()

# 📌One way to deal with these missing values would be to fill the missing values with the average age of each graduation year

In [None]:
data['age'] = pd.to_numeric(data['age'], errors='coerce')

In [None]:
data.groupby('gradyear')['age'].mean()

# 📌Now fill the missing values for each graduation year with the mean that we got as above

In [None]:
data['age'] = data.groupby('gradyear')['age'].transform(lambda x: x.fillna(x.mean()))

In [None]:
data['age'].isnull().sum()

**From the above summary we can see that there are no missing values in the dataset**

# 📌Treating Outliers

**The original age range contains value from 3 - 106, which is unrealistic because student at age of 3 or 106 would not attend high school. A reasonable age range for people attending high school will be the age range between 13 to 21. The rest should be treated as outliers keeping the age of student going to high school in mind.

**Let's detect the outliers using a box plot below**

In [None]:
sns.set(style="whitegrid")

plt.figure(figsize=(10, 6))

sns.boxplot(x=data['age'], color='red')

plt.xlabel('Age', fontsize=14)
plt.title('Boxplot of Age', fontsize=16)

# Show the plot
plt.show()


In [None]:
q1 = data['age'].quantile(0.25)
q3 = data['age'].quantile(0.75)
iqr = q3-q1
print(iqr)

In [None]:
df = data[(data['age'] > (q1 - 1.5*iqr)) & (data['age'] < (q3 + 1.5*iqr))]

In [None]:
df['age'].describe()

**From the above summary we can observe that after treating the outliers the mininmum age is 13.843000 and the maximum age is 21.065000**

In [None]:
df.shape

In [None]:
sns.boxplot(df["age"])
plt.show()

**From the above boxplot we observe that there are no outliers in the age column**

# 📌Data Preprocessing

In [None]:
names = df.columns[4:40]
scaled_feature = data.copy()
names

In [None]:
scaled_feature.head()

In [None]:
features = scaled_feature[names]

In [None]:
features.head()

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(features.values)

In [None]:
features = scaler.transform(features.values)

In [None]:
scaled_feature[names] = features
scaled_feature.head()

# 📌Convert object variable to numeric

In [None]:
def gender_to_numeric(x):
    if x=="M":
        return 1
    if x=="F":
        return 2
    if x== 'not disclosed':
        return 3

In [None]:
scaled_feature['gender'] = scaled_feature['gender'].apply(gender_to_numeric)
scaled_feature['gender'].head()

# 📌Checkig the transformed values

In [None]:
scaled_feature.head()

# 📌Building the K-means model

In [None]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=5, random_state=0)

In [None]:
model = kmeans.fit(scaled_feature)

# 📌Elbow Method*

In [None]:
# Creating a function with KMeans to plot "The Elbow Curve"

wcss = []
for i in range(1,20):
    kmeans = KMeans(n_clusters=i,init='k-means++' ,max_iter=300,random_state=0)
    kmeans.fit(scaled_feature)
    wcss.append(kmeans.inertia_)

plt.plot(range(1,20),wcss)
plt.title('THe Elbow Curve')
plt.xlabel('Number of Clusters')
plt.ylabel("WCSS") #WCSS stands for total within-cluster sum of sqaure
plt.show()

**The location of a bend (knee) in the plot is generally considered as an indicator of the appropriate number of clusters. Our Elbow point is around cluster size of 5.
We will use k=5 to further interpret our clustering result.**

# 📌Fit K-Means clustering for k=5

In [None]:
kmeans = KMeans(n_clusters=5)
kmeans.fit(scaled_feature)

In [None]:
kmeans.labels_

In [None]:
len(kmeans.labels_)

In [None]:
data['cluster'] = kmeans.labels_

# 📌Interpreting Clustering Results

In [None]:
plt.figure(figsize=(12,7))
axis = sns.barplot(x=np.arange(0,5,1), y=data.groupby(['cluster']).count()['age'].values)
x=axis.set_xlabel("cluster Number")
x=axis.set_ylabel("Number of students")

**From the above plot we can see that cluster 0 is the largest and cluster 2 has fewest teen students.**

# 📌Let' see the number of students belonging to each cluster

In [None]:
size_array = list(data.groupby(['cluster']).count()['age'].values)
size_array

## 📌Average Age in each cluster

In [None]:
# Calculate the mean age for each cluster
mean_age = data.groupby('cluster')['age'].mean()

# Print the mean age for each cluster in the desired format
for cluster, age in mean_age.items():
    print(f"Cluster {cluster}: {age:.2f} years")


In [None]:
data['gender'] = data['gender'].map({'F': 0, 'M': 1, 'not disclosed':3})


In [None]:
data['gender'].value_counts()


## 📌Percentage of females in each cluster

In [None]:
# Calculate the percentage of females in each cluster
percentage_female = data[data['gender'] == 0].groupby('cluster').size() / data.groupby('cluster').size() * 100

# Print the percentage of females in each cluster in a readable format
for cluster, percentage in percentage_female.items():
    print(f"Cluster {cluster}: {percentage:.2f}% females")


## 📌Average number of friends in each cluster

In [None]:
# Calculate the average number of friends in each cluster
mean_friends = data.groupby('cluster')['NumberOffriends'].mean()

# Print the average number of friends in each cluster in a readable format
for cluster, avg_friends in mean_friends.items():
    print(f"Cluster {cluster}: Average number of friends = {avg_friends:.2f}")


## 📌Cluster Analysis

In [None]:
cluster_analysis = pd.DataFrame({
    'Mean Age': mean_age,
    'Percentage Female': percentage_female,
    'Mean Number of Friends': mean_friends,
    'Number of Students': size_array
})

print("Cluster Analysis:")
for cluster, row in cluster_analysis.iterrows():
    print(f"Cluster {cluster}:")
    print(f"  Mean Age: {row['Mean Age']:.2f} years")
    print(f"  Percentage Female: {row['Percentage Female']:.2f}%")
    print(f"  Mean Number of Friends: {row['Mean Number of Friends']:.2f}")
    print(f"  Number of Students: {row['Number of Students']}")
    print()


In [None]:

clusters = cluster_analysis.index

fig, axs = plt.subplots(2, 2, figsize=(12, 10))

# Mean Age
axs[0, 0].bar(clusters, cluster_analysis['Mean Age'], color='skyblue')
axs[0, 0].set_title('Mean Age by Cluster')
axs[0, 0].set_ylabel('Mean Age (years)')

# Percentage Female
axs[0, 1].bar(clusters, cluster_analysis['Percentage Female'], color='lightgreen')
axs[0, 1].set_title('Percentage Female by Cluster')
axs[0, 1].set_ylabel('Percentage Female')

# Mean Number of Friends
axs[1, 0].bar(clusters, cluster_analysis['Mean Number of Friends'], color='salmon')
axs[1, 0].set_title('Mean Number of Friends by Cluster')
axs[1, 0].set_ylabel('Mean Number of Friends')

# Number of Students
axs[1, 1].bar(clusters, cluster_analysis['Number of Students'], color='gold')
axs[1, 1].set_title('Number of Students by Cluster')
axs[1, 1].set_ylabel('Number of Students')

# Rotate x-axis labels for better readability
for ax in axs.flat:
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')

plt.tight_layout()
plt.show()


## 📌Describe clusters characteristics

In [None]:
def describe_clusters(df):

    # Describe cluster characteristics
    cluster_analysis = df.groupby('cluster').mean().T
    print("Cluster Characteristics:")
    cluster_df = pd.DataFrame(cluster_analysis)

    return cluster_df

describe_clusters(data)

 # 📌Recommendation for marketing department

#### 👉Targeting Female-Centric Products:

Clusters 1, 2, and 3 have a higher percentage of females compared to the overall population. Therefore, products or advertisements that specifically target teenage girls may perform well within these clusters.
Products related to fashion, beauty, socializing, or lifestyle may resonate more with these clusters.


#### 👉Understanding Male-Centric Interests:

Clusters 0 and 4 have a higher percentage of males. Understanding the interests and preferences of teenage boys within these clusters can help develop marketing campaigns tailored to their needs.
Products related to sports, technology, gaming, or adventure may appeal more to these clusters.

#### 👉Fashion and Retail Promotions:

Clusters 2 and 3 show a higher interest in shopping, suggesting that advertising campaigns or promotions related to shopping might be more effective for these clusters.


#### 👉Sports and Lifestyle Brand Partnerships:

Collaborate with sports brands or organize sports-related events and activities to appeal to clusters showing a strong interest in sports (e.g., Clusters 3 and 4).


<center><a href="https://www.buymeacoffee.com/zabih"><img src="https://www.codehim.com/wp-content/uploads/2022/09/bmc-button-640x180.png.webp" alt="Buy Me A Coffee" style="height: 80px; width: 300px; box-shadow: 0px 3px 2px 0px rgba(190, 190, 190, 0.5) !important;-webkit-box-shadow: 0px 3px 2px 0px rgba(190, 190, 190, 0.5) !important;" ></a></center>