In [4]:
# calculate the total number of clusters : k
# k number of initial centroids
# assign every point to the nearest centroid using ucliden distance
# recalcuate the centroid of the cluster by calculating means of all the values belonging to that cluster
# Repeat the steps 3 and 4 until we reach the state of convergence

import numpy as np
from sklearn.datasets import make_blobs

In [7]:
class KMeans:
  def __init__(self, n_clusters, max_iter=100):
    self.n_clusters = n_clusters
    self.max_iter = max_iter

  def fit(self, X):

    # initialize our centroids:
    self.centroids = X[np.random.choice(X.shape[0], self.n_clusters, replace=False)]

    for _ in range(self.max_iter):
      # assign each data point to the nearest centroid:
      labels = self.assign_labels(X)

      # update centroid:

      new_centroid = self.update_centroid(X, labels)


      # Check for convergence

      if np.all(self.centroids == new_centroid):
        break

      self.centroids = new_centroid


  def assign_labels(self, X):
    # compute the distancew from each point to the centroids
    distances = np.linalg.norm(X[:, np.newaxis] - self.centroids, axis=2)

    # assign the label based on the nearest centroid
    labels = np.argmin(distances, axis=0)

    return labels

  def update_centroid(self, X, labels):
    # compute the new centroid by taking the mean of all the points belonging to that cluster
    new_centroids = np.array([X[labels == i].mean(axis=0) if np.sum(labels == i) > 0
                              else self.centroids[i] # Keep the old centroid if no points are assigned to it
                              for i in range(self.n_clusters)])

    return new_centroids


# generate synthetic data for sklearn.make_blobs

X, y = make_blobs(n_samples=300, centers=3, random_state=42)


# create a Kmeans instance by picking an initial value of K = 3

kmeans = KMeans(n_clusters=3)

# fit the model to the data

kmeans.fit(X)

# get the cluster assignments from each and every data point that we hae in the clusters

labels = kmeans.assign_labels(X)

print("cluster assignments/labels", labels)

print("final centroids", kmeans.centroids)

cluster assignments/labels [ 69 179 158]
final centroids [[-3.57150134  9.48787856]
 [ 5.26399865  2.60151519]
 [-6.24103473 -8.54162966]]


In [9]:
# kaggle dataset: https://www.kaggle.com/datasets/flyingwombat/us-news-and-world-reports-college-data

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns



In [23]:
df = pd.read_csv("College.csv")
df

Unnamed: 0.1,Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
0,Abilene Christian University,Yes,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60
1,Adelphi University,Yes,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56
2,Adrian College,Yes,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54
3,Agnes Scott College,Yes,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59
4,Alaska Pacific University,Yes,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
772,Worcester State College,No,2197,1515,543,4,26,3089,2029,6797,3900,500,1200,60,60,21.0,14,4469,40
773,Xavier University,Yes,1959,1805,695,24,47,2849,1107,11520,4960,600,1250,73,75,13.3,31,9189,83
774,Xavier University of Louisiana,Yes,2097,1915,695,34,61,2793,166,6900,4200,617,781,67,75,14.4,20,8323,49
775,Yale University,Yes,10705,2453,1317,95,99,5217,83,19840,6510,630,2115,96,96,5.8,49,40386,99


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 777 entries, 0 to 776
Data columns (total 19 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Unnamed: 0   777 non-null    object 
 1   Private      777 non-null    object 
 2   Apps         777 non-null    int64  
 3   Accept       777 non-null    int64  
 4   Enroll       777 non-null    int64  
 5   Top10perc    777 non-null    int64  
 6   Top25perc    777 non-null    int64  
 7   F.Undergrad  777 non-null    int64  
 8   P.Undergrad  777 non-null    int64  
 9   Outstate     777 non-null    int64  
 10  Room.Board   777 non-null    int64  
 11  Books        777 non-null    int64  
 12  Personal     777 non-null    int64  
 13  PhD          777 non-null    int64  
 14  Terminal     777 non-null    int64  
 15  S.F.Ratio    777 non-null    float64
 16  perc.alumni  777 non-null    int64  
 17  Expend       777 non-null    int64  
 18  Grad.Rate    777 non-null    int64  
dtypes: float

In [14]:
sns.set_style('whitegrid')
sns.lmplot('Room.Board','Grad.Rate',data=df, hue='Private',
           palette='coolwarm',size=6,aspect=1,fit_reg=False)
print(lmplot)

TypeError: lmplot() got multiple values for argument 'data'

In [15]:
sns.set_style('whitegrid')
sns.lmplot('Outstate','F.Undergrad',data=df, hue='Private',
           palette='coolwarm',size=6,aspect=1,fit_reg=False)
print(lmplot)

TypeError: lmplot() got multiple values for argument 'data'

In [16]:
df[df['Grad.Rate'] > 100]

Unnamed: 0.1,Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
95,Cazenovia College,Yes,3847,3433,527,9,35,1010,12,9384,4840,600,500,22,47,14.3,20,7697,118


In [17]:
df['Grad.Rate']['Cazenovia College'] = 100

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Grad.Rate']['Cazenovia College'] = 100


In [18]:
from sklearn.cluster import KMeans

In [19]:
kmeans = KMeans(n_clusters=2)

In [25]:
# Convert non-numeric columns (excluding 'Private' which is already dropped) to numeric
for col in df.columns:
    if df[col].dtype == 'object' and col != 'Private':
        try:
            df[col] = pd.to_numeric(df[col])
        except:
            print(f"Could not convert column {col} to numeric. Consider encoding or dropping.")

# Fit the model, excluding non-numeric columns (Private is assumed to be already dropped)
kmeans.fit(df.select_dtypes(include=['number']))

Could not convert column Unnamed: 0 to numeric. Consider encoding or dropping.




In [26]:
kmeans.cluster_centers_

array([[1.81323468e+03, 1.28716592e+03, 4.91044843e+02, 2.53094170e+01,
        5.34708520e+01, 2.18854858e+03, 5.95458894e+02, 1.03957085e+04,
        4.31136472e+03, 5.41982063e+02, 1.28033632e+03, 7.04424514e+01,
        7.78251121e+01, 1.40997010e+01, 2.31748879e+01, 8.93204634e+03,
        6.51195815e+01],
       [1.03631389e+04, 6.55089815e+03, 2.56972222e+03, 4.14907407e+01,
        7.02037037e+01, 1.30619352e+04, 2.46486111e+03, 1.07191759e+04,
        4.64347222e+03, 5.95212963e+02, 1.71420370e+03, 8.63981481e+01,
        9.13333333e+01, 1.40277778e+01, 2.00740741e+01, 1.41705000e+04,
        6.75925926e+01]])

In [28]:
def conveter(cluster):
  if cluster=="Yes":
    return 1
  else:
    return 0


In [29]:
df['Cluster'] = df['Private'].apply(conveter)

In [30]:
df.head()

Unnamed: 0.1,Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate,Cluster
0,Abilene Christian University,Yes,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60,1
1,Adelphi University,Yes,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56,1
2,Adrian College,Yes,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54,1
3,Agnes Scott College,Yes,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59,1
4,Alaska Pacific University,Yes,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15,1


In [31]:
from sklearn.metrics import confusion_matrix,classification_report
print(confusion_matrix(df['Cluster'],kmeans.labels_))
print(classification_report(df['Cluster'],kmeans.labels_))

[[138  74]
 [531  34]]
              precision    recall  f1-score   support

           0       0.21      0.65      0.31       212
           1       0.31      0.06      0.10       565

    accuracy                           0.22       777
   macro avg       0.26      0.36      0.21       777
weighted avg       0.29      0.22      0.16       777

