In [27]:
import pandas as pd
import plotly.express as px
import hvplot.pandas
from sklearn.cluster import KMeans

In [28]:
# Read the cleaned dataset, using the first CSV column as the row index
file_path = "colleges_cleaned.csv"
colleges_df = pd.read_csv(filepath_or_buffer=file_path, index_col=0)
colleges_df.head()

Unnamed: 0,CollegeID,State,Applicants_total,Admissions_total,Enrolled_total,Percent_of_freshmen_submitting_SAT_scores,Percent_of_freshmen_submitting_ACT_scores,"Estimated_enrollment,_total","Estimated_enrollment,_full_time","Estimated_enrollment,_part_time",...,Far_West_Region,Great_Lakes_Region,Mid_East_Region,New_England_Region,Plains_Region,Rocky_Mountains_Region,Southeast_Region,Southwest_Region,US_Service_Schools_Region,average_tuition
0,1,1,6142.0,5521.0,1104.0,15.0,88.0,5024.0,4442.0,582.0,...,0,0,0,0,0,0,1,0,0,6.748
1,2,1,5689.0,4934.0,1773.0,6.0,93.0,18568.0,11961.0,6607.0,...,0,0,0,0,0,0,1,0,0,6.5185
3,3,1,2054.0,1656.0,651.0,34.0,94.0,7376.0,4802.0,2574.0,...,0,0,0,0,0,0,1,0,0,8.393
4,4,1,10245.0,5251.0,1479.0,18.0,87.0,6076.0,5183.0,893.0,...,0,0,0,0,0,0,1,0,0,7.9745
5,5,1,30975.0,17515.0,6454.0,23.0,76.0,34752.0,29498.0,5254.0,...,0,0,0,0,0,0,1,0,0,8.7875


In [29]:
# Remove the CollegeID column so it is not among the columns later passed to
# the clustering model.
# Reassigning (instead of inplace=True) and ignoring an already-missing column
# keeps this cell safe to re-run: the in-place drop raised KeyError the second
# time because the column was already gone.
colleges_df = colleges_df.drop(columns=["CollegeID"], errors="ignore")
colleges_df.head()

Unnamed: 0,State,Applicants_total,Admissions_total,Enrolled_total,Percent_of_freshmen_submitting_SAT_scores,Percent_of_freshmen_submitting_ACT_scores,"Estimated_enrollment,_total","Estimated_enrollment,_full_time","Estimated_enrollment,_part_time","Estimated_undergraduate_enrollment,_total",...,Far_West_Region,Great_Lakes_Region,Mid_East_Region,New_England_Region,Plains_Region,Rocky_Mountains_Region,Southeast_Region,Southwest_Region,US_Service_Schools_Region,average_tuition
0,1,6142.0,5521.0,1104.0,15.0,88.0,5024.0,4442.0,582.0,4055.0,...,0,0,0,0,0,0,1,0,0,6.748
1,1,5689.0,4934.0,1773.0,6.0,93.0,18568.0,11961.0,6607.0,11502.0,...,0,0,0,0,0,0,1,0,0,6.5185
3,1,2054.0,1656.0,651.0,34.0,94.0,7376.0,4802.0,2574.0,5696.0,...,0,0,0,0,0,0,1,0,0,8.393
4,1,10245.0,5251.0,1479.0,18.0,87.0,6076.0,5183.0,893.0,5357.0,...,0,0,0,0,0,0,1,0,0,7.9745
5,1,30975.0,17515.0,6454.0,23.0,76.0,34752.0,29498.0,5254.0,29440.0,...,0,0,0,0,0,0,1,0,0,8.7875


In [31]:
# Enrolled_total against average_tuition, before any cluster labels exist
colleges_df.hvplot.scatter(
    x="Enrolled_total",
    y="average_tuition",
)

In [32]:
# Function to cluster and plot dataset
def test_cluster_amount(colleges_df, clusters):
    model = KMeans(n_clusters=clusters, random_state=5)
    model

    # Fitting model
    model.fit(colleges_df)

  # Add a new  column to colleges_df
    colleges_df["clustermod"] = model.labels_  
    


In [33]:
# Label every row with one of two clusters, then split the scatter by label
test_cluster_amount(colleges_df, clusters=2)
colleges_df.hvplot.scatter(
    x="Enrolled_total",
    y="average_tuition",
    by="clustermod",
)

In [34]:
# 3-D scatter of the labelled rows; colour and marker symbol both follow "clustermod"
fig = px.scatter_3d(
    data_frame=colleges_df,
    x="Enrolled_total",
    y="average_tuition",
    z="Applicants_total",
    color="clustermod",
    symbol="clustermod",
    width=800,
)
# Pin the legend position at x=0, y=1
fig.update_layout(legend={"x": 0, "y": 1})
fig.show()

In [37]:
# Re-label the rows with seven clusters, then show the 2-D and 3-D views
test_cluster_amount(colleges_df, 7)

# Jupyter only renders a bare expression when it is the LAST one in the cell.
# This scatter sits in the middle of the cell, so without an explicit
# display() call it was built and then silently discarded.
# (`display` is provided by IPython in the notebook namespace.)
display(
    colleges_df.hvplot.scatter(x="Enrolled_total", y="average_tuition", by="clustermod")
)

# 3-D scatter; colour and marker symbol both follow "clustermod"
fig = px.scatter_3d(
    colleges_df,
    x="Enrolled_total",
    y="average_tuition",
    z="Applicants_total",
    color="clustermod",
    symbol="clustermod",
    width=800,
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

In [38]:
# Candidate cluster counts to evaluate: 1 through 19
k = list(range(1, 20))

# Empty list that the elbow loop fills with one inertia value per entry of k
inertia = []

In [39]:
# Looking for the best K
# Earlier cells add a "clustermod" label column to colleges_df. Leave it out
# so the inertia values are computed on the feature columns only, not on
# labels from a previous clustering run.
elbow_features = colleges_df.drop(columns=["clustermod"], errors="ignore")

# Start from an empty list every time this cell runs. Appending to the list
# created in another cell made `inertia` grow past len(k) on a re-run, which
# then breaks the elbow DataFrame (columns of unequal length).
inertia = []
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(elbow_features)
    inertia.append(km.inertia_)


KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=5.



In [40]:
# Tabulate inertia against k and draw the Elbow Curve with hvPlot
elbow_data = dict(k=k, inertia=inertia)
colleges_df_elbow = pd.DataFrame(data=elbow_data)
colleges_df_elbow.hvplot.line(
    x="k",
    y="inertia",
    title="Elbow Curve",
    xticks=k,
)

In [41]:
def get_clusters(k, data):
    """Return a copy of ``data`` with a K-Means label column added.

    Parameters
    ----------
    k : int
        Number of clusters to fit.
    data : pandas.DataFrame
        Table to cluster. It is not modified. Every column other than
        ``"clustermod"`` is passed to the model as a feature.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` whose ``"clustermod"`` column holds the fitted
        label of each row (added, or overwritten if it already existed).
    """
    # Work on a copy so the caller's DataFrame is left untouched.
    clustered = data.copy()

    # The input may already carry labels from an earlier clustering cell.
    # Exclude them so old labels are not used as a feature for the new fit.
    features = clustered.drop(columns=["clustermod"], errors="ignore")

    # Fixed random_state so repeated runs give the same labels.
    model = KMeans(n_clusters=k, random_state=0)
    model.fit(features)

    # labels_ already holds the cluster of every fitted row, so the separate
    # predict() pass the original made (and never used) is not needed.
    clustered["clustermod"] = model.labels_

    return clustered


In [42]:
# Cluster a copy of the data into seven groups and preview the labelled rows
seven_clusters = get_clusters(k=7, data=colleges_df)
seven_clusters.head()

Unnamed: 0,State,Applicants_total,Admissions_total,Enrolled_total,Percent_of_freshmen_submitting_SAT_scores,Percent_of_freshmen_submitting_ACT_scores,"Estimated_enrollment,_total","Estimated_enrollment,_full_time","Estimated_enrollment,_part_time","Estimated_undergraduate_enrollment,_total",...,Great_Lakes_Region,Mid_East_Region,New_England_Region,Plains_Region,Rocky_Mountains_Region,Southeast_Region,Southwest_Region,US_Service_Schools_Region,average_tuition,clustermod
0,1,6142.0,5521.0,1104.0,15.0,88.0,5024.0,4442.0,582.0,4055.0,...,0,0,0,0,0,1,0,0,6.748,0
1,1,5689.0,4934.0,1773.0,6.0,93.0,18568.0,11961.0,6607.0,11502.0,...,0,0,0,0,0,1,0,0,6.5185,4
3,1,2054.0,1656.0,651.0,34.0,94.0,7376.0,4802.0,2574.0,5696.0,...,0,0,0,0,0,1,0,0,8.393,0
4,1,10245.0,5251.0,1479.0,18.0,87.0,6076.0,5183.0,893.0,5357.0,...,0,0,0,0,0,1,0,0,7.9745,0
5,1,30975.0,17515.0,6454.0,23.0,76.0,34752.0,29498.0,5254.0,29440.0,...,0,0,0,0,0,1,0,0,8.7875,2


In [44]:
# Plot the seven-cluster result as a 2-D scatter and a 3-D scatter

# Jupyter only renders a bare expression when it is the LAST one in the cell.
# This scatter sits in the middle of the cell, so without an explicit
# display() call it was built and then silently discarded.
# (`display` is provided by IPython in the notebook namespace.)
display(
    seven_clusters.hvplot.scatter(x="Enrolled_total", y="average_tuition", by="clustermod")
)

# 3-D scatter; colour and marker symbol both follow "clustermod"
fig = px.scatter_3d(
    seven_clusters,
    x="Enrolled_total",
    y="average_tuition",
    z="Applicants_total",
    color="clustermod",
    symbol="clustermod",
    width=800,
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()