In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the preprocessed RIASEC dataset into the working frame.
df = pd.read_csv(filepath_or_buffer='../datasets/RIASEC_Dataset_preprocessed.csv')


In [3]:
# Split the optional 'major' column off from the feature columns.
# When the column is absent there are no labels and every column is a feature.
has_major = 'major' in df.columns
major_labels = df['major'] if has_major else None
df_features = df.drop(columns=['major']) if has_major else df.copy()

In [4]:
# Standardise the float64/int64 feature columns; other dtypes are left out
# of the scaled matrix.
numeric_features = df_features.select_dtypes(include=['float64', 'int64'])
scaler = StandardScaler()
X_scaled = scaler.fit_transform(numeric_features)


In [5]:
# Build the final feature matrix: the scaled numeric block, plus TF-IDF
# features derived from the object-dtype (text) columns when any exist.
text_cols = df_features.select_dtypes(include=['object']).columns
if len(text_cols) > 0:
    print(f"Đang vectorize các cột text: {list(text_cols)}")
    # Merge every text column into one document per row so that no column
    # reported in the message is silently ignored. Missing values become
    # empty strings because TfidfVectorizer rejects NaN documents.
    # With a single, fully populated text column this is identical to
    # vectorising that column directly.
    text_docs = (
        df_features[text_cols]
        .fillna('')
        .astype(str)
        .agg(' '.join, axis=1)
    )
    vectorizer = TfidfVectorizer(stop_words='english')
    X_text = vectorizer.fit_transform(text_docs)
    # Densify the sparse TF-IDF matrix so it can be stacked column-wise
    # with the dense scaled array.
    X_text_dense = X_text.toarray()
    X_all = np.hstack([X_scaled, X_text_dense])
else:
    # No text columns: the scaled numeric block is the whole feature matrix.
    X_all = X_scaled

In [6]:
# Number of clusters to fit (fixed here, not derived in this notebook).
optimal_k = 20
print(f"Đang phân cụm với k={optimal_k}")

# Fit KMeans with a fixed seed and store each row's cluster id on df.
kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
cluster_ids = kmeans.fit_predict(X_all)
df['major_cluster'] = cluster_ids

Đang phân cụm với k=20


In [7]:
# A direct scatter plot is only drawn when the feature matrix has exactly
# two columns; otherwise this cell does nothing.
if X_all.shape[1] == 2:
    fig, ax = plt.subplots(figsize=(8, 5))
    # Data points, coloured by their assigned cluster.
    ax.scatter(X_all[:, 0], X_all[:, 1], c=df['major_cluster'], cmap='rainbow')
    # Fitted cluster centres drawn on top as black crosses.
    centers = kmeans.cluster_centers_
    ax.scatter(centers[:, 0], centers[:, 1],
               color='black', marker='x', label='Cluster Centers')
    ax.set_title(f'KMeans Clustering (k={optimal_k})')
    ax.set_xlabel('Feature 1')
    ax.set_ylabel('Feature 2')
    ax.legend()
    plt.show()

In [8]:
# When labels are available, print the five most frequent majors per cluster.
if major_labels is not None:
    df['major'] = major_labels
    for cluster_id in range(optimal_k):
        in_cluster = df['major_cluster'] == cluster_id
        top_majors = df.loc[in_cluster, 'major'].value_counts().head(5)
        print(f"\n--- Cluster {cluster_id} ---")
        print(top_majors)


--- Cluster 0 ---
major
psychology                191
computer science          161
biology                   155
engineering               114
mechanical engineering    104
Name: count, dtype: int64

--- Cluster 1 ---
major
psychology     340
business       313
economics      136
marketing       95
engineering     87
Name: count, dtype: int64

--- Cluster 2 ---
major
business                   424
psychology                 284
accounting                 252
business administration    119
finance                    117
Name: count, dtype: int64

--- Cluster 3 ---
major
psychology    935
nursing       243
business      171
biology       171
education     119
Name: count, dtype: int64

--- Cluster 4 ---
major
psychology          247
business            151
english             118
computer science     75
marketing            62
Name: count, dtype: int64

--- Cluster 5 ---
major
psychology          382
biology             251
nursing             144
computer science     80
business      

In [None]:
# Write df to CSV without the index column, then confirm on stdout.
df.to_csv(path_or_buf='../datasets/RIASEC_Dataset_clustered_v2.csv', index=False)
print("✅ Đã lưu file: ../datasets/RIASEC_Dataset_clustered_v2.csv")