In [28]:
# Import necessary libraries
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA

In [None]:
# Load dataset (Ensure you have 'data-final.csv' in your working directory)
data_raw = pd.read_csv('/content/data-final.csv', sep='\t')
data = data_raw.copy()

# Display all columns
pd.options.display.max_columns = 150

# Drop unnecessary columns (Retaining first 50 personality trait columns)
data.drop(data.columns[50:107], axis=1, inplace=True)
data.drop(data.columns[51:], axis=1, inplace=True)

# Display number of participants and sample data
print('Number of participants:', len(data))
data.head()


In [32]:
# Drop missing values
data.dropna(inplace=True)

# Define Personality Trait Groups
EXT = [col for col in data if col.startswith('EXT')]  # Extraversion
EST = [col for col in data if col.startswith('EST')]  # Neuroticism
AGR = [col for col in data if col.startswith('AGR')]  # Agreeableness
CSN = [col for col in data if col.startswith('CSN')]  # Conscientiousness
OPN = [col for col in data if col.startswith('OPN')]  # Openness

# Select all 50 personality trait columns
personality_traits = EXT + EST + AGR + CSN + OPN

# Features (X)
X = data[personality_traits]

# Standardize the Data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [33]:
# Apply K-Means Clustering to create job categories
num_clusters = 5  # Define the number of job categories
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
data['Job_Category'] = kmeans.fit_predict(X_scaled)

# Convert Job Categories to Strings
data['Job_Category'] = data['Job_Category'].astype(str)

# Display the first few rows with assigned job categories
data[['Job_Category'] + personality_traits].head()


Unnamed: 0,Job_Category,EXT1,EXT2,EXT3,EXT4,EXT5,EXT6,EXT7,EXT8,EXT9,EXT10,EST1,EST2,EST3,EST4,EST5,EST6,EST7,EST8,EST9,EST10,AGR1,AGR2,AGR3,AGR4,AGR5,AGR6,AGR7,AGR8,AGR9,AGR10,CSN1,CSN2,CSN3,CSN4,CSN5,CSN6,CSN7,CSN8,CSN9,CSN10,OPN1,OPN2,OPN3,OPN4,OPN5,OPN6,OPN7,OPN8,OPN9,OPN10
0,0,4.0,1.0,5.0,2.0,5.0,1.0,5.0,2.0,4.0,1.0,1.0,4.0,4.0,2.0,2.0,2.0,2.0,2.0,3.0,2.0,2.0,5.0,2.0,4.0,2.0,3.0,2.0,4.0,3.0,4.0,3.0,4.0,3.0,2.0,2.0,4.0,4.0,2.0,4.0,4.0,5.0,1.0,4.0,1.0,4.0,1.0,5.0,3.0,4.0,5.0
1,2,3.0,5.0,3.0,4.0,3.0,3.0,2.0,5.0,1.0,5.0,2.0,3.0,4.0,1.0,3.0,1.0,2.0,1.0,3.0,1.0,1.0,4.0,1.0,5.0,1.0,5.0,3.0,4.0,5.0,3.0,3.0,2.0,5.0,3.0,3.0,1.0,3.0,3.0,5.0,3.0,1.0,2.0,4.0,2.0,3.0,1.0,4.0,2.0,5.0,3.0
2,0,2.0,3.0,4.0,4.0,3.0,2.0,1.0,3.0,2.0,5.0,4.0,4.0,4.0,2.0,2.0,2.0,2.0,2.0,1.0,3.0,1.0,4.0,1.0,4.0,2.0,4.0,1.0,4.0,4.0,3.0,4.0,2.0,2.0,2.0,3.0,3.0,4.0,2.0,4.0,2.0,5.0,1.0,2.0,1.0,4.0,2.0,5.0,3.0,4.0,4.0
3,2,2.0,2.0,2.0,3.0,4.0,2.0,2.0,4.0,1.0,4.0,3.0,3.0,3.0,2.0,3.0,2.0,2.0,2.0,4.0,3.0,2.0,4.0,3.0,4.0,2.0,4.0,2.0,4.0,3.0,4.0,2.0,4.0,4.0,4.0,1.0,2.0,2.0,3.0,1.0,4.0,4.0,2.0,5.0,2.0,3.0,1.0,4.0,4.0,3.0,3.0
4,0,3.0,3.0,3.0,3.0,5.0,3.0,3.0,5.0,3.0,4.0,1.0,5.0,5.0,3.0,1.0,1.0,1.0,1.0,3.0,2.0,1.0,5.0,1.0,5.0,1.0,3.0,1.0,5.0,5.0,3.0,5.0,1.0,5.0,1.0,3.0,1.0,5.0,1.0,5.0,5.0,5.0,1.0,5.0,1.0,5.0,1.0,5.0,3.0,5.0,5.0


In [34]:
# Define Job Category Mapping Based on Clusters
job_mapping = {
    "0": "Marketing & Sales",
    "1": "Psychology & Arts",
    "2": "Healthcare & HR",
    "3": "Finance & Engineering",
    "4": "Research & Data Science"
}

# Assign human-readable job categories
data['Job_Category'] = data['Job_Category'].map(job_mapping)

# Show updated job category distribution
print(data['Job_Category'].value_counts())


Job_Category
Marketing & Sales          282678
Healthcare & HR            279328
Psychology & Arts          237676
Finance & Engineering      208999
Research & Data Science      4800
Name: count, dtype: int64


In [35]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, data['Job_Category'], test_size=0.2, random_state=42, stratify=data['Job_Category'])

# Train KNN Model on full features
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

# Make Predictions
y_pred = knn.predict(X_test)

# Evaluate Model Performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy (No PCA): {accuracy * 100:.2f}%")


Model Accuracy (No PCA): 86.57%


In [37]:
# Distribution of Job Categories
fig = px.bar(data['Job_Category'].value_counts(),
             labels={'index': 'Job Category', 'value': 'Count'},
             title="Distribution of Job Categories")
fig.show()


In [36]:
# Example Personality Scores (Modify as Needed)
new_user = np.array([[7, 8, 6, 7, 5]])  # Example scores for 5 traits
new_user = np.pad(new_user[0], (0, len(personality_traits) - len(new_user[0])), 'constant', constant_values=0)
new_user = new_user.reshape(1, -1)

# Standardize the New User's Data
new_user_scaled = scaler.transform(new_user)

# Predict Job Category for New User
recommended_job = knn.predict(new_user_scaled)[0]

print(f"Recommended Job: {recommended_job}")


Recommended Job: Research & Data Science



X does not have valid feature names, but StandardScaler was fitted with feature names

