# Importing necessary libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os


#validation
from sklearn.metrics import calinski_harabasz_score

: 

# Loading the Data Set

In [None]:
# read the raw survey export into a dataframe
data_df = pd.read_csv(filepath_or_buffer='data-final.csv')

: 

# Data Preprocessing

In [None]:
# the second entry of the shape tuple is the column count
num_columns = data_df.shape[1]

# report it
print("Number of columns:", num_columns)

: 

In [None]:
# keep the first 50 columns plus the one column that originally sat at
# position 107 (it is referred to as 'country' later in the notebook)
data_df.drop(columns=data_df.columns[50:107], inplace=True)
data_df.drop(columns=data_df.columns[51:], inplace=True)

# show the first 10 rows of the new dataframe
data_df.head(n=10)

: 

In [None]:
# one row per survey response, so the row count is the participant count
print('Number of participants: ', data_df.shape[0])

: 

In [None]:
# report how many cells are empty, then discard every row containing one

print('Number of missing values: ', data_df.isna().values.sum())
data_df.dropna(axis=0, inplace=True)

: 

In [None]:
# build the modelling frame: the null-free data without the 'country' column

clean_df = data_df.drop(columns='country')
columns = clean_df.columns.tolist()

: 

In [None]:
# preview the first 10 rows of the cleaned data
clean_df.head(n=10)

: 

In [None]:
# persist the cleaned data (without the index) so it can be reloaded later

clean_df.to_csv(path_or_buf='clean_data.csv', index=False)

: 

In [None]:
# read the cleaned data back from disk and preview the first 10 rows

df = pd.read_csv(filepath_or_buffer='clean_data.csv')
df.head(n=10)

: 

In [None]:
# Question text for every survey item, keyed by column name and grouped by
# the prefix of the column (EXT, EST, AGR, CSN, OPN)
ext_questions = {
    "EXT1": "I am the life of the party",
    "EXT2": "I dont talk a lot",
    "EXT3": "I feel comfortable around people",
    "EXT4": "I keep in the background",
    "EXT5": "I start conversations",
    "EXT6": "I have little to say",
    "EXT7": "I talk to a lot of different people at parties",
    "EXT8": "I dont like to draw attention to myself",
    "EXT9": "I dont mind being the center of attention",
    "EXT10": "I am quiet around strangers",
}

est_questions = {
    "EST1": "I get stressed out easily",
    "EST2": "I am relaxed most of the time",
    "EST3": "I worry about things",
    "EST4": "I seldom feel blue",
    "EST5": "I am easily disturbed",
    "EST6": "I get upset easily",
    "EST7": "I change my mood a lot",
    "EST8": "I have frequent mood swings",
    "EST9": "I get irritated easily",
    "EST10": "I often feel blue",
}

agr_questions = {
    "AGR1": "I feel little concern for others",
    "AGR2": "I am interested in people",
    "AGR3": "I insult people",
    "AGR4": "I sympathize with others feelings",
    "AGR5": "I am not interested in other peoples problems",
    "AGR6": "I have a soft heart",
    "AGR7": "I am not really interested in others",
    "AGR8": "I take time out for others",
    "AGR9": "I feel others emotions",
    "AGR10": "I make people feel at ease",
}

csn_questions = {
    "CSN1": "I am always prepared",
    "CSN2": "I leave my belongings around",
    "CSN3": "I pay attention to details",
    "CSN4": "I make a mess of things",
    "CSN5": "I get chores done right away",
    "CSN6": "I often forget to put things back in their proper place",
    "CSN7": "I like order",
    "CSN8": "I shirk my duties",
    "CSN9": "I follow a schedule",
    "CSN10": "I am exacting in my work",
}

opn_questions = {
    "OPN1": "I have a rich vocabulary",
    "OPN2": "I have difficulty understanding abstract ideas",
    "OPN3": "I have a vivid imagination",
    "OPN4": "I am not interested in abstract ideas",
    "OPN5": "I have excellent ideas",
    "OPN6": "I do not have a good imagination",
    "OPN7": "I am quick to understand things",
    "OPN8": "I use difficult words",
    "OPN9": "I spend time reflecting on things",
    "OPN10": "I am full of ideas",
}


# column names of each question group, selected by their shared prefix

EXT = [name for name in df.columns if name.startswith('EXT')]
EST = [name for name in df.columns if name.startswith('EST')]
AGR = [name for name in df.columns if name.startswith('AGR')]
CSN = [name for name in df.columns if name.startswith('CSN')]
OPN = [name for name in df.columns if name.startswith('OPN')]

: 

# Exploratory Data Analysis

In [None]:
# Participants' Nationality Distribution

# number of responses per country
countries = pd.DataFrame(data_df['country'].value_counts())
# value_counts() names its result differently across pandas versions
# ('country' before 2.0, 'count' from 2.0 on), so look the column up
# instead of hard-coding its name
count_col = countries.columns[0]
# keep only the countries with at least 5000 responses
countries_5000 = countries[countries[count_col] >= 5000]
plt.figure(figsize=(15,5))
sns.barplot(data=countries_5000, x=countries_5000.index, y=count_col)
plt.title('Participants\' Nationality Distribution')
# countries are on the x axis, response counts on the y axis
plt.xlabel('Nationality')
plt.ylabel('Participants');

: 

In [None]:
# rows left in the reloaded, cleaned dataset
print('Number of responses after handling missing values: ', df.shape[0])

: 

In [None]:
# function to visualize the answer distribution for the questions

def vis_questions(groupname, questions, color):
    """Draw one histogram of answers per question of a question group.

    The histograms are placed on a new 40x60 figure laid out as a 10x5 grid,
    which has room for up to 50 questions.

    Args:
        groupname: Sequence of column names of the global ``df``, one per
            question to plot.
        questions: Mapping from column name to the question text shown as the
            subplot title.
        color: Matplotlib color used for the histogram bars.

    Raises:
        KeyError: If a column is missing from ``df`` or from ``questions``.
    """
    plt.figure(figsize=(40,60))
    # walk over the columns actually supplied rather than assuming exactly ten
    for position, column in enumerate(groupname, start=1):
        plt.subplot(10,5,position)
        plt.hist(df[column], bins=14, color=color, alpha=.5)
        plt.title(questions[column], fontsize=18)

: 

In [None]:
# OPENNESS: answer histograms for the OPN questions

print('Q&As Related to Openness Personality\n')
vis_questions(groupname=OPN, questions=opn_questions, color='orange')

: 

In [None]:
# CONSCIENTIOUSNESS: answer histograms for the CSN questions

print('Q&As Related to Conscientious Personality\n')
vis_questions(groupname=CSN, questions=csn_questions, color='purple')

: 

In [None]:
# EXTROVERSION: answer histograms for the EXT questions

print('Q&As Related to Extroversion Personality\n')
vis_questions(groupname=EXT, questions=ext_questions, color='red')

: 

In [None]:
# AGREEABLENESS: answer histograms for the AGR questions

print('Q&As Related to Agreeable Personality\n')
vis_questions(groupname=AGR, questions=agr_questions, color='blue')

: 

In [None]:
# NEUROTICISM: answer histograms for the EST questions

print('Q&As Related to Neuroticism Personality\n')
vis_questions(groupname=EST, questions=est_questions, color='green')

: 

# K-means Clustering

## Elbow Visualization

In [None]:
# For ease of calculation let's scale all the values between 0-1 and use the first 5000 rows as a sample
from sklearn.preprocessing import MinMaxScaler

# take the labels from the frame that is being scaled, so they always match
# the scaled array even when clean_df has gained extra columns (e.g. 'cluster')
columns = list(df.columns)

scaler = MinMaxScaler(feature_range=(0,1))
# fit_transform returns a plain array; wrap it back into a labelled dataframe
df = pd.DataFrame(scaler.fit_transform(df), columns=columns)
df_sample = df[:5000]

: 

In [None]:
# Visualize the elbow
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer

# try every k in the range given by k=(2,15) on the scaled sample
kmeans = KMeans()
visualizer = KElbowVisualizer(kmeans, k=(2,15))
visualizer.fit(df_sample)
# show() is the current name of the render call; poof() was deprecated in
# Yellowbrick 1.0
visualizer.show()

: 

**As the elbow plot shows, 5 clusters looks optimal for this data set.**

In [None]:
# create the K-means Cluster Model
from sklearn.cluster import KMeans

# define 5 clusters; the fixed seed makes the cluster numbering reproducible,
# which matters because the labels and the model are saved to disk later on
kmeans = KMeans(n_clusters=5, random_state=42)
# fit on the answer columns only: when this cell is re-run, clean_df already
# carries the 'cluster' column of the previous fit, which must not be a feature
k_fit = kmeans.fit(clean_df.drop(columns='cluster', errors='ignore'))

: 

In [None]:
# label every response with the cluster the fitted model assigned to it

predictions = k_fit.labels_
clean_df['cluster'] = predictions

# preview the first 25 labelled responses
clean_df.head(n=25)

: 

In [None]:
# store the responses together with their cluster labels
clean_df.to_csv(path_or_buf='Personality_clusters.csv', index=False)

: 

## Analysing the Model and Predictions

In [None]:
# how many responses fall into each cluster

clean_df['cluster'].value_counts()

: 

In [None]:
# mean answer to every question, computed separately for each cluster

# widen the display limit so all columns are shown
pd.set_option('display.max_columns', 150)
clean_df.groupby(by='cluster').mean()

: 

In [None]:
# average answer per question group for every response, then per cluster

# the question columns are taken by position, ten consecutive columns per group
col_list = list(clean_df)
ext, est, agr, csn, opn = (col_list[start:start + 10] for start in range(0, 50, 10))

# (output column, question columns) in the order the columns should appear
trait_layout = [
    ('openness', opn),
    ('conscientiousness', csn),
    ('extraversion', ext),
    ('agreeableness', agr),
    ('neuroticism', est),
]

data = pd.DataFrame()
for trait_name, trait_cols in trait_layout:
    # sum of the ten answers divided by ten
    data[trait_name] = clean_df[trait_cols].sum(axis=1) / 10
data['cluster'] = predictions
data.groupby(by='cluster').mean()

: 

In [None]:
# one panel per cluster showing its mean score on each of the five traits

dataclusters = data.groupby(by='cluster').mean()
trait_names = dataclusters.columns

plt.figure(figsize=(22, 3))
for panel in range(5):
    cluster_means = dataclusters.iloc[panel]
    plt.subplot(1, 5, panel + 1)
    # bars and a line drawn over the same values
    plt.bar(trait_names, cluster_means, color='green', alpha=0.2)
    plt.plot(trait_names, cluster_means, color='red')
    plt.title('Cluster ' + str(panel))
    plt.xticks(rotation=45)
    plt.ylim(0, 4)

print(dataclusters)

: 

## Visualizing the Clusters

In [None]:
# project the responses onto their first two principal components so the
# k-means clusters can be drawn in a 2-D scatter plot
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
# leave out the 'cluster' column: it is the label produced by the model, not a
# survey answer, and including it would leak the labels into the projection
pca_fit = pca.fit_transform(clean_df.drop(columns='cluster', errors='ignore'))

# attach the cluster labels to the projected points for colouring
df_pca = pd.DataFrame(data=pca_fit, columns=['PCA1', 'PCA2'])
df_pca['cluster'] = predictions
df_pca.head()

: 

In [None]:
# scatter plot of the two principal components, coloured by cluster

plt.figure(figsize=(10, 10))
scatter_options = dict(x='PCA1', y='PCA2', hue='cluster', palette='Set2', alpha=0.8)
sns.scatterplot(data=df_pca, **scatter_options)
plt.title('Personality Clusters after PCA');

: 

# Saving the model

In [None]:
import pickle
from sklearn.cluster import KMeans

# serialise the fitted K-means model so it can be reloaded without retraining
with open('kmeans_model.pkl', mode='wb') as model_file:
    pickle.dump(k_fit, model_file)

print("K-means model saved successfully.")

: 

In [None]:
#######################################################################################################################

: 

In [None]:
# connecting to google drive (requires the google.colab package)

from google.colab import drive
# mount the Drive under /content/drive
drive.mount('/content/drive')

# IPython magic (not plain Python): change the working directory to the
# project folder on the mounted Drive
%cd /content/drive/MyDrive/Intellihire

: 

In [None]:
# read the responses file from the mounted Drive; header=None means the first
# line is treated as data, not as column names

responses = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/Intellihire/Responses.csv', header=None)
responses

: 

In [None]:
import pickle

# Load the model from the pickle file; the context manager closes the file
# handle, which a bare open() inside pickle.load() never does.
# NOTE: unpickling executes code stored in the file - only load files you trust.
with open('kmeans_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# predict the cluster of one sample response (50 answers, one per question)
print(model.predict([[4,3,2,1,3,3,2,2,1,4,3,5,5,4,5,5,5,4,4,4,5,3,3,4,5,4,3,2,1,3,3,2,2,1,4,3,5,5,4,5,5,5,4,4,4,5,3,3,4,5]]))

: 