# Explore OCEAN features: EDA, Clustering and PCA/Visualization

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

import os
# Print the full path of every file found below the Kaggle input directory
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# Data loading and preparation

In [None]:
# Read the personality data CSV into a data frame and preview its first rows
csv_path = '../input/top-personality-dataset/2018-personality-data.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Dimensions of the loaded table as (rows, columns)
df.shape

In [None]:
# Column labels exactly as parsed from the CSV header
df.columns

In [None]:
# Strip stray whitespace from every column name.
# The CSV header pads most names with a leading space (and at least one,
# ' enjoy_watching ', with a trailing space as well). Applying str.strip to
# each label normalizes all of them without maintaining a hand-written
# old-name -> new-name map; labels that are already clean are left unchanged.
df = df.rename(columns=str.strip)

In [None]:
# Re-inspect the column labels after the rename step
df.columns

# EDA

In [None]:
# Names of the five personality trait columns analysed below
ocean = [
    'openness',
    'agreeableness',
    'emotional_stability',
    'conscientiousness',
    'extraversion',
]

In [None]:
# Pairwise scatter plots of the trait columns, each with a fitted regression
# line; points are drawn nearly transparent and the line in magenta.
reg_plot_style = {
    'line_kws': {'color': 'magenta'},
    'scatter_kws': {'alpha': 0.1},
}
sns.pairplot(df[ocean], kind='reg', plot_kws=reg_plot_style)
plt.show()

In [None]:
# Pairwise correlation matrix of the trait columns
trait_scores = df[ocean]
cor_ocean = trait_scores.corr()
cor_ocean

In [None]:
# Draw the trait correlation matrix as a heatmap.
# The figure size is set through rcParams, so it also applies to later figures.
plt.rcParams.update({'figure.figsize': (6, 5)})
sns.heatmap(cor_ocean, cmap=plt.cm.plasma)
plt.show()

# Clustering

In [None]:
# Standardize the trait columns: learn the scaling parameters first, then
# apply them to the same data.
scaler = StandardScaler()
scaler.fit(df[ocean])
df_scaled = scaler.transform(df[ocean])
df_scaled

In [None]:
# Configure k-means: three clusters, randomly chosen initial centroids,
# ten initializations, at most 300 iterations each, and a fixed seed.
n_cl = 3
kmeans = KMeans(
    n_clusters=n_cl,
    init="random",
    n_init=10,
    max_iter=300,
    random_state=99,
)
# Fit the model on the standardized trait matrix
kmeans.fit(df_scaled)

In [None]:
# show cluster centers
# (coordinates are in the standardized feature space, because the model
# was fit on df_scaled rather than on the raw trait scores)
kmeans.cluster_centers_

In [None]:
# Store each row's cluster assignment in the data frame.
# NOTE(review): labels are cast to object dtype rather than kept as integers,
# presumably so downstream plots treat them as categories — confirm.
cluster_labels = kmeans.labels_.astype('object')
df['cluster'] = cluster_labels

# PCA to visualize clusters

In [None]:
# Project the trait columns onto their first three principal components.
df4pca = df[ocean]
# standardize first
df4pca_std = StandardScaler().fit_transform(df4pca)
# define 3D PCA
pc_model = PCA(n_components=3)
# apply PCA
pc = pc_model.fit_transform(df4pca_std)
# Convert to a data frame that shares df's row index. Column assignment
# between pandas objects aligns on index labels, so reusing df.index keeps
# the cluster labels below (and any later copy of the components back into
# df) matched row for row even if df does not carry a default 0..n-1 index.
df_pc = pd.DataFrame(data=pc, columns=['pc_1', 'pc_2', 'pc_3'], index=df.index)
# attach each row's cluster label
df_pc['cluster'] = df['cluster']
# and look at result
df_pc.head()

In [None]:
# Copy the three principal-component columns into the original data frame,
# so all data lives in one place.
for pc_col in ('pc_1', 'pc_2', 'pc_3'):
    df[pc_col] = df_pc[pc_col]
df.head()

In [None]:
# Interactive 3D scatter of the principal components: marker colour encodes
# the cluster, marker size the 'enjoy_watching' value, and hovering a point
# also shows its userid.
fig = px.scatter_3d(
    df,
    x='pc_1',
    y='pc_2',
    z='pc_3',
    color='cluster',
    size='enjoy_watching',
    hover_data=['userid'],
    opacity=0.5,
    title='PCA 3D',
)
fig.show()