# There are K types of people in the world

## Load Libraries

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression

## Clean Data

In [None]:
# Negatively keyed ("reversed") items, taken from the IPIP 50-item scoring key:
# https://ipip.ori.org/new_ipip-50-item-scale.htm
# EST is scored as Emotional Stability, so every "distress" item is reversed.
# That includes EST5 ("I am easily disturbed"), which was missing from this
# list and was therefore being added to the EST total in the wrong direction.
reversed_items = [
    # Extraversion
    "EXT2", "EXT4", "EXT6", "EXT8", "EXT10",
    # Emotional Stability
    "EST1", "EST3", "EST5", "EST6", "EST7", "EST8", "EST9", "EST10",
    # Agreeableness
    "AGR1", "AGR3", "AGR5", "AGR7",
    # Conscientiousness
    "CSN2", "CSN4", "CSN6", "CSN8",
    # Openness
    "OPN2", "OPN4", "OPN6",
]

In [None]:
# Load the tab-separated survey dump and keep only the per-item answer
# columns (names made of a trait prefix followed by digits, e.g. EXT1).
raw_responses = pd.read_csv(
    "../input/big-five-personality-test/IPIP-FFM-data-8Nov2018/data-final.csv",
    sep="\t",
)
responses = raw_responses.filter(regex='^(EXT|EST|AGR|CSN|OPN)[0-9]+$')

# Discard rows with no answers at all, then treat a 0 as "missing" and keep
# only rows where every item has a value.
responses = responses.dropna(how='all')
responses = responses.replace(0, np.nan)
responses = responses.dropna(how='any')  # Note to Self: Try averaging the 0

# Reverse-code the negatively keyed items (6 - x maps 1<->5 and 2<->4;
# assumes answers are on a 1-5 scale).
responses[reversed_items] = 6 - responses[reversed_items]

# Add one total column per trait: the row-wise sum of its items 1..10.
for cat in ['EXT', 'EST', 'AGR', 'CSN', 'OPN']:
    responses[cat] = responses[[f"{cat}{i}" for i in range(1, 11)]].sum(axis=1)

In [None]:
# Narrow the frame to the five per-trait total columns used for clustering.
mini_responses = responses.filter(items=["EXT", "EST", "AGR", "CSN", "OPN"])

## Identify Reasonable Number of Clusters with KMeans

In [None]:
# Materialise the trait totals as a standalone NumPy array (an explicit copy,
# which is also np.array's default) to feed to KMeans.
responses_array = np.array(mini_responses, copy=True)

# Shared KMeans settings, unpacked into every KMeans(...) call below so the
# elbow search and the final model are configured identically.
kmeans_kwargs = dict(
    init="random",
    n_init=5,
    max_iter=300,
    random_state=42,  # fixed seed so reruns give the same clusters
)

In [None]:
# SSE (KMeans inertia) for each candidate number of clusters k = 2..10,
# in order of increasing k; plotted below to look for an elbow.
sse = [
    KMeans(n_clusters=k, **kmeans_kwargs).fit(responses_array).inertia_
    for k in range(2, 11)
]

In [None]:
# NOTE: this cell used to contain only the bare name `sil`, which is not
# assigned anywhere earlier in the notebook and so raised NameError when run.
# Show the SSE values collected by the elbow loop instead.
# TODO: add a silhouette-score comparison (on a subsample) if one was intended.
sse

In [None]:
# Elbow plot: SSE against the number of clusters tried (2..10).
elbow_axes = plt.gca()
elbow_axes.plot(range(2, 11), sse)
elbow_axes.set_xticks(range(2, 11))  # one tick per candidate k
elbow_axes.set_xlabel("Number of Clusters")
elbow_axes.set_ylabel("SSE")
plt.show()

## Categorize

In [None]:
# Fit the final five-cluster model with the shared settings, then read the
# cluster label of every row off the fitted model.
kmeans_model = KMeans(n_clusters=5, **kmeans_kwargs)
kmeans_model.fit(responses_array)
kmeans_predictions = kmeans_model.labels_

In [None]:
# Display the cluster label assigned to each row of responses_array.
kmeans_predictions

In [None]:
# Attach each respondent's cluster label to the trait totals as "category".
mini_responses = mini_responses.assign(category=kmeans_predictions)

In [None]:
# Display the trait totals together with the attached "category" column.
mini_responses

In [None]:
# 5x5 grid of histograms: one row per cluster, one column per trait.
fig, axs = plt.subplots(5, 5, figsize=(20,20))

cols = ["EXT", "EST", "AGR", "CSN", "OPN"]

for i, row_axes in enumerate(axs):
    # Rows of mini_responses that belong to cluster i
    members = mini_responses.category == i
    for j, ax in enumerate(row_axes):
        ax.hist(mini_responses.loc[members, cols[j]], label=j)

        # Title each panel with its cluster and trait
        ax.set_title(f'Group {i} x {cols[j]}')

In [None]:
# Full names of the five traits ("Openess" typo corrected). Note the order
# here differs from the EXT/EST/AGR/CSN/OPN column order used in the plots.
"Extraversion","Agreeableness","Conscientiousness","Emotional Stability","Openness"

In [None]:
# Mean trait totals per cluster, with "category" kept as an ordinary column.
mini_responses.groupby("category", as_index=False).mean()

In [None]:
# Long-format cluster means: one row per (category, trait) pair, with the
# trait name in "variable" and its mean in "value".
target_summary = pd.melt(
    mini_responses.groupby("category", as_index=False).mean(),
    id_vars="category",
)
target_summary

In [None]:
# One bar chart of mean trait totals per cluster. Each chart is titled with
# its cluster id; previously the five charts were indistinguishable.
# Grouping by the bare column name (not a one-element list) keeps the group
# key a plain scalar for the title.
for category, group in target_summary.groupby("category"):
    group.plot.bar(x="variable", y="value", title=f"Group {category}")