The data source and questions can be found here: https://www.kaggle.com/tunguz/big-five-personality-test#codebook.txt <br>
A brief analysis of the company that pulled the data: https://openpsychometrics.org/printable/big-five-personality-test.pdf

#### Imports

In [None]:
import pandas as pd
import numpy as np
import os
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
import geopandas as gpd
import pycountry

#### Querying the personality quiz results.

In [None]:
# Directory that holds the extracted Kaggle dataset.
data_dir = '../input/big-five-personality-test/IPIP-FFM-data-8Nov2018'

# List the dataset directory without spawning a shell; inspect `data_files`
# to see which files are available. (The previous os.popen call left its pipe
# unclosed and threw the listing away.)
data_files = sorted(os.listdir(data_dir))

# The file is tab-separated despite its .csv extension.
path = f'{data_dir}/data-final.csv'
df_full = pd.read_csv(path, sep='\t')

# Show (practically) every column when a frame is displayed.
pd.options.display.max_columns = 999
df_full.tail()

In [None]:
# Work on a deep copy so the raw frame `df_full` stays untouched and can
# still be inspected after the cleaning steps below.
df = df_full.copy(deep=True)

#### Removing rows with null values.

In [None]:
# Zeros are treated as missing values: turn them into NaN, drop every row
# that contains at least one NaN, and renumber the index from 0.
start_rows = df.shape[0]
df = df.replace(0, np.nan)
df = df.dropna(axis=0).reset_index(drop=True)
remove_rows = start_rows - df.shape[0]

# Summary figures for the report printed below.
removed_pct = round(remove_rows / start_rows * 100, 2)
n_countries = len(set(df.country.values))

print('Information:\n')
print(f'Removed {remove_rows:,} rows that had incomplete pieces of data.')
print(f'This was {removed_pct}% of the total data.')
print(f'\nNumber of countries: {n_countries}')

#### In case we want to analyze the rows that we removed

In [None]:
# Rows of the raw data that contain at least one zero in any column.
# `axis` is passed by keyword: the positional form `.any(1)` is deprecated and
# no longer accepted by pandas 2.x.
# NOTE: rows that were dropped only because they contained NaN are not included.
df_zeros = df_full[(df_full == 0).any(axis=1)]

#### Counting the quizzes from each country

In [None]:
# Lookup from pycountry's two-letter (alpha_2) code to its three-letter
# (alpha_3) code; used to relabel the country columns below.
country_dict = {entry.alpha_2: entry.alpha_3 for entry in pycountry.countries}

# One-row frame: a column per country holding its number of responses.
country_counts = pd.DataFrame(df.country.value_counts()).T

# Drop the 'NONE' placeholder column, relabel the remaining columns through
# `country_dict`, and call the single row 'count'.
countries = country_counts.drop(columns='NONE').rename(
    columns=country_dict,
    index={'country': 'count'},
)

countries

#### Log scaling the country counts to make the map visualization's color smoother; the US makes up over half of the quiz results.

In [None]:
# Long format: one row per country code ('iso_a3') with its response count,
# plus the natural log of the count and the rank of the count.
countries_rank = (
    countries.T
    .rename_axis('iso_a3')
    .reset_index()
    .assign(
        count_log=lambda frame: np.log(frame['count']),
        rank=lambda frame: frame['count'].rank(),
    )
)
countries_rank.T

In [None]:
sns.set_style("white")

# NOTE(review): `gpd.datasets` is deprecated in recent geopandas releases; on
# newer versions the Natural Earth file has to be supplied separately.
file = gpd.datasets.get_path('naturalearth_lowres')
world = gpd.read_file(file)
world = world[world.continent != 'Antarctica']

# Left-join the counts onto the country shapes; countries without any
# responses end up with 0 in every count column.
world_map = pd.merge(world, right=countries_rank, how='left', on='iso_a3').fillna(0)

fig, ax = plt.subplots(figsize=(20,10))
ax.set_xticks([])
ax.set_yticks([])
ax.set_title('Assessment Counts of each Country (log scaled)', size=16)

# The former `.drop(159)` was removed: Antarctica is already filtered out by
# continent above, and pd.merge renumbers the index, so the hard-coded label
# 159 no longer referred to a known row (it silently removed an unrelated
# country, or raised KeyError if the label did not exist).
world_map.plot(column='count_log', cmap='Blues', linewidth=0.8, ax=ax, edgecolor='0.8')
sns.set()  # restore the default seaborn theme for the following cells
plt.box(on=None)

#### The questions are phrased in a way that adds or subtracts points from a particular personality trait.
#### Consequently, we need to standardize the results by a simple remap.

In [None]:
pos_questions = [ # positive questions adding to the trait.
    'EXT1','EXT3','EXT5','EXT7','EXT9',
    'EST1','EST3','EST5','EST6','EST7','EST8','EST9','EST10',
    'AGR2','AGR4','AGR6','AGR8','AGR9','AGR10',
    'CSN1','CSN3','CSN5','CSN7','CSN9','CSN10',
    'OPN1','OPN3','OPN5','OPN7','OPN8','OPN9','OPN10',
]
neg_questions = [ # negative (negating) questions subtracting from the trait.
    'EXT2','EXT4','EXT6','EXT8','EXT10',
    'EST2','EST4',
    'AGR1','AGR3','AGR5','AGR7',
    'CSN2','CSN4','CSN6','CSN8',
    'OPN2','OPN4','OPN6',
]

# Re-centre the 1..5 answers on 0: positively phrased items map to -2..2,
# negatively phrased items are mirrored to 2..-2.
pos_scale = {1:-2, 2:-1, 3:0, 4:1, 5:2}
neg_scale = {1:2, 2:1, 3:0, 4:-1, 5:-2}
df[pos_questions] = df[pos_questions].replace(pos_scale)
df[neg_questions] = df[neg_questions].replace(neg_scale)

# Keep only the question columns (alphabetically sorted) plus the country.
# The explicit .copy() makes `df` an independent frame, so the columns added
# to it in later cells do not trigger pandas' SettingWithCopyWarning.
cols = pos_questions + neg_questions
df = df[sorted(cols) + ['country']].copy()
df.tail()

#### Now that we have the questions standardized, we can sum the results of each row.
#### For each of the 5 traits, the 10 question results add up to one sum, giving 5 trait columns.

In [None]:
# Column prefixes used by the question columns, and the readable trait name
# each prefix stands for (same order in both lists).
traits = ['EXT', 'EST', 'AGR', 'CSN', 'OPN']
trait_labels = ['Extroversion', 'Neuroticism', 'Agreeableness', 'Conscientiousness', 'Openness']

# Sum the question columns of every trait into one column named after the
# trait. Writing straight to the final label (instead of creating 'EXT' etc.
# and renaming afterwards) keeps the cell idempotent: re-running it overwrites
# the totals rather than producing duplicate column labels. Columns are matched
# by prefix, so the already-created total columns are never picked up.
for trait, label in zip(traits, trait_labels):
    trait_cols = sorted(
        col for col in df.columns
        if col.startswith(trait) and '_E' not in col
    )
    df[label] = df[trait_cols].sum(axis=1)

df[trait_labels].tail()

#### Now we can plot each trait's distribution.

In [None]:
# 3x2 grid of histograms, one panel per trait, filled row by row.
fig, axs = plt.subplots(ncols=2, nrows=3, figsize=(18,9))
plt.subplots_adjust(left=None, bottom=None, right=None, top=1.3, wspace=None, hspace=None)

# `axs.flat` walks the grid in row-major order, which replaces the manual
# row/column bookkeeping. A plain Axes.hist call is used because
# seaborn.distplot is deprecated and missing from recent seaborn releases;
# alpha=0.4 mirrors the translucent bars distplot drew by default.
for panel, trait in zip(axs.flat, trait_labels):
    panel.hist(df[trait], bins=40, alpha=0.4)
    panel.set_xlabel('')
    panel.set_title(trait, pad=10)

# Five traits on six panels: remove the unused last one.
fig.delaxes(axs[2][1])

#### We can also see how each trait correlates with another.

In [None]:
# Correlation matrix of the five trait totals, rounded to three decimals.
trait_corr = df[trait_labels].corr()

# Any value equal to 1 (the diagonal) is replaced by an empty string.
df_corr = trait_corr.round(3).replace(1, '')

# Blank the diagonal and everything to its right so that each pair of traits
# is shown only once (lower triangle).
for pos in range(len(trait_labels)):
    df_corr.iloc[pos, pos:] = ''

# After blanking, the first row and the last column are empty; leave them out.
df_corr.iloc[1:, :-1]

#### Pairplots are useful to visualize correlations and distributions, too.
#### However, they are computationally expensive, so we must use a sample of the data.

In [None]:
# Pair plot of the trait totals on a random sample of rows.
# The sample size is capped at the number of available rows, because
# DataFrame.sample raises when asked for more rows than exist, and the seed is
# fixed so the figure is reproducible between runs.
sample_size = min(10_000, len(df))
trait_sample = df[trait_labels].sample(sample_size, random_state=0)

pp = sns.pairplot(trait_sample, diag_kind='kde', kind='reg', markers='', corner=True)
pp.fig.set_size_inches(10,10)

#### We can map each question's distribution as well.
#### The red color represents "strongly disagree" and green represents "strongly agree".
#### Since we remapped the questions, the green color means the response contributes points towards that trait.

In [None]:
plt.style.use('default')

# 2x3 grid of stacked bar charts, one panel per trait, filled row by row.
fig, axs = plt.subplots(ncols=3, nrows=2, figsize=(18,3))
plt.subplots_adjust(left=None, bottom=None, right=None, top=3.5, wspace=None, hspace=None)

# One colour per answer value, from red through grey to green.
colors = ['#de425b', '#ec9c9d', '#f1f1f1', '#9fc08f', '#488f31']

for panel, trait, label in zip(axs.flat, traits, trait_labels):
    # Question columns of this trait in numeric order (…1, …2, …, …10).
    matching = [name for name in cols if trait in name and '_E' not in name]
    trait_cols = sorted(matching, key=lambda name: int(name[len(trait):]))

    # Rows: questions; columns: answer values; cells: number of responses.
    answer_counts = df[trait_cols].apply(lambda answers: answers.value_counts()).T

    g = answer_counts.plot(kind='bar', stacked=True, ax=panel, color=colors)
    g.set(yticklabels=[], title = label + ' Responses')
    g.legend_.remove()

# Five traits on six panels: remove the unused last one.
fig.delaxes(axs[1][2])

#### We can plot the median values of each personality trait for all countries with more than 1,000 completed quizzes.

In [None]:
# Country codes with more than 1,000 responses.
top_countries = countries_rank.loc[countries_rank['count'] > 1000, 'iso_a3']

# Trait totals plus the country, with the country relabelled through
# `country_dict` so it can be compared against `top_countries`.
country_medians = df[trait_labels + ['country']].copy()
country_medians['country'] = country_medians['country'].replace(country_dict)

# Keep only the well-represented countries and take the per-country median of
# every trait.
in_top = country_medians['country'].isin(top_countries)
country_medians = country_medians.loc[in_top].groupby('country').median()

# Smallest and largest median over all traits; used as a shared colour range
# when the maps are drawn.
min_val = min(country_medians.min())
max_val = max(country_medians.max())

# Turn the country index back into a column named 'iso_a3'.
country_medians = country_medians.reset_index()
country_medians = country_medians.rename(columns={'country': 'iso_a3'})

In [None]:
# Attach the per-country medians to the country shapes (left join keeps every
# country, leaving NaN where no median is available).
world_map_medians = pd.merge(world, right=country_medians, how='left', on='iso_a3')

# Fill the gaps once instead of on every loop iteration.
# NOTE: countries without data are drawn with the colour of a median of 0.
world_map_filled = world_map_medians.fillna(0)

# One map per trait, all sharing the colour range [min_val, max_val].
for trait in trait_labels:
    fig, ax = plt.subplots(figsize=(10,5))
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_title(f'\n\nMedian Values for {trait}', size=16)
    world_map_filled.plot(column=trait, cmap='Blues', linewidth=0.8, ax=ax, edgecolor='0.8', vmin=min_val, vmax=max_val)
    sns.set()  # restore the default seaborn theme
    plt.box(on=None)

## Question mappings

#### "P" means the prompt is positively phrased whereas "N" means it is negatively phrased.
#### This is why we had to remap the questions above.

|Label|Phrased|Prompt|
|:---|:----|:----|
|EXT1|P|I am the life of the party.|
|EXT2|N|I don't talk a lot.|
|EXT3|P|I feel comfortable around people.|
|EXT4|N|I keep in the background.|
|EXT5|P|I start conversations.|
|EXT6|N|I have little to say.|
|EXT7|P|I talk to a lot of different people at parties.|
|EXT8|N|I don't like to draw attention to myself.|
|EXT9|P|I don't mind being the center of attention.|
|EXT10|N|I am quiet around strangers.|
|EST1|P|I get stressed out easily.|
|EST2|N|I am relaxed most of the time.|
|EST3|P|I worry about things.|
|EST4|N|I seldom feel blue.|
|EST5|P|I am easily disturbed.|
|EST6|P|I get upset easily.|
|EST7|P|I change my mood a lot.|
|EST8|P|I have frequent mood swings.|
|EST9|P|I get irritated easily.|
|EST10|P|I often feel blue.|
|AGR1|N|I feel little concern for others.|
|AGR2|P|I am interested in people.|
|AGR3|N|I insult people.|
|AGR4|P|I sympathize with others' feelings.|
|AGR5|N|I am not interested in other people's problems.|
|AGR6|P|I have a soft heart.|
|AGR7|N|I am not really interested in others.|
|AGR8|P|I take time out for others.|
|AGR9|P|I feel others' emotions.|
|AGR10|P|I make people feel at ease.|
|CSN1|P|I am always prepared.|
|CSN2|N|I leave my belongings around.|
|CSN3|P|I pay attention to details.|
|CSN4|N|I make a mess of things.|
|CSN5|P|I get chores done right away.|
|CSN6|N|I often forget to put things back in their proper place.|
|CSN7|P|I like order.|
|CSN8|N|I shirk my duties.|
|CSN9|P|I follow a schedule.|
|CSN10|P|I am exacting in my work.|
|OPN1|P|I have a rich vocabulary.|
|OPN2|N|I have difficulty understanding abstract ideas.|
|OPN3|P|I have a vivid imagination.|
|OPN4|N|I am not interested in abstract ideas.|
|OPN5|P|I have excellent ideas.|
|OPN6|N|I do not have a good imagination.|
|OPN7|P|I am quick to understand things.|
|OPN8|P|I use difficult words.|
|OPN9|P|I spend time reflecting on things.|
|OPN10|P|I am full of ideas.|