In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

In [None]:
# Load the tab-separated responses, keep only rows whose IPC value is 1
# (cleaning repeaters: same IP), then discard every row with a missing value.
df = (
    pd.read_csv('../input/big-five-personality-test/IPIP-FFM-data-8Nov2018/data-final.csv', sep='\t')
    .loc[lambda frame: frame['IPC'] == 1]
    .dropna()
)

In [None]:
# Item numbers, per trait prefix, that make up `pos_questions`
# (the items recoded in the forward direction by the later recoding step).
_pos_item_numbers = {
    'EXT': (1, 3, 5, 7, 9),
    'EST': (1, 3, 5, 6, 7, 8, 9, 10),
    'AGR': (2, 4, 6, 8, 9, 10),
    'CSN': (1, 3, 5, 7, 9, 10),
    'OPN': (1, 3, 5, 7, 8, 9, 10),
}
# Item numbers, per trait prefix, that make up `neg_questions`
# (the items recoded in the reverse direction by the later recoding step).
_neg_item_numbers = {
    'EXT': (2, 4, 6, 8, 10),
    'EST': (2, 4),
    'AGR': (1, 3, 5, 7),
    'CSN': (2, 4, 6, 8),
    'OPN': (2, 4, 6),
}

# Column names are the trait prefix followed by the item number, e.g. 'EXT1'.
pos_questions = [
    f'{prefix}{number}'
    for prefix, numbers in _pos_item_numbers.items()
    for number in numbers
]
neg_questions = [
    f'{prefix}{number}'
    for prefix, numbers in _neg_item_numbers.items()
    for number in numbers
]

In [None]:
# Two-step positional trim: first discard the columns at positions 50..106 of
# the loaded frame, then keep only the leading 51 columns of what remains.
middle_labels = df.columns[50:107]
data = df.drop(columns=middle_labels)
trailing_labels = data.columns[51:]
data = data.drop(columns=trailing_labels)
data.head()

In [None]:
# Recode answers 1..5 onto -2..2: forward for `pos_questions`, mirrored for
# `neg_questions`. Values that are not keys of the mapping are left untouched.
# NOTE: the output values overlap the input keys, so running this cell a second
# time would recode the already-recoded values again.
recoding_plan = (
    (pos_questions, {1:-2, 2:-1, 3:0, 4:1, 5:2}),
    (neg_questions, {1:2, 2:1, 3:0, 4:-1, 5:-2}),
)
for item_columns, answer_map in recoding_plan:
    data[item_columns] = data[item_columns].replace(answer_map)
data.head()

In [None]:
# Per-respondent trait score: the sum of a trait's recoded item columns divided
# by 10 (each trait prefix has ten items across pos_questions/neg_questions).
traits = ['EXT', 'EST', 'AGR', 'CSN', 'OPN']
for trait in traits:
    # Select only the numbered item columns ('EXT1' ... 'EXT10'). A plain
    # substring test (`trait in col`) would also match the score column written
    # below, so re-running the cell would add the previous score into the sum.
    # Requiring a purely numeric suffix also excludes any '<item>_E' columns.
    cols = [
        col for col in data.columns
        if col.startswith(trait) and col[len(trait):].isdigit()
    ]
    data[trait] = data[cols].sum(axis=1) / 10
data[traits].head()

In [None]:
# Keep only the country and the five trait scores. Selecting by name instead of
# dropping "the first 50 columns" by position gives the same frame on the first
# run (the frame holds the 50 item columns, then 'country', then the traits) and
# stays correct if the cell is executed again, where a positional drop would
# remove 'country' and the trait columns themselves.
data = data[['country'] + traits]
data.head()

In [None]:
# Number of participants per country, as a one-column frame indexed by country.
# The series and its axis are renamed explicitly because the name pandas gives
# the result of value_counts() depends on the pandas version: older releases
# reuse the source column name ('country'), pandas >= 2.0 calls it 'count' and
# moves 'country' to the index name. Without the rename, the 'country' column
# lookups below raise KeyError on pandas >= 2.0.
countries = (
    data['country']
    .value_counts()
    .rename('country')
    .rename_axis(None)
    .to_frame()
)

# Restrict to countries with at least 2500 participants.
countries_2500 = countries[countries['country'] >= 2500]
# The frame has a single column, so .size equals the number of countries kept.
print(countries_2500.size)

plt.figure(figsize=(15,5))
sns.barplot(data=countries_2500, x=countries_2500.index, y='country')
plt.title('Countries with 2500 or More Participants')
plt.xlabel('Country')
plt.ylabel('Participants');

In [None]:
# Show the participant count for every country that passed the >= 2500 filter.
print(countries_2500)

In [None]:
# Mean score of each trait for every country in countries_2500.
# A single groupby replaces the row-by-row construction: it avoids one full
# boolean scan of `data` per (country, trait) pair and yields float columns
# instead of the object-dtype columns produced by filling an empty frame.
means_traits = (
    data.groupby('country')[traits]
    .mean()
    .loc[countries_2500.index]   # keep only the selected countries, in the same order
    .rename_axis(None)           # index stays unnamed so it cannot clash with the 'country' column
)
# Expose the country code as a regular column too; the plots below use x='country'.
means_traits['country'] = means_traits.index
print(means_traits)

In [None]:
# One figure per trait: mean trait score by country, with the bars ordered from
# the lowest to the highest mean (sort_values sorts ascending by default).
# A single loop replaces five copies of the same plotting stanza.
for trait in traits:
    plt.figure(figsize=(20,12))
    sns.barplot(x='country',
                y=trait,
                data=means_traits,
                order=means_traits.sort_values(trait)['country'])
    plt.title(f'Mean {trait} score by country')

In [None]:
# One bar chart per country, showing that country's numeric trait means.
# Iterate over the actual number of rows instead of a hard-coded 32, so the loop
# stays correct when the participant threshold or the data changes (a position
# past the end would yield an empty slice with nothing to plot).
for i in range(len(means_traits)):
    # A one-row slice (rather than a scalar row lookup) keeps a DataFrame, so the
    # country label appears on the x axis and each trait gets its own bar.
    means_traits.iloc[i:i + 1].plot.bar()