In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# Any results you write to the current directory are saved as output.

In [None]:
import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from mpl_toolkits.mplot3d import Axes3D 

# Big Five Survey Data: Exploratory Analysis and Hypothesis Testing
> This notebook contains some basic EDA for the big five dataset.
In the course of the analysis, we notice some trends and perform statistical tests to formalize our hypothesis.
This is still a work-in-progress, and a lot more can be added.
I am making this public as there are many open questions and subject-matter based assumptions, and it would be great to get suggestions/insights from you on the work so far and what else can be done.

In [None]:
# Load the raw survey responses; the file is tab-separated despite its .csv extension.
df = pd.read_csv('../input/big-five-personality-test/IPIP-FFM-data-8Nov2018/data-final.csv', sep='\t')

# Items measured in the negative/opposite direction, grouped by trait prefix.
# NOTE(review): the original comment cites a source for this list, but the
# reference itself is missing -- add it.
reverse_keyed_items = {
    'EXT': (2, 4, 6, 8, 10),  # 5
    'EST': (2, 4),            # 2
    'AGR': (1, 3, 5, 7),      # 4
    'CSN': (2, 4, 6, 8),      # 4
    'OPN': (2, 4, 6),         # 3
}
negatives = [
    prefix + str(item)
    for prefix, items in reverse_keyed_items.items()
    for item in items
]

# Flip the 1-5 scale for those items (1<->5, 2<->4, 3 stays put).
# Any value outside 1-5 is left untouched by the mapping.
flipped_scale = {answer: 6 - answer for answer in range(1, 6)}
df[negatives] = df[negatives].replace(flipped_scale)

# Column names of the ten items belonging to each trait (e.g. EXT1 .. EXT10).
item_numbers = range(1, 11)
ext_cols = ['EXT' + str(i) for i in item_numbers]
est_cols = ['EST' + str(i) for i in item_numbers]
agr_cols = ['AGR' + str(i) for i in item_numbers]
csn_cols = ['CSN' + str(i) for i in item_numbers]
opn_cols = ['OPN' + str(i) for i in item_numbers]

traits = [ext_cols, est_cols, agr_cols, csn_cols, opn_cols]
trait_names = ['EXT', 'EST', 'AGR', 'CSN', 'OPN']

# Sum the item answers of each trait into one total per respondent.
# The frame is built in a single step instead of being grown with pd.concat
# inside a loop and having its columns renamed afterwards.
# NOTE(review): sum(axis=1) skips NaN, so a respondent with no answers at all
# gets a total of 0 for every trait -- confirm whether such rows should be
# dropped before the distribution/correlation analysis further down.
df_scores = pd.DataFrame(
    {name: df[cols].sum(axis=1) for name, cols in zip(trait_names, traits)},
    index=df.index,
)

# Keep the respondent's country next to the scores for the per-country analysis.
df_scores['country'] = df['country']

df_scores.head()

Create histograms to check the distribution of scores; we would expect them to be close to normal.

In [None]:


# One histogram per trait on a 2x3 grid; the sixth slot stays empty.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases
# (histplot is its successor) -- switch once the environment's version is confirmed.
hist_plots = plt.figure(figsize=(30,20))

for i, trait_name in enumerate(df_scores.columns[:5]):
    panel = hist_plots.add_subplot(2, 3, i + 1)
    panel.set_title(label=trait_name, fontsize=30)
    sns.distplot(df_scores[trait_name], axlabel=False, kde=False, ax=panel)

hist_plots.suptitle('Histograms for each trait', fontsize=30)
plt.show()

Distributions are more or less normal for all traits, as expected.
There are a few major outliers, especially at the lower end of the distributions, but with very low frequencies. Nothing too surprising, but they may need a closer look later.

We may also wish to check the distribution of mean scores by country.
This should also be generally close to normal.

In [None]:
# Average every trait total within each country.
scores_by_country = df_scores.groupby('country').mean()

# One histogram of the country-level means per trait, on a 2x3 grid
# (the sixth slot stays empty).
country_distplots = plt.figure(figsize=(30,20))
for i, colname in enumerate(scores_by_country.columns[:5]):
    panel = country_distplots.add_subplot(2, 3, i + 1)
    panel.set_title(label=colname, fontsize=30)
    sns.distplot(scores_by_country[colname], axlabel=False, kde=False, ax=panel)

# Overall title so the figure can be told apart from the respondent-level histograms.
country_distplots.suptitle('Histograms of country-level mean scores', fontsize=30)
plt.show()

There are some countries with very low / very high average scores.
This looks strange; I expect it is because we have very few observations from some of the countries, but I am leaving the confirmation as a //todo for now :)
The list of countries with max/min scores per trait can be checked below.

In [None]:
# For every trait, list the five countries with the lowest and the five with
# the highest average score. (The previous version printed the ascending head
# of the sort -- i.e. the minima -- under a "maximum" label.)
for trait in scores_by_country:
    print("countries with minimum average scores for ", trait)
    print(scores_by_country[trait].nsmallest(5))
    print("countries with maximum average scores for ", trait)
    print(scores_by_country[trait].nlargest(5))

We can now check for collinearity in the traits, and examine it visually using scatterplots.
I am not sure what to expect here; this would need input from a subject-matter expert (SME). Are there traits generally considered independent, or is there some correlation?
For example, are people who are less extroverted generally also less open, or something similar?

We can try to find any such pattern in this data.

In [None]:
from scipy.stats import pearsonr

# Pearson correlation (coefficient and p-value) between the EXT and EST totals.
r, p = pearsonr(df_scores['EXT'], df_scores['EST'])
print(r, p)

# Pairwise correlation matrix over the first five columns of df_scores.
trait_scores = df_scores.iloc[:, :5]
trait_scores.corr()





In [None]:
# Correlation values are quite low, but let's plot some scatter plots and see what they look like:

In [None]:
# 2-D view: AGR totals against EXT totals.
scatter_plots = plt.figure()
sns.scatterplot(x=df_scores.AGR, y=df_scores.EXT)
plt.title('AGR vs EXT')
plt.show()

# 3-D view of three trait totals at once.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(df_scores.EXT, df_scores.EST, df_scores.OPN)
# Label each axis with the trait actually plotted on it (these were
# left as the 'X Label' / 'Y Label' / 'Z Label' placeholders).
ax.set_xlabel('EXT')
ax.set_ylabel('EST')
ax.set_zlabel('OPN')
plt.show()

While numerical correlation values may be low, there is clearly something going on.
The values are more or less independent for higher scores, but there is a distinct cluster for lower values.
The data seems to suggest that people who score low on one trait, tend to score low on ALL traits! Any such relation breaks down as scores increase.
(I have displayed only one set of scatter plots for brevity, but they look similar for all combinations)
This does not sound too intuitive, but I'm not a psychologist!

There could be other unaccounted reasons for this, like how the participants perceive the questions, but for now we can ignore that and focus on analytics.

So let's perform some statistical tests to see if the effects we infer from the scatterplots are statistically significant in any way.

Since there is no correlation for larger values, it might be a good starting point to separate the traits into 'low', 'mid', and 'high' scores and perform ANOVA.
Arbitrarily, I choose z-scores of 2 and -2 as thresholds.

We will also plot the number of negative outliers per trait, to see if there is a particular trait with a very large number (though we can kind of see from the histograms that they are all close to the same distribution).

In [None]:
import numpy as np
#count outliers in each trait


# Standardise each of the first five columns of df_scores (population std, ddof=0).
trait_scores = df_scores.iloc[:, 0:5]
zscores = (trait_scores - trait_scores.mean()) / trait_scores.std(ddof=0)

# Flag respondents more than two standard deviations below the mean on a trait.
is_neg_outlier = (zscores < -2)

# Number of negative outliers per trait, as a two-column table.
# The per-column count is a Series, so assigning `.columns` to it does nothing
# and plot.bar ignores x=/y=; build a real DataFrame with the intended column
# names instead.
count_outliers = (
    is_neg_outlier.sum()
    .rename_axis('Trait')
    .reset_index(name='Negative_Outliers')
)
print(count_outliers)

# Bar chart of the number of negative outliers per trait.
count_outliers.plot.bar(x='Trait', y='Negative_Outliers')

# Attach the country so the outlier flags can be broken down by country.
is_neg_outlier = pd.concat([is_neg_outlier, df_scores.country], axis = 1)

In [None]:
# Prepare data for anova, lets start with categorizing EXT


# Bucket EXT z-scores: below -2 -> 'low', above 2 -> 'high', everything else -> 'mid'.
conditions = [zscores['EXT']<-2, zscores['EXT']>2]
choices = ['low', 'high']

# Give the category Series the same index as zscores so the concat below lines
# rows up by construction, rather than relying on zscores having a default
# 0..n-1 index.
ext_cat = pd.Series(
    np.select(conditions, choices, default='mid'),
    index=zscores.index,
    name='EXT_CAT',
)

# ANOVA input: the EXT category plus the z-scores of the other four traits.
ext_anova = pd.concat([ext_cat, zscores[['EST','AGR','CSN','OPN']]], axis=1)
ext_anova.head()


In [None]:
from statsmodels.formula.api import ols

# Regress the AGR z-score on the categorical EXT bucket and show the fit summary.
agr_by_ext_model = ols('AGR ~ C(EXT_CAT)', data=ext_anova)
results = agr_by_ext_model.fit()
results.summary()

Yes, quite a significant result.
Can also do Multivariate ANOVA as it seems the effect exists across traits (//todo)

Further tests... :

We can convert all traits to categoricals in the same way ...

In [None]:
# Bucket every trait's z-score into 'low' / 'mid' / 'high' for a later chi-square test.
# NOTE(review): the cut-offs here are +/-1.5, while the EXT-only categorisation
# used for the ANOVA uses +/-2 -- confirm the difference is intentional.
df_chisq = pd.DataFrame(index = zscores.index)
traits = ['EXT', 'EST', 'AGR', 'CSN', 'OPN']
choices = ['low', 'high']
for trait in traits:
    colname = trait + '_CAT'
    conditions = [zscores[trait]<-1.5, zscores[trait]>1.5]
    # Assign the array directly (positional). Wrapping it in a fresh Series
    # would align on a default 0..n-1 index and yield NaN whenever zscores is
    # indexed differently.
    df_chisq[colname] = np.select(conditions, choices, default = 'mid')
df_chisq.describe()

# Then perform chi square, also \\todo for now :(