In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
# Let pandas render up to 110 columns before it starts eliding them.
pd.set_option('display.max_columns', 110)

# Print the full path of every file found below the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
# Shell escape: dump the dataset's codebook.txt into the cell output.
!cat /kaggle/input/machivallianism-test/MACH_data/codebook.txt

In [None]:
# Read the tab-separated data file and keep only the rows whose
# "country" value is exactly "US". Row labels from the file are retained.
df = pd.read_csv(
    "/kaggle/input/machivallianism-test/MACH_data/data.csv", sep="\t"
).loc[lambda frame: frame["country"] == "US"]

In [None]:
# Show the first rows of `df` as a quick sanity check of the load.
df.head()

In [None]:
# (rows, columns) of `df`.
df.shape

In [None]:
# Partition the columns by name so each group can be analysed on its own.
# qcols: names that start with "Q" AND end with "A".
qcols = [x for x in df.columns if x.startswith("Q") and x.endswith("A")]
# vcols / tcols: names that start with "V" / "T".
vcols = [x for x in df.columns if x.startswith("V")]
tcols = [x for x in df.columns if x.startswith("T")]
# ocols: everything whose first character is not Q, V or T. Q-prefixed columns
# that do not end in "A" are therefore in none of the four groups.
ocols = [x for x in df.columns if x[0] not in {"Q", "V", "T"}]
# Group sizes in the order q, v, t, o (the second entry previously repeated
# len(qcols), so the size of vcols was never shown).
len(qcols), len(vcols), len(tcols), len(ocols)

In [None]:
# Summary statistics for the columns of `df` that describe() covers by default.
df.describe()

In [None]:
# Pairwise correlations among the qcols columns; the colour scale is pinned to
# [-1, 1] so this plot is comparable with the other correlation heatmaps.
sns.heatmap(df[qcols].corr(), cmap=sns.diverging_palette(220, 20, as_cmap=True), vmin=-1, vmax=1)

In [None]:
# Pairwise correlations among the vcols columns, drawn with a diverging
# palette on a fixed [-1, 1] colour scale.
sns.heatmap(df[vcols].corr(), cmap=sns.diverging_palette(220, 20, as_cmap=True), vmin=-1, vmax=1)

In [None]:
# Pairwise correlations among the tcols columns, drawn with a diverging
# palette on a fixed [-1, 1] colour scale.
sns.heatmap(df[tcols].corr(), cmap=sns.diverging_palette(220, 20, as_cmap=True), vmin=-1, vmax=1)

In [None]:
# Correlations across all three column groups at once; a larger square figure
# is opened first because the combined matrix has many more cells.
plt.figure(figsize=(10,10))
sns.heatmap(df[qcols+vcols+tcols].corr(), cmap=sns.diverging_palette(220, 20, as_cmap=True), vmin=-1, vmax=1)

In [None]:
from sklearn.decomposition import FactorAnalysis

In [None]:
# Number of latent factors to extract.
n_comp = 2

# The model is fitted on the q, v and t column groups together.
item_cols = qcols + vcols + tcols
item_frame = df[item_cols]

famodel = FactorAnalysis(n_components=n_comp)
famodel.fit(item_frame)

# Factor scores: one row per row of `df`, one column per factor.
data_fa3 = famodel.transform(item_frame)
df_fa = pd.DataFrame(data_fa3)

# Original item values and factor scores side by side. The factor columns are
# named with the integers 0 .. n_comp-1, and the frame gets a fresh positional
# index (it is built from plain arrays, so the row labels of `df` are not kept).
df_con = pd.DataFrame(
    np.column_stack([item_frame.to_numpy(), data_fa3]),
    columns=item_cols + list(range(n_comp)),
)

In [None]:
plt.figure(figsize=(20,3))
# One entry per factor: the correlation of that factor's scores with every
# input column.
corrs = []
for i in range(n_comp):
    corrs.append(df_con.corrwith(df_con[i])[qcols+vcols+tcols])
# Stack the per-factor Series into a frame so the column names travel with the
# data. With xticklabels=True seaborn labels every column itself, instead of
# names being attached by position afterwards (which mislabels or raises when
# the automatic tick thinning drops some ticks).
corr_table = pd.DataFrame(corrs)
g = sns.heatmap(
    corr_table,
    cmap=sns.diverging_palette(220, 20, as_cmap=True),
    vmin=-1,
    vmax=1,
    xticklabels=True,
)
g.tick_params(axis="x", labelrotation=90)
# Trailing None keeps the cell from echoing a return value.
None

In [None]:
# Top 10: the ten rows of df_con with the highest score on factor column 0
# (the factor the author labels "warm").
df_con.sort_values(by=0, ascending=False).head(10)

In [None]:
# Bottom 10: the ten rows of df_con with the lowest score on factor column 0
# (the factor the author labels "warm").
df_con.sort_values(by=0, ascending=True).head(10)

In [None]:
# Top 10: the ten rows of df_con with the highest score on factor column 1
# (the factor the author labels "I don't know normal words").
df_con.sort_values(by=1, ascending=False).head(10)

In [None]:
# Bottom 10: the ten rows of df_con with the lowest score on factor column 1
# (the factor the author labels "I don't know normal words").
df_con.sort_values(by=1, ascending=True).head(10)

In [None]:
# Copy the two factor-score columns onto `df` BY POSITION.
# `df` still carries the row labels it had before the country filter, while
# `df_con` was built from plain arrays and has a fresh 0..n-1 index. Assigning
# the Series directly would align on those labels, handing scores to the wrong
# rows and leaving NaN wherever a label has no match. Both frames have the
# same rows in the same order, so a positional copy is the correct mapping.
df[0] = df_con[0].to_numpy()
df[1] = df_con[1].to_numpy()

In [None]:
# Top 10 by "warm" factor: the ocols columns plus both factor scores for the
# ten rows with the highest value in factor column 0.
df[ocols+[0,1]].sort_values(by=0, ascending=False).head(10)

In [None]:
# Bottom 10 by "warm" factor: the ocols columns plus both factor scores for
# the ten rows with the lowest value in factor column 0.
df[ocols+[0,1]].sort_values(by=0, ascending=True).head(10)

In [None]:
def factor_rank(target_column, size=100):
    """Collect `target_column` for the extreme rows of each factor.

    Reads the module-level frame `df` (which must contain the factor-score
    columns 0 and 1) and the column list `ocols`; `target_column` has to be
    one of ``ocols + [0, 1]``.

    Parameters
    ----------
    target_column : column label
        Column whose values are collected.
    size : int, default 100
        Number of rows taken from each end of each factor ranking.

    Returns
    -------
    pandas.DataFrame
        Four columns, each holding the `target_column` values of up to
        `size` rows:
        "warm"       - highest values of factor 0,
        "cold"       - lowest values of factor 0,
        "don't know" - highest values of factor 1,
        "know"       - lowest values of factor 1.
        When fewer than `size` rows exist, the frame simply has that many
        rows (previously this case raised a length-mismatch ValueError).
    """
    colns = ["warm", "cold", "don't know", "know"]
    # Loop-invariant selection, done once instead of once per ranking.
    candidates = df[ocols + [0, 1]]
    # Same order as colns: factor 0 descending/ascending, then factor 1.
    orderings = [
        (factor, ascending)
        for factor in range(2)
        for ascending in (False, True)
    ]
    ranked_values = {}
    for label, (factor, ascending) in zip(colns, orderings):
        ranked = candidates.sort_values(by=factor, ascending=ascending)
        ranked_values[label] = ranked.iloc[:size][target_column].tolist()
    # Building the frame from the lists themselves lets its length follow the
    # data rather than a placeholder column of exactly `size` rows.
    return pd.DataFrame(ranked_values)

def print_factor_rank(val_con, target_column):
    """Print how often each distinct value occurs in each column of `val_con`.

    For every distinct value found anywhere in `val_con` (in the sorted order
    produced by ``np.unique``), a header ``[<target_column>=<value>]`` is
    printed, followed by one ``<column> <count>`` line for each column in
    which the value occurs at least once, followed by a blank line.

    Parameters
    ----------
    val_con : pandas.DataFrame
        Frame whose columns are the groups being compared.
    target_column : str
        Name used in the printed header only.
    """
    distinct_values = np.unique(val_con.to_numpy().tolist())
    for value in distinct_values:
        header_printed = False
        for group in val_con.columns:
            # Vectorised count instead of summing the Series element by element.
            count = (val_con[group] == value).sum()
            if count == 0:
                continue
            if not header_printed:
                # The header is emitted lazily, just before the first hit.
                print(f"[{target_column}={value}]")
                header_printed = True
            print(group, count)
        print()

In [None]:
# Compare the "education" values found at the extremes of each factor.
target_column = "education"
size = 100 # rows taken from each end of each factor ranking
print_factor_rank(factor_rank(target_column, size), target_column)

In [None]:
# Same comparison for the "major" column, with the group size scaled to the
# data: one fiftieth of the row count (integer division).
target_column = "major"
size = df.shape[0] // 50
print_factor_rank(factor_rank(target_column, size), target_column)