In [8]:
import pandas as pd
import seaborn as sns
import os 
import sklearn as sk
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# load data
df_dict = {}

for file in os.listdir("./final_data"):
   if file.endswith(".tsv"):
        filename = os.path.splitext(os.path.basename(file))[0]
        temp_df = pd.read_csv(os.path.join('./final_data', file), delimiter='\t')
        df_dict[filename] = temp_df

cost_df = df_dict['cost']

# since coffins column is missing values, add the coffin column from coffins.tsv
coffins_df = df_dict['coffins']
merged_df = cost_df.merge(coffins_df, left_on="coffins", right_on="total_cost", how="left")
cost_df["coffins"] = cost_df["coffins"].fillna(merged_df["coffins"])

# some of the missing values have different keys - filling in manually
# found this value in coffins_df
cost_df.at[0, "coffins"] = 590.0
# cannot find k&m in coffins_df, in paper, it says that Kha and Merit are similar to Yuya and Tuya, so filling their coffin value wih 590
cost_df.at[2, "coffins"] = 590.0

# filling in missing jewelry values
jewel_df = df_dict['jewel']
# putting 0 for iabtina because appendix in paper says there is no jewelry
cost_df.at[25, "jewlery"] = 0
# cannot find data for NuMan, putting 85 which is the average of the rank above and rank below cost on jewelry
cost_df.at[75, "jewlery"] = 85
cost_df = cost_df.rename(columns = {"jewlery" : "jewelry"})

# filling in missing values for profess
proff_df = df_dict['proff']
# no professional equipment for Sat-Re, filling in 0
cost_df.at[29, "profess"] = 0

# filling in missing values for toiletries 
# no information about toiletries for Khay, filling in the average of the rank above and below
cost_df.at[5, "toiletries"] = 13.5
# drop some repeat tombs
cost_df = cost_df[~cost_df['tomb'].isin(['Yuya','Kha', 'Merit', "Tuya", "T&A"])]

print(cost_df.isnull().sum())
cost_df = cost_df.drop(["rank"], axis=1)

tomb           0
rank           0
grand_total    0
amphorae       0
bouquet        0
bowls          0
boxbas         0
coffins        0
fertility      0
funerary       0
furniture      0
jars           0
jewelry        0
personal       0
profess        0
shabti         0
toiletries     0
vessels        0
dtype: int64


In [None]:
def assign_manual_label(row):
    tut = ["Tutankhamen"]
    elite = ["Y&T", "K&M", "Mahirper"]
    high_mid = ["Nakht", "Khay", "Hatnofer", "Neferkhewet", "Hatiay", "Setau", "S&N", "Petrie"]
    mid = ["Tahuty", "Ahotep", "Harmose", "Mentuhotep", "Senmut"]
    

    if row["tomb"] in tut:
        return "Tutankhamen"
    elif row["tomb"] in elite:
        return "Elite"
    elif row["tomb"] in high_mid:
        return "High-Middle"
    elif row["tomb"] in mid:
        return "Middle"
    else:
        return "Low"

# assign status from paper 
cost_df["status"] = cost_df.apply(assign_manual_label, axis=1)

In [10]:
# create group dataframes
elite_df = cost_df[cost_df["status"] == "Elite"]
high_mid_df = cost_df[cost_df["status"] == "High-Middle"]
mid_df = cost_df[cost_df["status"] == "Middle"]
low_df = cost_df[cost_df["status"] == "Low"]

df_list = [elite_df, high_mid_df, mid_df, low_df]


In [15]:
# PERMANOVA
from scipy.spatial.distance import pdist, squareform
from skbio.stats.distance import DistanceMatrix, permanova, permdisp

# scale features
no_tut = cost_df[cost_df['tomb'] != "Tutankhamen"]
feature_matrix = no_tut.drop(['tomb', 'status'], axis=1)
feature_matrix = StandardScaler().fit_transform(feature_matrix)

# calculate distance matrix
distances = pdist(feature_matrix, metric='euclidean')
dist_matrix = DistanceMatrix(squareform(distances), ids=no_tut["tomb"])

# permanova calculation
id_df = no_tut.set_index("tomb").loc[dist_matrix.ids, ["status"]]
pn = permanova(dist_matrix, grouping=id_df, column="status")
print(pn)
print('\n\n')

# check variance of columns 
grouped = cost_df.groupby("status")

for name, group in grouped:
    numeric_data = group.select_dtypes(include="number")  # filters only numeric columns
    zero_var_cols = (numeric_data.var() == 0).sum()
    print(f"Group: {name}")
    print(zero_var_cols, "columns with zero variance\n")


method name               PERMANOVA
test statistic name        pseudo-F
sample size                     133
number of groups                  4
test statistic            18.735675
p-value                       0.001
number of permutations          999
Name: PERMANOVA results, dtype: object



Group: Elite
2 columns with zero variance

Group: High-Middle
1 columns with zero variance

Group: Low
0 columns with zero variance

Group: Middle
8 columns with zero variance

Group: Tutankhamen
0 columns with zero variance



In [23]:
%load_ext rpy2.ipython
import os
os.environ["R_HOME"] = "/Library/Frameworks/R.framework/Resources"

%load_ext rpy2.ipython

import pandas as pd
from rpy2.robjects import pandas2ri
pandas2ri.activate()

# Extract status column into a separate Series
group_vector = id_df["status"]

# Then send both to R
%R -i feature_matrix -i group_vector

ValueError: openrlib.R_HOME cannot be None.

In [20]:
# Now in R: run betadisper and test
%%R
library(vegan)

# Compute Bray-Curtis distances
dist_matrix <- vegdist(your_dataframe, method = "euclidean")

# Run PERMDISP (using centroids)
dispersion <- betadisper(dist_matrix, group_vector, type = "centroid")

# Test for differences in dispersion
permdisp_result <- anova(dispersion)
print(permdisp_result)


UsageError: Line magic function `%%R` not found.


In [13]:
import rpy2.robjects as robjects

robjects.r('''
    library(vegan)
    dist_matrix <- vegdist(feature_matrix, method = "euclidean")
    dispersion <- betadisper(dist_matrix, group_vector, type = "centroid")
    result <- anova(dispersion)
    print(result)
''')


Unable to determine R home: [Errno 2] No such file or directory: 'R'


ValueError: openrlib.R_HOME cannot be None.