In [2]:
import pandas as pd
import seaborn as sns
import os 
import sklearn as sk
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Read every .tsv file in ./final_data into a DataFrame, keyed by the file's
# name without its extension (e.g. "cost.tsv" -> df_dict["cost"]).
data_dir = "./final_data"
df_dict = {}

for entry in os.listdir(data_dir):
    if not entry.endswith(".tsv"):
        continue  # skip anything that is not a TSV table
    table_name, _ext = os.path.splitext(os.path.basename(entry))
    df_dict[table_name] = pd.read_csv(os.path.join(data_dir, entry), delimiter='\t')

# Set up the cost dataframe for k-means clustering.
# NOTE: cost_df is the same object as df_dict['cost'], so the in-place fills
# below (up to the rename) also modify the frame stored in df_dict.
cost_df = df_dict['cost']

# The coffins column is missing values; try to fill them from coffins.tsv.
# NOTE(review): this joins cost_df["coffins"] against coffins_df["total_cost"],
# i.e. on the very column that has the gaps, and then reads "coffins" back out
# of the merged frame. Whether this recovers any values depends on the
# coffins.tsv schema, which is not visible here -- confirm the intended join key
# (presumably the tomb name).
coffins_df = df_dict['coffins']
merged_df = cost_df.merge(coffins_df, left_on="coffins", right_on="total_cost", how="left")
cost_df["coffins"] = cost_df["coffins"].fillna(merged_df["coffins"])

# Some of the missing values have different keys -- filling them in manually.
# The row labels used with .at below assume the index as loaded from cost.tsv.
# This value was found in coffins_df.
cost_df.at[0, "coffins"] = 590.0
# K&M cannot be found in coffins_df. The paper says that Kha and Merit are
# similar to Yuya and Tuya, so their coffin value is filled with 590 as well.
cost_df.at[2, "coffins"] = 590.0

# Filling in missing jewelry values (the source column is spelled "jewlery").
jewel_df = df_dict['jewel']
# Putting 0 for Iabtina because the appendix in the paper says there is no
# jewelry (but the jewelry table in the paper lists miscellaneous items for
# her? not sure).
cost_df.at[25, "jewlery"] = 0
# Cannot find data for NuMan; putting 85, which is the average of the jewelry
# cost of the rank above and the rank below.
cost_df.at[75, "jewlery"] = 85
# Fix the column-name typo; rename returns a new frame, so cost_df is rebound.
cost_df = cost_df.rename(columns = {"jewlery" : "jewelry"})

# Filling in missing values for profess.
proff_df = df_dict['proff']
# No professional equipment for Sat-Re, filling in 0.
cost_df.at[29, "profess"] = 0

# Filling in missing values for toiletries.
# No information about toiletries for Khay; filling in the average of the rank
# above and the rank below.
cost_df.at[5, "toiletries"] = 13.5
# Drop some repeat tombs (presumably already covered by the combined Y&T / K&M
# rows -- confirm). The explicit .copy() makes the filtered result an
# independent frame, so adding columns to cost_df later does not raise
# SettingWithCopyWarning.
cost_df = cost_df[~cost_df['tomb'].isin(['Yuya','Kha', 'Merit', "Tuya"])].copy()


# Build the clustering input: every column except 'tomb' and 'rank',
# rescaled with StandardScaler.
numeric_cols = cost_df.drop(columns=['tomb', 'rank'])
feature_matrix = StandardScaler().fit_transform(numeric_cols)


In [None]:
def assign_manual_label(row):
    """Return the hand-assigned category label for a row's ``tomb`` value.

    Args:
        row: Any mapping-like object supporting ``row["tomb"]`` (for example
            a DataFrame row passed by ``DataFrame.apply(..., axis=1)``).

    Returns:
        One of "Tutankhamen", "Elite", "High-Middle" or "Middle" when the
        tomb name appears in the matching group below, otherwise "Low".
        Matching is exact and case-sensitive.
    """
    # Ordered (label, members) pairs; the first group containing the tomb wins.
    label_groups = (
        ("Tutankhamen", ("Tutankhamen",)),
        ("Elite", ("Y&T", "K&M", "Horemheb", "Ramose", "Mahirper")),
        (
            "High-Middle",
            ("Nakht", "Khay", "Hatnofer", "Nerferkhewet", "Hatiay", "Setau", "Sennofer"),
        ),
        ("Middle", ("Tahuty", "Ahotep", "Harmose", "Mentuhotep", "Senmut", "Petrie")),
    )

    tomb_name = row["tomb"]
    for label, members in label_groups:
        if tomb_name in members:
            return label
    # Anything not explicitly listed falls into the lowest category.
    return "Low"
    
# Add a "manual_label" column by calling assign_manual_label on each row
# (axis=1 passes rows, not columns, to the function).
cost_df["manual_label"] = cost_df.apply(assign_manual_label, axis=1)