# GeneMapper2Familias

## Importing packages

In [None]:
import os
import glob
import pandas as pd
import numpy as np
# Raise pandas' display limits to 999 columns and 999 rows so that DataFrames
# shown in the notebook are not truncated until they exceed those sizes.
pd.set_option('display.max_columns', 999)
pd.set_option('display.max_rows', 999)

## Uploading data

In [None]:
def upload_genemapper(input_path):
    """
    Loads and structures data from GeneMapper text files into a DataFrame.

    Every '.txt' file directly inside `input_path` is read line by line. A line
    is accepted when it holds exactly four tab-separated fields (extra empty
    fields produced by trailing tabs are ignored); any other non-blank line is
    reported and skipped. Files that cannot be read are reported and skipped.

    Parameters:
    - input_path (str): Path to the input folder containing the text files.

    Returns:
    - pd.DataFrame: Structured DataFrame with columns ['Sample Name', 'Marker', 'Allele 1', 'Allele 2'].
      All four columns are always present, even when no valid row was found
      (the result is then empty). Values are whitespace-stripped strings; blank
      fields become NaN, 'AM' markers are renamed to 'AMEL', and a blank
      'Allele 2' is filled with the value of 'Allele 1'.

    Raises:
    - OSError: If `input_path` itself cannot be listed (e.g. it does not exist).
    """
    columns = ["Sample Name", "Marker", "Allele 1", "Allele 2"]
    rows = []

    # Reading files (sorted so the row order does not depend on the file system)
    for file in sorted(os.listdir(input_path)):
        if not file.endswith('.txt'):
            continue
        txt_file_path = os.path.join(input_path, file)

        try:
            with open(txt_file_path, "r") as infile:
                # Strip only the line terminator: a trailing tab marks an empty
                # 'Allele 2' field and must survive until the split below.
                lines = [line.rstrip("\r\n") for line in infile]
        except (OSError, UnicodeError) as e:
            print(f"Error processing the file {file}: {e}")
            continue  # Continue processing the next file if there's an error

        # Processing each line in the file
        for line in lines:
            if not line.strip():
                continue  # Blank lines carry no data

            elements = line.split('\t')

            # Ignore empty fields created by trailing tabs beyond the 4th column
            while len(elements) > 4 and elements[-1].strip() == "":
                elements.pop()

            # Ensure that the line contains exactly four elements
            if len(elements) == 4:
                rows.append(elements)
            else:
                print(f"Skipping malformed line in file {file}: {line}")

    # Structuring the DataFrame once, after every file has been read.
    # Passing `columns` keeps the schema stable even when `rows` is empty.
    df = pd.DataFrame(rows, columns=columns)

    # Strip leading/trailing whitespace (every column holds strings at this point)
    for column in columns:
        df[column] = df[column].str.strip()

    # Turn blank fields into real NaN so the dropna/fillna steps below can see them
    df = df.mask(df == "")

    # Drop rows that are completely empty (all values are NaN)
    df = df.dropna(how="all")

    # Drop rows that cannot be placed in the table (no sample name or no marker)
    df = df.dropna(subset=["Sample Name", "Marker"])

    # Remove repeated header rows (each file contributes its own header line)
    df = df[df["Sample Name"] != "Sample Name"].copy()

    # Replace occurrences of 'AM' with 'AMEL' in the 'Marker' column for standardization
    df.loc[df["Marker"] == "AM", "Marker"] = "AMEL"

    # Fill missing values in the 'Allele 2' column with the values from 'Allele 1' if 'Allele 2' is NaN
    df["Allele 2"] = df["Allele 2"].fillna(df["Allele 1"])

    return df

# Load the GeneMapper exports from the "Input" folder located in the current
# working directory; `df` holds the result in vertical (one row per marker) form.
input_path = os.path.join(os.getcwd(), "Input")
df = upload_genemapper(input_path)

## Data Structure

In [None]:
def convert_to_horizontal_format(df):
    """
    Converts a DataFrame from vertical GeneMapper format to horizontal format.

    Parameters:
    - df (pd.DataFrame): Input DataFrame in vertical format with exactly four columns,
      in the order ['Sample Name', 'Marker', 'Allele 1', 'Allele 2']. It is not modified.

    Returns:
    - pd.DataFrame: One row per sample, sorted by 'Sample id'. The first column is
      'Sample id'; the remaining columns are '<Marker> 1' and '<Marker> 2' for every
      marker, ordered alphabetically by marker. Markers a sample lacks are NaN.

    Raises:
    - ValueError: If the input does not have four columns, or if the same
      (sample, marker) pair occurs more than once (raised by `unstack`).
    """
    # Copy and rename relevant columns
    df_h = df.copy()
    df_h.columns = ["Sample id", "Marker", "Allele 1", "Allele 2"]

    # Pivot markers into columns; rows are indexed (and sorted) by 'Sample id'
    wide = df_h.set_index(["Sample id", "Marker"]).unstack("Marker").sort_index()

    # Put the marker above the allele number and group each marker's two
    # alleles next to each other. `axis` is passed by keyword because
    # `sort_index` no longer accepts it positionally.
    wide = wide.swaplevel(0, 1, axis=1).sort_index(axis=1)

    # Rename 'Allele 1' and 'Allele 2' to '1' and '2'
    wide = wide.rename(columns={"Allele 1": "1", "Allele 2": "2"}, level=1)

    # Flatten the column MultiIndex into '<Marker> <allele number>'
    wide.columns = [' '.join(col) for col in wide.columns]

    # Move 'Sample id' back from the index into a regular column
    return wide.reset_index()

# Reshape the vertical table `df` into one row per sample; the result is kept
# in `df_horizontal`.
df_horizontal = convert_to_horizontal_format(df)

## Pedigree structure

In [None]:
# The sample name is split on "-": the first piece becomes 'Family id', and the
# second and third pieces are re-joined with "-" as the new 'Sample id'.
# 'Family id' must be derived first, because the next statement overwrites the
# column it is read from.
df_horizontal["Family id"] = df_horizontal["Sample id"].map(lambda name: name.split("-")[0])
df_horizontal["Sample id"] = df_horizontal["Sample id"].map(
    lambda name: "-".join((name.split("-")[1], name.split("-")[2]))
)

## Adding the basic pedigree structure

In [None]:
# Members every family must contain, written as '<pedigree number>-<relationship code>'.
relationship = ["1-PGF", "2-PGM", "3-MGF", "4-MGM", "5-FATHER", "6-MOTHER"]
# Second AMEL allele assigned to each of those members ('Y' or 'X').
xy = dict(zip(relationship, ["Y", "X", "Y", "X", "Y", "X"]))

def add_basic_pedigree_structure(df):
    """
    Ensures that each family in the DataFrame includes all members of the basic pedigree structure,
    and fills missing values in the 'AMEL 2' column based on the predefined 'xy' mapping.

    Parameters:
    - df (pd.DataFrame): Input DataFrame with columns 'Family id' and 'Sample id'.
      It is not modified; a new DataFrame is returned.

    Returns:
    - pd.DataFrame: Copy of the input with a fresh 0..n-1 index, followed by one placeholder
      row per missing member (grouped by family in order of first appearance). The 'AMEL 1'
      and 'AMEL 2' columns are always present in the result.

    Notes:
    - The basic pedigree structure is defined by the 'relationship' list, which contains IDs for:
      ["1-PGF", "2-PGM", "3-MGF", "4-MGM", "5-FATHER", "6-MOTHER"].
    - Placeholder rows carry 'AMEL 1' = 'X' and 'AMEL 2' from 'xy'; every other column is NaN.
    - Existing rows with a missing 'AMEL 2' receive the 'xy' value for their 'Sample id';
      IDs that are not in 'xy' stay NaN.
    """
    placeholder_frames = []

    for family in df["Family id"].unique():
        # Identify existing and missing members in the family
        existing_ids = set(df.loc[df["Family id"] == family, "Sample id"])
        missing_ids = [member for member in relationship if member not in existing_ids]

        # A complete family needs no placeholders; skipping it also avoids
        # concatenating an empty frame.
        if not missing_ids:
            continue

        placeholder_frames.append(pd.DataFrame({
            "Family id": [family] * len(missing_ids),
            "Sample id": missing_ids,
            "AMEL 1": ["X"] * len(missing_ids),  # Placeholder for missing values in 'AMEL 1'
            "AMEL 2": [xy[member] for member in missing_ids]  # Fill 'AMEL 2' based on gender mapping
        }))

    # Single concatenation instead of one per family; it always yields a new
    # object, so the caller's DataFrame is never mutated below.
    df = pd.concat([df, *placeholder_frames], ignore_index=True)

    # Guarantee the AMEL columns exist even when the input had no AMEL marker
    # and no placeholder row was added.
    if "AMEL 1" not in df.columns:
        df["AMEL 1"] = np.nan

    # Fill missing values in 'AMEL 2' for existing entries using the 'xy' mapping
    mapped_amel2 = df["Sample id"].map(xy)
    if "AMEL 2" in df.columns:
        df["AMEL 2"] = df["AMEL 2"].fillna(mapped_amel2)
    else:
        df["AMEL 2"] = mapped_amel2

    return df

# Complete every family in `df_horizontal` with the basic pedigree members;
# the result is kept in `updated_df`.
updated_df = add_basic_pedigree_structure(df_horizontal)

## Adding [Relation] column

In [None]:
# Define function to map relationship codes to descriptive labels
def relation(sample_id):
    """
    Translates the relationship code at the end of a 'Sample id' into its bracketed label.

    Parameters:
    - sample_id (str): Identifier whose text after the last '-' is the relationship
      code (the whole string is used when it contains no '-').

    Returns:
    - str: The label for the code, e.g. "[Paternal grandfather]" for "PGF" or
      "[Mother]" for "MOTHER"; NaN when the code is not in the table below.
      Both "SON" and "DAUGHTER" map to "[Child]".
    """
    # Relationship code -> label (relationships are relative to the Missing Person)
    labels = {
        "PGF": "[Paternal grandfather]",
        "PGM": "[Paternal grandmother]",
        "MGF": "[Maternal grandfather]",
        "MGM": "[Maternal grandmother]",
        "FATHER": "[Father]",
        "MOTHER": "[Mother]",
        "BROTHER": "[Brother]",
        "SISTER": "[Sister]",
        "MHB": "[Maternal half brother]",
        "PHB": "[Paternal half brother]",
        "MHS": "[Maternal half sister]",
        "PHS": "[Paternal half sister]",
        "PU": "[Uncle]",
        "PA": "[Aunt]",
        "SON": "[Child]",
        "DAUGHTER": "[Child]",
    }

    # Everything after the last '-' is the code
    code = sample_id.rpartition("-")[2]

    if code in labels:
        return labels[code]
    return np.nan

# Derive the "[Relation]" label of every row from its 'Sample id'.
updated_df["[Relation]"] = updated_df["Sample id"].apply(relation)

## Reordering

In [None]:
def sort_members(df):
    """
    Sorts the DataFrame by family relationships, giving precedence to the paternal branch.

    Parameters:
    - df (pd.DataFrame): Input DataFrame containing 'Family id' and 'Sample id' columns.

    Modifies:
    - `df` is sorted in place. Three helper columns ('Family Order', 'Relationship ID',
      'Pedigree ID') are added for the sort and removed again before returning, so the
      set of columns is unchanged. The index is not reset.

    Returns:
    - pd.DataFrame: The same object as `df`, ordered by
      1. the first run of digits in 'Family id' (ids without digits go last),
      2. 'Family id' itself, so families sharing a number are not interleaved,
      3. the relationship code after the last '-' of 'Sample id', in the order
         FATHER, PGF, PGM, MOTHER, MGF, MGM (any other code goes last),
      4. the number before the first '-' of 'Sample id' (non-numeric counts as 0).
    """
    # Define prioritization order for family relationships, with a preference for the paternal branch
    order_relationship = ["FATHER", "PGF", "PGM", "MOTHER", "MGF", "MGM"]
    helper_columns = ["Family Order", "Relationship ID", "Pedigree ID"]

    sample_parts = df["Sample id"].str.split("-")

    # Relationship code as an ordered categorical; codes outside the list become NaN
    df["Relationship ID"] = pd.Categorical(sample_parts.str[-1],
                                           categories=order_relationship,
                                           ordered=True)

    # Extract numeric order from 'Sample id' to assist in sorting within families
    df["Pedigree ID"] = pd.to_numeric(sample_parts.str[0], errors='coerce').fillna(0)

    # Numeric part of 'Family id'. Kept as float so that an id without digits
    # becomes NaN (sorted last) instead of failing an integer cast.
    df["Family Order"] = pd.to_numeric(
        df["Family id"].str.extract(r'(\d+)', expand=False), errors='coerce'
    )

    # 'Family id' is the tie-breaker that keeps each family contiguous
    df.sort_values(["Family Order", "Family id", "Relationship ID", "Pedigree ID"], inplace=True)

    # Drop the helper columns
    df.drop(columns=helper_columns, inplace=True)

    return df

# Order the rows family by family. `sort_members` sorts `updated_df` in place
# and returns it, so `df_sorted` refers to the same DataFrame.
df_sorted = sort_members(updated_df)

## Exporting

In [None]:
def export_family_pedigree(df):
    """
    Writes the DataFrame to a tab-separated .txt file with the identifying columns first.

    Parameters:
    - df (pd.DataFrame): DataFrame to export, containing 'Family id', '[Relation]', 'Sample id', 'AMEL 1', 'AMEL 2' columns.
      Those five columns are written first, in that order; all remaining columns follow
      in their existing order.

    File Output:
    - 'Output/Family Pedigree - [Relation].txt', relative to the current working directory.
      The 'Output' directory is created when missing. The file name is fixed, so an
      earlier export is overwritten. 'Family id' is written as the index column.
    """
    output_dir = "Output"

    # Make sure the destination folder is there before anything else happens
    os.makedirs(output_dir, exist_ok=True)

    # Identifying columns lead; every other column keeps its relative position
    leading = ['Family id', '[Relation]', 'Sample id', 'AMEL 1', 'AMEL 2']
    trailing = [name for name in df.columns if name not in leading]
    ordered = df[leading + trailing]

    file_name = "Family Pedigree - [Relation].txt"
    destination = os.path.join(output_dir, file_name)

    # Tab-separated, with 'Family id' as the first (index) column
    ordered.set_index("Family id").to_csv(destination, sep="\t")
    print(f"File exported successfully to 'Output/{file_name}'")

# Write the sorted pedigree table to the 'Output' folder.
export_family_pedigree(df_sorted)