In [9]:
# Import dependencies
import pandas as pd
import numpy as np
from random import sample
import matplotlib.pyplot as plt
import seaborn as sns

def create_subgroups(path, group_var, treatment_var, control_var):
    """Load a CSV file and split its rows into a control and a treatment frame.

    Parameters
    ----------
    path : str or file-like
        Anything accepted by ``pd.read_csv``.
    group_var : str
        Name of the column whose value says which group a row belongs to.
    treatment_var : scalar
        Value of ``group_var`` that marks a treatment row.
    control_var : scalar
        Value of ``group_var`` that marks a control row.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(group_control, group_treatment)``; each frame carries a fresh
        0..n-1 index. Rows whose ``group_var`` equals neither value are left
        out of both frames.
    """
    frame = pd.read_csv(path)
    membership = frame[group_var]

    def _rows_labelled(label):
        # Keep only the rows of one group and renumber them from zero.
        return frame[membership == label].reset_index(drop=True)

    group_control = _rows_labelled(control_var)
    group_treatment = _rows_labelled(treatment_var)
    return group_control, group_treatment

def match(control, treatment, matching_number, random_state=None, match_on=("Age", "Gender")):
    """Match every treatment subject to ``matching_number`` control subjects.

    Treatment subjects are visited in row order. For each one, the controls
    that have not been used yet and that agree with it on every column in
    ``match_on`` are the candidates; ``matching_number`` of them are drawn at
    random and removed from the pool so no control is matched twice. A
    treatment subject with fewer than ``matching_number`` remaining
    candidates is dropped.

    Parameters
    ----------
    control : pandas.DataFrame
        Control-group rows. Must contain every column in ``match_on``.
        The frame is not modified; any index is accepted.
    treatment : pandas.DataFrame
        Treatment-group rows. Must contain every column in ``match_on``.
    matching_number : int
        Number of controls per treatment subject (1:matching_number ratio).
    random_state : int, numpy.random.RandomState, numpy.random.Generator or None
        Seed or generator for the random draws. ``None`` (the default) keeps
        the previous behaviour of drawing from NumPy's global random state.
    match_on : str or sequence of str
        Columns that must be equal between a treatment subject and its
        controls. Defaults to ``("Age", "Gender")``.

    Returns
    -------
    pandas.DataFrame
        The matched control rows (grouped per treatment subject, in treatment
        order) followed by the matched treatment rows, with a fresh 0..n-1
        index. Empty (but with the input columns) when nothing matches.

    Raises
    ------
    ValueError
        If ``matching_number`` is smaller than 1.
    """
    if matching_number < 1:
        raise ValueError("matching_number must be at least 1")

    if isinstance(match_on, str):
        match_on = [match_on]

    # Build one generator up front so successive draws differ from each other
    # while the whole run stays reproducible for a given integer seed.
    if isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)
    else:
        rng = random_state

    # Pool of still-unmatched controls. Resetting the index makes each label
    # equal to the row's position in `control`, so the drawn labels can be fed
    # to `.iloc` safely whatever index the caller's frame carries.
    pool = control.reset_index(drop=True)

    control_positions = []
    treatment_positions = []

    for position in range(len(treatment)):
        # Exact match on every requested column.
        is_candidate = pd.Series(True, index=pool.index, dtype=bool)
        for column in match_on:
            is_candidate = is_candidate & (pool[column] == treatment[column].iloc[position])
        candidates = pool[is_candidate]

        # Not enough unused controls left: this treatment subject is dropped.
        # (Having exactly `matching_number` candidates is sufficient.)
        if len(candidates) < matching_number:
            continue

        chosen = candidates.sample(n=matching_number, random_state=rng)
        control_positions.extend(chosen.index)
        treatment_positions.append(position)

        # Remove every drawn control, not just the first one, so none of them
        # can be handed to a later treatment subject.
        pool = pool.drop(index=chosen.index)

    control_matched = control.iloc[control_positions]
    treatment_matched = treatment.iloc[treatment_positions]

    # Controls first, then treatments, renumbered from zero.
    matched_df = pd.concat([control_matched, treatment_matched]).reset_index(drop=True)

    return matched_df

In [None]:
# Split the CSV into its control and treatment cohorts.
# Replace each placeholder string with the real file path, the name of the
# grouping column, and the labels used for the two groups in that column.
control, treatment = create_subgroups(
    path="path_to_your_csv_file",
    group_var="grouping_variable_column_name",
    treatment_var="your_treatment_group_name",
    control_var="your_control_group_name",
)

# matching_number=1 requests a 1:1 treatment:control ratio.
matched_df = match(control, treatment, matching_number=1)