# **Datasets Management**

# Import Relevant Libraries

In [18]:
# Import standard libraries for file and directory operations
import os
import fnmatch
import shutil

# Import data manipulation libraries
import pandas as pd  # Pandas for handling data in tabular form
import numpy as np   # NumPy for numerical operations
from itertools import product
from functools import reduce
from scipy.stats import percentileofscore

# Import a third-party library for natural sorting
from natsort import natsorted  # natsort for natural sorting of strings

# First Organization

Create a list of all directories.

In [19]:
# Get the current working directory
current_directory = os.getcwd()

# List all directories in the current directory
directories = [d for d in os.listdir(current_directory)
               if os.path.isdir(os.path.join(current_directory, d))]

# Display the number of folders in the current directory
print(f"There are {len(directories)} folders in the current directory, which are:")
print()

# Print each directory name on a new line
for dir in directories:
    print(dir)

There are 6 folders in the current directory, which are:

2011-2012
2015-2016
2007-2008
2009-2010
2005-2006
2013-2014


It seems nice. Now create a list of csv files' paths in the current directory.

In [20]:
def find_csv_files(root_dir):
    """
    Find .csv files in a directory and its subdirectories.

    Parameters:
    - root_dir (str): The root directory to start searching for .csv files.

    Returns:
    - List[str]: A list of full paths to .csv files found in the specified directory and its subdirectories.
    """

    # Initialize an empty list to store the file paths
    csv_files = []

    for root, dirs, files in os.walk(root_dir):
        # Walk through the directory tree rooted at root_dir.
        for filename in fnmatch.filter(files, "*.csv"):
            # For each file with a .XPT extension, add its full path to the xpt_files list.
            csv_files.append(os.path.join(root, filename))

    return csv_files

In [21]:
# Specify the current directory as the root directory.
current_directory = os.getcwd()

# Call the find_xpt_files function with the current directory.
csv_file_paths = find_csv_files(current_directory)

# Print each found .XPT file path on a new line.
for csv_file_path in csv_file_paths:
    print(csv_file_path)

/Users/shahriyar/Desktop/Study/SUT/Opartions Analytics/Final Project/Datasets/2011-2012/final_df_1112.csv
/Users/shahriyar/Desktop/Study/SUT/Opartions Analytics/Final Project/Datasets/2015-2016/final_df_1516.csv
/Users/shahriyar/Desktop/Study/SUT/Opartions Analytics/Final Project/Datasets/2007-2008/final_df_0708.csv
/Users/shahriyar/Desktop/Study/SUT/Opartions Analytics/Final Project/Datasets/2009-2010/final_df_0910.csv
/Users/shahriyar/Desktop/Study/SUT/Opartions Analytics/Final Project/Datasets/2005-2006/final_df_0506.csv
/Users/shahriyar/Desktop/Study/SUT/Opartions Analytics/Final Project/Datasets/2013-2014/final_df_1314.csv


Seems perfect. Now, we can read these files.</br>
Although it does not seem a good idea, read all of the files in the memory.

In [22]:
dataset_dict = {}

# Loop through each XPT file path to read it.
for csv_file_path in csv_file_paths:

    dataset_name = os.path.basename(csv_file_path).split(".csv")[0]
    dataset_dict[dataset_name] = pd.read_csv(csv_file_path)
    
# Iterate through each dataset in the dictionary to show their information.
for key in dataset_dict.keys():

    print(key)
    print(dataset_dict[key].shape)
    print()

final_df_1112
(520, 12)

final_df_1516
(471, 12)

final_df_0708
(539, 12)

final_df_0910
(559, 12)

final_df_0506
(1063, 12)

final_df_1314
(535, 12)



# Merge All of the Datasets

Now, for the latest steps merge all of the datsets.

In [23]:
# Sort the datasets in the dictionary
dataset_dict = dict(sorted(dataset_dict.items()))

# Concatenate the DataFrames vertically
concatenated_df = pd.concat(dataset_dict, ignore_index=True)

# Display the resulting DataFrame
print("Concatenated Dataset has the shape of:", concatenated_df.shape)

concatenated_df.head(5)

Concatenated Dataset has the shape of: (3687, 12)


Unnamed: 0,ID,Gender,Age,Race,BMI,Systolic BP,Diastolic BP,FPG,2hrPG,HbA1c,Total Cholestrol,Diabetes Risk
0,31133.0,2,16,4,16.79,120.0,58.0,84.0,122.0,4.7,147.0,2.0
1,31139.0,2,18,2,29.45,110.0,64.0,,,,,1.0
2,31141.0,1,16,1,18.55,122.0,62.0,,,,,2.0
3,31148.0,2,16,3,18.28,106.0,42.0,91.0,69.0,5.0,126.0,2.0
4,31163.0,1,17,4,18.75,,,76.0,,5.6,249.0,2.0


# Clean the Dataset

Firstly, remove all of the records with missing values.

In [24]:
# Drop all rows with missing values
cleaned_df = concatenated_df.dropna()

# Display the resulting DataFrame
print("Concatenated Dataset has the shape of:", cleaned_df.shape)

cleaned_df.head(5)

Concatenated Dataset has the shape of: (2743, 12)


Unnamed: 0,ID,Gender,Age,Race,BMI,Systolic BP,Diastolic BP,FPG,2hrPG,HbA1c,Total Cholestrol,Diabetes Risk
0,31133.0,2,16,4,16.79,120.0,58.0,84.0,122.0,4.7,147.0,2.0
3,31148.0,2,16,3,18.28,106.0,42.0,91.0,69.0,5.0,126.0,2.0
6,31206.0,2,17,5,21.27,110.0,66.0,93.0,82.0,4.7,180.0,1.0
7,31211.0,2,17,3,23.27,108.0,72.0,79.0,96.0,5.4,178.0,2.0
10,31237.0,1,12,2,32.01,120.0,50.0,103.0,111.0,4.9,175.0,1.0


Check the dataset discreption to remove the records containing "refused" values.</br>
The only feature that should be checked is Diabetes Risk feature that may contain the code 7 or 9, which are the "Refused" and "Don't know" values.</br>
In order to get sure if it contains any of these values, we should check for them.

In [25]:
print("The values seen in the Diabetes Risk feature:\n", list(cleaned_df["Diabetes Risk"].unique()))

The values seen in the Diabetes Risk feature:
 [2.0, 1.0, 9.0]


Now, it is necessary to remove the records containing these values.

In [26]:
print("The number of the records with the Diabetes Risk values equals to 9 is:", len(cleaned_df[cleaned_df["Diabetes Risk"] == 9]))

The number of the records with the Diabetes Risk values equals to 9 is: 13


Removing these records will fix all of the missing values problems.

In [27]:
cleaned_df = cleaned_df[cleaned_df["Diabetes Risk"] != 9]

# Make the Race feature values like the ones used in the paper

The Race feature values that are used in the model is somehow different from how they are shown in the NHANES dataset.

In [28]:
# Define the mapping
race_mapping = {
    3: 1,  # Non-Hispanic White
    4: 2,  # Non-Hispanic Black
    1: 3,  # Mexican American
    2: 3,  # Other Hispanic
    5: 4   # Other Race - Including Multi-Racial
}

# Replace values based on the mapping
cleaned_df["Race"] = cleaned_df["Race"].replace(race_mapping)

# Display the modified DataFrame
cleaned_df.head(5)

Unnamed: 0,ID,Gender,Age,Race,BMI,Systolic BP,Diastolic BP,FPG,2hrPG,HbA1c,Total Cholestrol,Diabetes Risk
0,31133.0,2,16,2,16.79,120.0,58.0,84.0,122.0,4.7,147.0,2.0
3,31148.0,2,16,1,18.28,106.0,42.0,91.0,69.0,5.0,126.0,2.0
6,31206.0,2,17,4,21.27,110.0,66.0,93.0,82.0,4.7,180.0,1.0
7,31211.0,2,17,1,23.27,108.0,72.0,79.0,96.0,5.4,178.0,2.0
10,31237.0,1,12,3,32.01,120.0,50.0,103.0,111.0,4.9,175.0,1.0


# Percentile the BMI Feature

Using the BMI continous percentile is important when using it in the ML model and also in the ADA screening guideline.

In [29]:
# Calculate percentiles using percentileofscore
cleaned_df["BMI Percentile"] = cleaned_df["BMI"].apply(lambda x: round(percentileofscore(cleaned_df["BMI"], x), 2))

cleaned_df.head(5)

Unnamed: 0,ID,Gender,Age,Race,BMI,Systolic BP,Diastolic BP,FPG,2hrPG,HbA1c,Total Cholestrol,Diabetes Risk,BMI Percentile
0,31133.0,2,16,2,16.79,120.0,58.0,84.0,122.0,4.7,147.0,2.0,3.63
3,31148.0,2,16,1,18.28,106.0,42.0,91.0,69.0,5.0,126.0,2.0,11.17
6,31206.0,2,17,4,21.27,110.0,66.0,93.0,82.0,4.7,180.0,1.0,37.62
7,31211.0,2,17,1,23.27,108.0,72.0,79.0,96.0,5.4,178.0,2.0,54.3
10,31237.0,1,12,3,32.01,120.0,50.0,103.0,111.0,4.9,175.0,1.0,89.14


# Define and Calculate the Patients' Hypertension Status

Since the Hypertension status will be used in both the ML model and the ADA screening guideline, here it will be calculated.

In [30]:
cleaned_df = cleaned_df.copy()

cleaned_df["Hypertension"] = (
    (cleaned_df["Systolic BP"] >= 120) &
    (cleaned_df["Diastolic BP"] >= 80)
).astype(int)

cleaned_df.head(5)

Unnamed: 0,ID,Gender,Age,Race,BMI,Systolic BP,Diastolic BP,FPG,2hrPG,HbA1c,Total Cholestrol,Diabetes Risk,BMI Percentile,Hypertension
0,31133.0,2,16,2,16.79,120.0,58.0,84.0,122.0,4.7,147.0,2.0,3.63,0
3,31148.0,2,16,1,18.28,106.0,42.0,91.0,69.0,5.0,126.0,2.0,11.17,0
6,31206.0,2,17,4,21.27,110.0,66.0,93.0,82.0,4.7,180.0,1.0,37.62,0
7,31211.0,2,17,1,23.27,108.0,72.0,79.0,96.0,5.4,178.0,2.0,54.3,0
10,31237.0,1,12,3,32.01,120.0,50.0,103.0,111.0,4.9,175.0,1.0,89.14,0


# Define and Calculate the ADA Screening and Guideline Labels

As we know, the main object of the project is to assess the performance of and guidelines tool with the ML models.</br>
So, finding the patients diabetes status for both of the tools is necessary.

The first step is to calculate the diabetic condition based on the ADA biomarker tool.

In [31]:
cleaned_df = cleaned_df.copy()

cleaned_df["ADA Biomarker Label"] = (
    (cleaned_df["FPG"] >= 100) |
    (cleaned_df["2hrPG"] >= 140) |
    (cleaned_df["HbA1c"] >= 5.7)
).astype(int)

cleaned_df.head(5)

Unnamed: 0,ID,Gender,Age,Race,BMI,Systolic BP,Diastolic BP,FPG,2hrPG,HbA1c,Total Cholestrol,Diabetes Risk,BMI Percentile,Hypertension,ADA Biomarker Label
0,31133.0,2,16,2,16.79,120.0,58.0,84.0,122.0,4.7,147.0,2.0,3.63,0,0
3,31148.0,2,16,1,18.28,106.0,42.0,91.0,69.0,5.0,126.0,2.0,11.17,0,0
6,31206.0,2,17,4,21.27,110.0,66.0,93.0,82.0,4.7,180.0,1.0,37.62,0,0
7,31211.0,2,17,1,23.27,108.0,72.0,79.0,96.0,5.4,178.0,2.0,54.3,0,0
10,31237.0,1,12,3,32.01,120.0,50.0,103.0,111.0,4.9,175.0,1.0,89.14,0,1


The next step is to calculate the diabetic condition based on the ADA screening tool.

In [32]:
cleaned_df = cleaned_df.copy()

cleaned_df["ADA Screening Label"] = (
    (cleaned_df["BMI Percentile"] >= 85) &
    ((cleaned_df["Diabetes Risk"] == 1) |
     (cleaned_df["Race"] != 3) | 
     (cleaned_df["Hypertension"] == 1) |
     (cleaned_df["Total Cholestrol"] >= 170))
).astype(int)

cleaned_df.head(5)

Unnamed: 0,ID,Gender,Age,Race,BMI,Systolic BP,Diastolic BP,FPG,2hrPG,HbA1c,Total Cholestrol,Diabetes Risk,BMI Percentile,Hypertension,ADA Biomarker Label,ADA Screening Label
0,31133.0,2,16,2,16.79,120.0,58.0,84.0,122.0,4.7,147.0,2.0,3.63,0,0,0
3,31148.0,2,16,1,18.28,106.0,42.0,91.0,69.0,5.0,126.0,2.0,11.17,0,0,0
6,31206.0,2,17,4,21.27,110.0,66.0,93.0,82.0,4.7,180.0,1.0,37.62,0,0,0
7,31211.0,2,17,1,23.27,108.0,72.0,79.0,96.0,5.4,178.0,2.0,54.3,0,0,0
10,31237.0,1,12,3,32.01,120.0,50.0,103.0,111.0,4.9,175.0,1.0,89.14,0,1,1


Save the final dataset.

In [34]:
cleaned_df.to_csv("final_dataset.csv",
                  index=False)