In [8]:
import pandas as pd

# Load the dataset
file_path = './data/dementia_data-MRI-features.csv'
data = pd.read_csv(file_path)

# Step 1: Drop rows with missing values in 'SES' and 'MMSE' columns
data_cleaned = data.dropna(subset=['SES', 'MMSE'])

# Step 2: Remove unnecessary columns ('Subject ID', 'MRI ID', 'Hand')
data_cleaned = data_cleaned.drop(columns=['Subject ID', 'MRI ID', 'Hand'])

# Encode 'Group' column: 'Nondemented' -> 0, 'Demented' -> 1
data_cleaned['Group'] = data_cleaned['Group'].map({'Nondemented': 0, 'Demented': 1, 'Converted':2})

# Encode 'M/F' column: 'M' -> 0, 'F' -> 1
data_cleaned['M/F'] = data_cleaned['M/F'].map({'M': 0, 'F': 1})
data_cleaned['SES'] = data_cleaned['SES'].astype(int)
data_cleaned['MMSE'] = data_cleaned['MMSE'].astype(int)


# Display the first few rows to verify encoding
print(data_cleaned.head())

# Display the cleaned data's structure and the first few rows to verify
print(data_cleaned.info())
print(data_cleaned.head())



   Group  Visit  MR Delay  M/F  Age  EDUC  SES  MMSE  CDR  eTIV   nWBV    ASF
0      0      1         0    0   87    14    2    27  0.0  1987  0.696  0.883
1      0      2       457    0   88    14    2    30  0.0  2004  0.681  0.876
5      0      1         0    1   88    18    3    28  0.0  1215  0.710  1.444
6      0      2       538    1   90    18    3    27  0.0  1200  0.718  1.462
7      0      1         0    0   80    12    4    28  0.0  1689  0.712  1.039
<class 'pandas.core.frame.DataFrame'>
Index: 354 entries, 0 to 372
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Group     354 non-null    int64  
 1   Visit     354 non-null    int64  
 2   MR Delay  354 non-null    int64  
 3   M/F       354 non-null    int64  
 4   Age       354 non-null    int64  
 5   EDUC      354 non-null    int64  
 6   SES       354 non-null    int32  
 7   MMSE      354 non-null    int32  
 8   CDR       354 non-null    float64
 9 

In [9]:
from sklearn.model_selection import train_test_split

# Define features and target variable
X = data_cleaned.drop(columns=['Group'])
y = data_cleaned['Group']

# Perform 80:20 train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Combine features and target for saving
train_data = pd.concat([X_train, y_train], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)

# Save to CSV files
train_data.to_csv('./data/dementia_data_train.csv', index=False)
test_data.to_csv('./data/dementia_data_test.csv', index=False)

print("Train and test datasets saved as 'dementia_data_train.csv' and 'dementia_data_test.csv'")


Train and test datasets saved as 'dementia_data_train.csv' and 'dementia_data_test.csv'
