In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np

# --- 1. Load the Datasets ---
# Load the training and testing data from the uploaded CSV files.
try:
    train_df = pd.read_csv('Train_Data.csv')
    test_df = pd.read_csv('Test_Data.csv')
    sample_submission_df = pd.read_csv('Sample_Submission.csv')
except FileNotFoundError:
    print("Ensure 'Train_Data.csv', 'Test_Data.csv', and 'Sample_Submission.csv' are uploaded to your Colab environment.")
    # Create dummy dataframes to prevent further errors if files are not found
    train_df = pd.DataFrame()
    test_df = pd.DataFrame()


# Keep a copy of the original test SEQN for the final submission file.
test_seqn = test_df['SEQN'] if 'SEQN' in test_df.columns else None

# --- 2. Exploratory Data Analysis (EDA) & Preprocessing ---

# Display basic information about the training data
print("--- Training Data Info ---")
train_df.info()
print("\n--- Missing Values in Training Data ---")
print(train_df.isnull().sum())

# Display basic information about the test data
print("\n--- Test Data Info ---")
test_df.info()
print("\n--- Missing Values in Test Data ---")
print(test_df.isnull().sum())


# --- 3. Target Variable Encoding ---
# The target variable 'age_group' is categorical ('Adult', 'Senior').
# We need to convert it to numerical format (0 and 1) for the model.
# 'Adult' will be 0 and 'Senior' will be 1, as specified.
if 'age_group' in train_df.columns:
    train_df['age_group'] = train_df['age_group'].apply(lambda x: 1 if x == 'Senior' else 0)
    print("\nTarget variable 'age_group' encoded successfully.")
    print(train_df['age_group'].value_counts())


# --- 4. Feature Selection ---
# Define the feature columns we will use to train the model.
# 'SEQN' is just an identifier and should not be used for training.
features = [
    'RIAGENDR',
    'PAQ605',
    'BMXBMI',
    'LBXGLU',
    'DIQ010',
    'LBXGLT',
    'LBXIN'
]

# Create our feature matrix (X) and target vector (y)
X = train_df[features]
y = train_df['age_group']

# Also prepare the test set with the same features
X_test = test_df[features]


# --- 5. Handling Missing Values (Imputation) ---
# Machine learning models cannot handle missing values (NaNs).
# We will fill them using the median of each respective column from the training data.
# Using the median is robust to outliers.
# We calculate the median from the training set and use it to fill NaNs
# in both the training and test sets to prevent data leakage.
for col in features:
    median_val = X[col].median()
    X[col].fillna(median_val, inplace=True)
    X_test[col].fillna(median_val, inplace=True)

print("\n--- Missing Values After Imputation ---")
print("Training Set:", X.isnull().sum().sum())
print("Test Set:", X_test.isnull().sum().sum())


# --- 6. Model Training ---
# We will use a RandomForestClassifier, which is a powerful and commonly used model
# for this type of problem. It's an ensemble of decision trees.
# We'll train the model on the entire training dataset.

# Initialize the model with some standard parameters.
# random_state ensures that the results are reproducible.
model = RandomForestClassifier(n_estimators=200, random_state=42, max_depth=10, min_samples_leaf=5)

# Train the model
model.fit(X, y)

print("\nModel training complete.")


# --- 7. Make Predictions on the Test Set ---
# Now we use our trained model to predict the age group for the test data.
test_predictions = model.predict(X_test)

print("Predictions generated for the test set.")


# --- 8. Create Submission File ---
# We need to create a CSV file in the format specified by 'Sample_Submission.csv'.
# It should have two columns: 'SEQN' and 'age_group'.

if test_seqn is not None:
    submission_df = pd.DataFrame({
        'SEQN': test_seqn,
        'age_group': test_predictions
    })

    # Save the dataframe to a CSV file. index=False means the row numbers won't be saved.
    submission_df.to_csv('submission.csv', index=False)

    print("\nSubmission file 'submission.csv' has been created successfully!")
    print("--- First 5 rows of submission file ---")
    print(submission_df.head())
else:
    print("\nCould not create submission file because test 'SEQN' column was not found.")



--- Training Data Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1966 entries, 0 to 1965
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   SEQN       1954 non-null   float64
 1   RIAGENDR   1948 non-null   float64
 2   PAQ605     1953 non-null   float64
 3   BMXBMI     1948 non-null   float64
 4   LBXGLU     1953 non-null   float64
 5   DIQ010     1948 non-null   float64
 6   LBXGLT     1955 non-null   float64
 7   LBXIN      1957 non-null   float64
 8   age_group  1952 non-null   object 
dtypes: float64(8), object(1)
memory usage: 138.4+ KB

--- Missing Values in Training Data ---
SEQN         12
RIAGENDR     18
PAQ605       13
BMXBMI       18
LBXGLU       13
DIQ010       18
LBXGLT       11
LBXIN         9
age_group    14
dtype: int64

--- Test Data Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 312 entries, 0 to 311
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ---

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[col].fillna(median_val, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to pe


Model training complete.
Predictions generated for the test set.

Submission file 'submission.csv' has been created successfully!
--- First 5 rows of submission file ---
      SEQN  age_group
0  77017.0          0
1  75580.0          0
2  73820.0          0
3  80489.0          0
4  82047.0          0
