In [278]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN

In [370]:
# Load datasets
train_df = pd.read_csv("Train_Data.csv")
test_df = pd.read_csv("Test_Data.csv")

In [371]:
missing_counts = train_df.isna().sum()
print(missing_counts)

SEQN         12
RIAGENDR     18
PAQ605       13
BMXBMI       18
LBXGLU       13
DIQ010       18
LBXGLT       11
LBXIN         9
age_group    14
dtype: int64


In [372]:
for col in train_df.columns:
    print(f"{col}: {train_df[col].unique()}")

SEQN: [73564. 73568. 73576. ... 83713. 83718. 83727.]
RIAGENDR: [ 2.  1. nan]
PAQ605: [ 2.  1. nan]
BMXBMI: [35.7 20.3 23.2 28.9 35.9 23.6 38.7 38.3 38.9 32.7 27.3 24.6 30.5 30.3
 22.1 37.8 28.  21.  33.  29.1 32.3 37.6 36.4 33.6 21.4 18.9 26.5 19.2
 36.2 26.4 22.5 27.5 25.6 37.5 39.7 21.1 19.  23.4 18.1 30.2 24.9 23.
 49.9 27.8 26.3 25.  19.7 28.6 22.6 41.1 19.5 50.9 28.1 30.7 30.8 21.5
 29.4 32.4 32.  29.7 24.2 22.4 26.8 39.2 31.9 31.2 25.4 24.5 18.4 23.5
 26.2 39.  35.  35.6 42.7  nan 23.9 47.2 34.2 40.7 24.3 37.  36.1 26.9
 14.5 29.5 21.3 49.2 30.6 33.3 19.8 16.8 28.2 45.1 30.4 39.8 47.7 22.7
 29.6 33.9 21.8 24.4 34.7 37.1 20.9 44.9 26.1 28.3 47.8 27.9 31.4 43.1
 28.5 21.7 38.5 48.6 27.2 29.  43.2 23.7 36.8 25.2 51.3 27.6 30.  34.
 43.3 32.6 21.6 18.6 33.2 25.8 44.1 44.6 31.5 20.8 26.7 40.  24.  29.2
 29.3 20.5 15.7 33.8 33.4 22.2 27.  33.1 34.6 34.8 18.2 45.7 38.8 17.1
 20.  26.6 27.7 29.8 29.9 32.1 31.8 17.9 68.6 35.1 42.2 31.1 27.4 19.1
 54.  50.2 17.3 20.7 37.2 19.6 32.9 23.8 3

In [373]:
len(train_df)

1966

In [374]:
train_df_cleaned = train_df.dropna()
len(train_df_cleaned)

1841

In [375]:
print(train_df['age_group'].value_counts())

age_group
Adult     1638
Senior     314
Name: count, dtype: int64


In [376]:
missing_counts = train_df_cleaned.isna().sum()
print(missing_counts)

SEQN         0
RIAGENDR     0
PAQ605       0
BMXBMI       0
LBXGLU       0
DIQ010       0
LBXGLT       0
LBXIN        0
age_group    0
dtype: int64


In [377]:
train_df = pd.get_dummies(train_df_cleaned, columns=["RIAGENDR", "PAQ605", "DIQ010"])

In [378]:
x_train, x_cv = train_test_split(train_df, test_size=0.2, random_state=42)

In [379]:
y_train= x_train['age_group']
y_cv = x_cv ['age_group']
print(y_train)

615      Adult
1011     Adult
1738     Adult
1834    Senior
1422     Adult
         ...  
1199     Adult
1374    Senior
915      Adult
1558     Adult
1195     Adult
Name: age_group, Length: 1472, dtype: object


In [380]:
# Drop 'SEQN' and 'age_group' columns
x_train = x_train.drop(columns=["SEQN", "age_group"])
x_cv = x_cv.drop(columns=["SEQN", "age_group"])
print(x_train)


      BMXBMI  LBXGLU  LBXGLT  LBXIN  RIAGENDR_1.0  RIAGENDR_2.0  PAQ605_1.0  \
615     24.1    93.0    81.0  10.55          True         False       False   
1011    22.0    94.0    74.0  16.15          True         False        True   
1738    23.7    93.0    58.0   5.06         False          True       False   
1834    18.1   119.0   229.0   8.44         False          True       False   
1422    21.8    81.0   126.0   2.62         False          True       False   
...      ...     ...     ...    ...           ...           ...         ...   
1199    27.5    90.0   103.0   9.18         False          True       False   
1374    29.8   111.0   141.0  11.43         False          True       False   
915     31.6   100.0    81.0   7.84          True         False       False   
1558    20.3    94.0   132.0   8.14         False          True       False   
1195    41.2   100.0   113.0  17.62          True         False       False   

      PAQ605_2.0  DIQ010_1.0  DIQ010_2.0  DIQ010_3.

In [381]:
smote = SMOTE(random_state=42)
x_train_re, y_train_re = smote.fit_resample(x_train, y_train)
print(len(x_train_re))
print(y_train_re.value_counts())

2466
age_group
Adult     1233
Senior    1233
Name: count, dtype: int64


In [316]:
adasyn = ADASYN(random_state=42)
x_train_re, y_train_re = adasyn.fit_resample(x_train, y_train)
print(len(x_train_re))
print(y_train_re.value_counts())

2471
age_group
Senior    1238
Adult     1233
Name: count, dtype: int64


In [382]:
x_test = test_df.drop(columns=["SEQN"])

In [383]:
missing_counts_test = x_test.isna().sum()
print(missing_counts_test)

RIAGENDR    2
PAQ605      1
BMXBMI      1
LBXGLU      1
DIQ010      1
LBXGLT      2
LBXIN       1
dtype: int64


In [384]:
# Impute mode for categorical columns
for col in ["RIAGENDR", "PAQ605", "DIQ010"]:
    mode_val = x_test[col].mode().iloc[0]
    x_test[col] = x_test[col].fillna(mode_val)

# Impute mean for numerical columns
for col in ["BMXBMI", "LBXGLU", "LBXGLT", "LBXIN"]:
    mean_val = x_test[col].mean()
    x_test[col] = x_test[col].fillna(mean_val)

In [385]:
x_test = pd.get_dummies(x_test, columns=["RIAGENDR", "PAQ605", "DIQ010"])

In [386]:
missing_counts_test = x_test.isna().sum()
print(missing_counts_test)

BMXBMI          0
LBXGLU          0
LBXGLT          0
LBXIN           0
RIAGENDR_1.0    0
RIAGENDR_2.0    0
PAQ605_1.0      0
PAQ605_2.0      0
DIQ010_1.0      0
DIQ010_2.0      0
DIQ010_3.0      0
dtype: int64


In [387]:
model = LogisticRegression( random_state=42,C=10, penalty='l1', solver='liblinear')
model.fit(x_train_re, y_train_re)

In [388]:
train_predictions = model.predict(x_train_re)

In [389]:
training_accuracy = accuracy_score(y_train_re, train_predictions)
print(training_accuracy)

0.7846715328467153


In [390]:
cv_predictions = model.predict(x_cv)

In [391]:
cv_accuracy = accuracy_score(y_cv, cv_predictions)
print(cv_accuracy)

0.8509485094850948


In [392]:
test_predictions = model.predict(x_test)

In [393]:
print(test_predictions)

['Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Senior' 'Adult' 'Adult' 'Adult'
 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Senior' 'Adult' 'Adult' 'Adult'
 'Adult' 'Adult' 'Senior' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult'
 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Senior' 'Adult' 'Adult'
 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult'
 'Adult' 'Adult' 'Adult' 'Adult' 'Senior' 'Senior' 'Adult' 'Adult'
 'Senior' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Senior' 'Adult' 'Adult'
 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Senior' 'Adult' 'Adult' 'Adult'
 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult'
 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult'
 'Adult' 'Senior' 'Adult' 'Adult' 'Adult' 'Adult' 'Senior' 'Adult' 'Adult'
 'Adult' 'Adult' 'Adult' 'Adult' 'Senior' 'Adult' 'Adult' 'Adult' 'Adult'
 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult'
 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult

In [394]:
test_predictions_encoded = np.where(test_predictions == 'Senior', 1, 0)

In [395]:
output_df = pd.DataFrame({
        'age_group': test_predictions_encoded
    })

In [396]:
print(output_df)

     age_group
0            0
1            0
2            0
3            0
4            0
..         ...
307          0
308          0
309          0
310          0
311          1

[312 rows x 1 columns]


In [397]:
output_df['age_group'].unique()

array([0, 1])

In [398]:
output_filename = 'logistic_predicted_age_group.csv'
output_df.to_csv(output_filename, index=False)

In [334]:
# Define the hyperparameter grid
logisticparam_grid = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga']  # these solvers support both l1 and l2
}

# Set up GridSearchCV
logisticgrid_search = GridSearchCV(estimator=model, param_grid=logisticparam_grid,
                           cv=5, scoring='accuracy', n_jobs=-1)

# Fit the model
logisticgrid_search.fit(x_train_re, y_train_re)

# Output best parameters and best model
print("Best parameters:", logisticgrid_search.best_params_)
best_logreg_model = logisticgrid_search.best_estimator_


Best parameters: {'C': 10, 'penalty': 'l1', 'solver': 'liblinear'}


In [399]:
from sklearn.ensemble import RandomForestClassifier

In [400]:
RFmodel = RandomForestClassifier(random_state=42,criterion='gini', max_depth=20, max_features='sqrt', min_samples_leaf=1, min_samples_split=2, n_estimators=100)
RFmodel.fit(x_train_re, y_train_re)

In [401]:
rftrain_predictions = RFmodel.predict(x_train_re)

In [402]:
rftraining_accuracy = accuracy_score(y_train_re, rftrain_predictions)
print(rftraining_accuracy)

1.0


In [403]:
rfcv_predictions = RFmodel.predict(x_cv)
rfcv_accuracy = accuracy_score(y_cv, rfcv_predictions)
print(rfcv_accuracy)

0.8265582655826558


In [404]:
rftest_predictions = RFmodel.predict(x_test)

In [405]:
rftest_predictions_encoded = np.where(rftest_predictions == 'Senior', 1, 0)

In [406]:
rfoutput_df = pd.DataFrame({
        'age_group': rftest_predictions_encoded
    })

In [407]:
rfoutput_filename = 'RF_predicted_age_group.csv'
rfoutput_df.to_csv(rfoutput_filename, index=False)

In [360]:
# Define the hyperparameter grid
rfparam_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2,3, 5,20],
    'min_samples_leaf': [1, 2,5],
    'max_features': ['sqrt', 'log2'],
    'criterion':['gini','entropy','log_loss']
}

# Set up GridSearchCV
rfgrid_search = GridSearchCV(estimator=RFmodel, param_grid=rfparam_grid, 
                           cv=5, scoring='accuracy', n_jobs=-1)

# Fit the model
rfgrid_search.fit(x_train_re, y_train_re)

# Output best parameters and model
print("Best parameters:", rfgrid_search.best_params_)
best_rf_model = rfgrid_search.best_estimator_

Best parameters: {'criterion': 'gini', 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}


In [135]:
from sklearn.naive_bayes import GaussianNB

In [149]:
GNBmodel = GaussianNB(var_smoothing=1e-08)
GNBmodel.fit(x_train_re, y_train_re)

In [150]:
gnbtrain_predictions = GNBmodel.predict(x_train_re)

In [151]:
gnbtraining_accuracy = accuracy_score(y_train_re, gnbtrain_predictions)
print(gnbtraining_accuracy)

0.7291159772911597


In [152]:
gnbcv_predictions = GNBmodel.predict(x_cv)
gnbcv_accuracy = accuracy_score(y_cv, gnbcv_predictions)
print(gnbcv_accuracy)

0.6856368563685636


In [153]:
gnbtest_predictions = GNBmodel.predict(x_test)

In [154]:
gnbtest_predictions_encoded = np.where(gnbtest_predictions == 'Senior', 1, 0)

In [155]:
gnboutput_df = pd.DataFrame({
        'age_group': gnbtest_predictions_encoded
    })

In [156]:
gnboutput_filename = 'GNB_predicted_age_group.csv'
gnboutput_df.to_csv(gnboutput_filename, index=False)

In [157]:
# Define the hyperparameter grid
param_grid = {
    'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]
}

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=GNBmodel, param_grid=param_grid, 
                           cv=5, scoring='accuracy', n_jobs=-1)

# Fit the model
grid_search.fit(x_train_re, y_train_re)

# Output best parameters and model
print("Best parameters:", grid_search.best_params_)
best_gnb_model = grid_search.best_estimator_

Best parameters: {'var_smoothing': 1e-08}


In [159]:
y_train_enc = np.where(y_train_re == 'Senior', 1, 0)
y_cv_enc=np.where(y_cv == 'Senior',1,0)
print(y_train_enc)

[0 0 0 ... 1 1 1]


In [171]:
xgb_model = XGBClassifier(learning_rate=0.2, max_depth= 7, n_estimators= 200)
xgb_model.fit(x_train_re, y_train_enc)

In [172]:
xgbtrain_predictions = xgb_model.predict(x_train_re)

In [173]:
xgbtraining_accuracy = accuracy_score(y_train_enc, xgbtrain_predictions)
print(xgbtraining_accuracy)

1.0


In [174]:
xgbcv_predictions = xgb_model.predict(x_cv)
xgbcv_accuracy = accuracy_score(y_cv_enc, xgbcv_predictions)
print(xgbcv_accuracy)

0.7831978319783198


In [175]:
xgbtest_predictions = xgb_model.predict(x_test)

In [176]:
print(xgbtest_predictions)

[0 0 0 0 0 0 0 1 0 0 1 1 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 1 0 0 0 0
 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 1 0
 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1
 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0]


In [177]:
xgboutput_df = pd.DataFrame({
        'age_group': xgbtest_predictions
    })

In [178]:
xgboutput_filename = 'XGB_predicted_age_group.csv'
xgboutput_df.to_csv(xgboutput_filename, index=False)

In [179]:
# Define the base model
xgb_model = XGBClassifier(eval_metric='logloss')

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}

# Set up GridSearchCV
xgbgrid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit on training data
xgbgrid_search.fit(x_train_re, y_train_enc)

# Output the best parameters and best model
print("Best parameters:", xgbgrid_search.best_params_)
best_xgb_model = xgbgrid_search.best_estimator_

Best parameters: {'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 200}


In [180]:
from sklearn.svm import SVC

In [223]:
svcmodel = SVC()
svcmodel.fit(x_train_re, y_train_re)

In [224]:
svctrain_predictions = svcmodel.predict(x_train_re)

In [225]:
svctraining_accuracy = accuracy_score(y_train_re, svctrain_predictions)
print(svctraining_accuracy)

0.6849148418491484


In [226]:
svccv_predictions = svcmodel.predict(x_cv)
svccv_accuracy = accuracy_score(y_cv, svccv_predictions)
print(svccv_accuracy)

0.7425474254742548


In [227]:
svctest_predictions = svcmodel.predict(x_test)

In [228]:
svctest_predictions_encoded = np.where(svctest_predictions == 'Senior', 1, 0)

In [229]:
svcoutput_df = pd.DataFrame({
        'age_group': svctest_predictions_encoded
    })

In [201]:
svcoutput_filename = 'SVC_predicted_age_group.csv'
svcoutput_df.to_csv(svcoutput_filename, index=False)

In [230]:
# Define the hyperparameter grid
svcparam_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly']
}

# Set up GridSearchCV
svcgrid_search = GridSearchCV(estimator=svcmodel, param_grid=svcparam_grid,
                           cv=5, scoring='accuracy', n_jobs=-1)

# Fit the model
svcgrid_search.fit(x_train_re, y_train_re)

# Output best parameters and model
print("Best parameters:", svcgrid_search.best_params_)
best_svc_model = svcgrid_search.best_estimator_

Best parameters: {'C': 10, 'kernel': 'linear'}


In [202]:
from sklearn.ensemble import AdaBoostClassifier

In [214]:
adamodel = AdaBoostClassifier(random_state=42,learning_rate=0.1, n_estimators=200)
adamodel.fit(x_train_re, y_train_re)



In [215]:
adatrain_predictions = adamodel.predict(x_train_re)

In [216]:
adatraining_accuracy = accuracy_score(y_train_re, adatrain_predictions)
print(adatraining_accuracy)

0.8158961881589619


In [217]:
adacv_predictions = adamodel.predict(x_cv)
adacv_accuracy = accuracy_score(y_cv, adacv_predictions)
print(adacv_accuracy)

0.7696476964769647


In [218]:
adatest_predictions = adamodel.predict(x_test)
print(adatest_predictions)

['Adult' 'Senior' 'Senior' 'Adult' 'Adult' 'Adult' 'Senior' 'Senior'
 'Adult' 'Adult' 'Senior' 'Senior' 'Adult' 'Senior' 'Senior' 'Adult'
 'Adult' 'Senior' 'Adult' 'Adult' 'Senior' 'Adult' 'Adult' 'Adult' 'Adult'
 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Senior' 'Adult' 'Senior'
 'Senior' 'Adult' 'Adult' 'Senior' 'Adult' 'Adult' 'Adult' 'Adult'
 'Senior' 'Adult' 'Adult' 'Adult' 'Adult' 'Senior' 'Adult' 'Senior'
 'Adult' 'Adult' 'Adult' 'Senior' 'Adult' 'Senior' 'Adult' 'Senior'
 'Senior' 'Senior' 'Adult' 'Adult' 'Adult' 'Adult' 'Senior' 'Adult'
 'Adult' 'Senior' 'Adult' 'Senior' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult'
 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult'
 'Adult' 'Adult' 'Adult' 'Senior' 'Adult' 'Adult' 'Senior' 'Adult' 'Adult'
 'Adult' 'Adult' 'Senior' 'Senior' 'Adult' 'Adult' 'Adult' 'Senior'
 'Adult' 'Senior' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult'
 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult'
 'A

In [219]:
adatest_predictions_encoded = np.where(adatest_predictions == 'Senior', 1, 0)
print(adatest_predictions_encoded)

[0 1 1 0 0 0 1 1 0 0 1 1 0 1 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0
 1 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 1 0 1 0 1 1 1 0 0 0 0 1 0 0 1 0 1 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0
 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 1 0 1 0 0 0 0 1 0 1 0 1 0 1 0 1 0 1 1 0
 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 1
 0 1 0 1 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0]


In [220]:
adaoutput_df = pd.DataFrame({
        'age_group': adatest_predictions_encoded
    })

In [213]:
# Define hyperparameter grid
adaparam_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0]
}

# Set up GridSearchCV
adagrid_search = GridSearchCV(estimator=adamodel, param_grid=adaparam_grid,
                           cv=5, scoring='accuracy', n_jobs=-1)

# Fit the model
adagrid_search.fit(x_train_re, y_train_re)

# Output best parameters and model
print("Best parameters:", adagrid_search.best_params_)
best_ada_model = adagrid_search.best_estimator_



Best parameters: {'learning_rate': 1.0, 'n_estimators': 200}


In [221]:
adaoutput_filename = 'ADA_predicted_age_group.csv'
adaoutput_df.to_csv(adaoutput_filename, index=False)

In [231]:
from catboost import CatBoostClassifier

In [242]:
catmodel = CatBoostClassifier(verbose=0,depth=8, iterations=200, learning_rate=0.01)  # Set verbose=0 to suppress training output
catmodel.fit(x_train_re, y_train_re)

<catboost.core.CatBoostClassifier at 0x140c9f46b90>

In [243]:
cattrain_predictions = catmodel.predict(x_train_re)

In [244]:
cattraining_accuracy = accuracy_score(y_train_re, cattrain_predictions)
print(cattraining_accuracy)

0.8653690186536902


In [245]:
catcv_predictions = catmodel.predict(x_cv)
catcv_accuracy = accuracy_score(y_cv, catcv_predictions)
print(catcv_accuracy)

0.8102981029810298


In [246]:
cattest_predictions = catmodel.predict(x_test)
print(cattest_predictions)

['Adult' 'Senior' 'Senior' 'Adult' 'Adult' 'Adult' 'Adult' 'Senior'
 'Adult' 'Adult' 'Senior' 'Senior' 'Adult' 'Senior' 'Senior' 'Adult'
 'Adult' 'Adult' 'Adult' 'Adult' 'Senior' 'Adult' 'Adult' 'Adult' 'Adult'
 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Senior'
 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Senior'
 'Adult' 'Adult' 'Adult' 'Adult' 'Senior' 'Adult' 'Senior' 'Senior'
 'Adult' 'Adult' 'Senior' 'Adult' 'Adult' 'Adult' 'Senior' 'Adult'
 'Senior' 'Adult' 'Adult' 'Adult' 'Adult' 'Senior' 'Adult' 'Adult'
 'Senior' 'Adult' 'Senior' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult'
 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult'
 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Senior' 'Adult' 'Adult' 'Adult'
 'Adult' 'Senior' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult'
 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Senior' 'Adult' 'Adult' 'Adult'
 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adu

In [247]:
cattest_predictions_encoded = np.where(cattest_predictions == 'Senior', 1, 0)
print(cattest_predictions_encoded)

[0 1 1 0 0 0 0 1 0 0 1 1 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 0 0 0 1 0 0 0 0 1 0 1 1 0 0 1 0 0 0 1 0 1 0 0 0 0 1 0 0 1 0 1 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0
 0 0 1 0 0 0 0 1 1 0 0 0 0 1 1 1 0 1 0 1 0 0 0 0 1 0 0 0 1 0 1 0 1 0 0 1 0
 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 1
 0 1 0 1 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0]


In [248]:
catoutput_df = pd.DataFrame({
        'age_group': cattest_predictions_encoded
    })

In [249]:
catoutput_filename = 'CAT_predicted_age_group.csv'
catoutput_df.to_csv(catoutput_filename, index=False)

In [241]:
# Define the parameter grid
catparam_grid = {
    'iterations': [100, 200],
    'depth': [4, 6, 8],
    'learning_rate': [0.01, 0.1]
}

# Set up GridSearchCV
catgrid_search = GridSearchCV(estimator=catmodel, param_grid=catparam_grid,
                           cv=5, scoring='accuracy', n_jobs=-1)

# Fit the model
catgrid_search.fit(x_train, y_train)

# Best parameters and model
print("Best parameters:", catgrid_search.best_params_)
best_catboost_model = catgrid_search.best_estimator_

Best parameters: {'depth': 8, 'iterations': 200, 'learning_rate': 0.01}


In [250]:
from sklearn.ensemble import GradientBoostingClassifier

In [270]:
gbcmodel = GradientBoostingClassifier()
gbcmodel.fit(x_train_re, y_train_re)

In [271]:
gbctrain_predictions = gbcmodel.predict(x_train_re)

In [272]:
gbctraining_accuracy = accuracy_score(y_train_re, gbctrain_predictions)
print(gbctraining_accuracy)

0.8698296836982968


In [273]:
gbccv_predictions = catmodel.predict(x_cv)
gbccv_accuracy = accuracy_score(y_cv, gbccv_predictions)
print(gbccv_accuracy)

0.8102981029810298


In [274]:
gbctest_predictions = gbcmodel.predict(x_test)
print(gbctest_predictions)

['Adult' 'Senior' 'Senior' 'Adult' 'Adult' 'Adult' 'Adult' 'Senior'
 'Adult' 'Adult' 'Senior' 'Senior' 'Adult' 'Senior' 'Senior' 'Adult'
 'Adult' 'Adult' 'Adult' 'Adult' 'Senior' 'Adult' 'Adult' 'Adult' 'Adult'
 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Senior' 'Adult' 'Senior'
 'Senior' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Senior'
 'Adult' 'Adult' 'Adult' 'Adult' 'Senior' 'Adult' 'Senior' 'Senior'
 'Adult' 'Adult' 'Senior' 'Adult' 'Adult' 'Adult' 'Senior' 'Adult'
 'Senior' 'Adult' 'Adult' 'Adult' 'Adult' 'Senior' 'Adult' 'Adult'
 'Senior' 'Adult' 'Senior' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult'
 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult'
 'Adult' 'Senior' 'Senior' 'Adult' 'Adult' 'Senior' 'Adult' 'Adult'
 'Adult' 'Adult' 'Senior' 'Adult' 'Adult' 'Adult' 'Adult' 'Senior' 'Adult'
 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult'
 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult' 'Adult'


In [275]:
gbctest_predictions_encoded = np.where(gbctest_predictions == 'Senior', 1, 0)
print(gbctest_predictions_encoded)

[0 1 1 0 0 0 0 1 0 0 1 1 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0
 0 0 0 0 0 1 0 0 0 0 1 0 1 1 0 0 1 0 0 0 1 0 1 0 0 0 0 1 0 0 1 0 1 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0
 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 1 0
 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 1
 0 1 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0]


In [276]:
gbcoutput_df = pd.DataFrame({
        'age_group': gbctest_predictions_encoded
    })

In [277]:
gbcoutput_filename = 'GBC_predicted_age_group.csv'
gbcoutput_df.to_csv(gbcoutput_filename, index=False)

In [261]:
# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=gbcmodel, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit on training data
grid_search.fit(x_train, y_train)

# Best model and parameters
best_model = grid_search.best_estimator_
print("Best parameters:", grid_search.best_params_)

Best parameters: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100}
