In [348]:
import numpy as np 
import pandas as pd 
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [349]:
# train_df = pd.read_csv("/kaggle/input/playground-series-s4e2/train.csv")
# test_df = pd.read_csv("/kaggle/input/playground-series-s4e2/test.csv")
train_df = pd.read_csv("data/s4e2/train.csv")
test_df = pd.read_csv("data/s4e2/test.csv")
origin_df = pd.read_csv("data/s4e2/ObesityDataSet.csv")
(train_df.shape, test_df.shape, origin_df.shape)

((20758, 18), (13840, 17), (2111, 17))

In [350]:
# # 计算 BIM 特征
# train_df['BIM'] = train_df['Weight'] / ((train_df['Height']) ** 2)
# test_df['BIM'] = test_df['Weight'] / ((test_df['Height']) ** 2)
# origin_df['BIM'] = origin_df['Weight'] / ((origin_df['Height']) ** 2)
# (train_df.shape, test_df.shape, origin_df.shape)

In [351]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20758 entries, 0 to 20757
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              20758 non-null  int64  
 1   Gender                          20758 non-null  object 
 2   Age                             20758 non-null  float64
 3   Height                          20758 non-null  float64
 4   Weight                          20758 non-null  float64
 5   family_history_with_overweight  20758 non-null  object 
 6   FAVC                            20758 non-null  object 
 7   FCVC                            20758 non-null  float64
 8   NCP                             20758 non-null  float64
 9   CAEC                            20758 non-null  object 
 10  SMOKE                           20758 non-null  object 
 11  CH2O                            20758 non-null  float64
 12  SCC                             

In [352]:
numeric_cols = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE', 'BIM']
category_cols = ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS', 'NObeyesdad']

### 将category_cols中的object转为数字类型

In [353]:
def change_object_cols(se):
    value = se.unique().tolist()
    value.sort()
    return se.map(pd.Series(range(len(value)), index = value)).values

In [354]:
train_df = pd.concat([train_df, origin_df]).drop_duplicates()  # (22869, 18) -->  (22845, 18)

train_df['NObeyesdad'] = change_object_cols(train_df['NObeyesdad'])

train_target = train_df['NObeyesdad']

train_df = train_df.drop(columns=['NObeyesdad'])
train_df = train_df.drop(columns=['id'])
train_df.shape

(22845, 16)

In [355]:
train_df.head(5)

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS
0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation
1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile
2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation
3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation
4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation


In [356]:
def featuring_processing(data):
    for col in category_cols[:-1]:
        data[col] = change_object_cols(data[col])
    return data

In [357]:

def generate_poly_feature(data):
# 初始化 PolynomialFeatures
    poly = PolynomialFeatures(degree=2, include_bias=False)
    # 生成所有特征的组合特征
    poly_features = poly.fit_transform(data)
    # 创建列名
    column_names = poly.get_feature_names_out(input_features=data.columns)
    # 将生成的特征转换为 DataFrame
    poly_df = pd.DataFrame(poly_features, columns=column_names)
    return poly_df



def scaled_processing(data):
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(data)
    scaled_data = pd.DataFrame(scaled_data, columns = data.columns)
    return scaled_data

In [358]:
train_df.head(5)

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS
0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation
1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile
2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation
3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation
4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation


In [359]:
train_df = pd.get_dummies(train_df, columns=category_cols[:-1])
test_df = pd.get_dummies(test_df, columns=category_cols[:-1])
train_df.head()


Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,Gender_Female,Gender_Male,...,SCC_yes,CALC_Always,CALC_Frequently,CALC_Sometimes,CALC_no,MTRANS_Automobile,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
0,24.443011,1.699998,81.66995,2.0,2.983297,2.763573,0.0,0.976473,False,True,...,False,False,False,True,False,False,False,False,True,False
1,18.0,1.56,57.0,2.0,3.0,2.0,1.0,1.0,True,False,...,False,False,False,False,True,True,False,False,False,False
2,18.0,1.71146,50.165754,1.880534,1.411685,1.910378,0.866045,1.673584,True,False,...,False,False,False,False,True,False,False,False,True,False
3,20.952737,1.71073,131.274851,3.0,3.0,1.674061,1.467863,0.780199,True,False,...,False,False,False,True,False,False,False,False,True,False
4,31.641081,1.914186,93.798055,2.679664,1.971472,1.979848,1.967973,0.931721,False,True,...,False,False,False,True,False,False,False,False,True,False


In [360]:
def bool_to_int(x):
    if isinstance(x, bool):
        return int(x)
    else:
        return x

train_df = train_df.map(bool_to_int)
test_df = test_df.map(bool_to_int)


train_df.head(5)

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,Gender_Female,Gender_Male,...,SCC_yes,CALC_Always,CALC_Frequently,CALC_Sometimes,CALC_no,MTRANS_Automobile,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
0,24.443011,1.699998,81.66995,2.0,2.983297,2.763573,0.0,0.976473,0,1,...,0,0,0,1,0,0,0,0,1,0
1,18.0,1.56,57.0,2.0,3.0,2.0,1.0,1.0,1,0,...,0,0,0,0,1,1,0,0,0,0
2,18.0,1.71146,50.165754,1.880534,1.411685,1.910378,0.866045,1.673584,1,0,...,0,0,0,0,1,0,0,0,1,0
3,20.952737,1.71073,131.274851,3.0,3.0,1.674061,1.467863,0.780199,1,0,...,0,0,0,1,0,0,0,0,1,0
4,31.641081,1.914186,93.798055,2.679664,1.971472,1.979848,1.967973,0.931721,0,1,...,0,0,0,1,0,0,0,0,1,0


In [361]:
# train_df = featuring_processing(train_df)
train_df = generate_poly_feature(train_df)
train_df = scaled_processing(train_df)
train_df.head(5)

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,Gender_Female,Gender_Male,...,MTRANS_Bike^2,MTRANS_Bike MTRANS_Motorbike,MTRANS_Bike MTRANS_Public_Transportation,MTRANS_Bike MTRANS_Walking,MTRANS_Motorbike^2,MTRANS_Motorbike MTRANS_Public_Transportation,MTRANS_Motorbike MTRANS_Walking,MTRANS_Public_Transportation^2,MTRANS_Public_Transportation MTRANS_Walking,MTRANS_Walking^2
0,0.096347,-0.005336,-0.23229,-0.831819,0.319839,1.210272,-1.172529,0.589741,-1.003025,1.003025,...,-0.041353,0.0,0.0,0.0,-0.046363,0.0,0.0,0.502119,0.0,-0.152918
1,-1.023162,-1.598693,-1.168077,-0.831819,0.343326,-0.044645,0.018358,0.628771,0.996984,-0.996984,...,-0.041353,0.0,0.0,0.0,-0.046363,0.0,0.0,-1.991558,0.0,-0.152918
2,-1.023162,0.125116,-1.427315,-1.055798,-1.890059,-0.191936,-0.141167,1.746217,0.996984,-0.996984,...,-0.041353,0.0,0.0,0.0,-0.046363,0.0,0.0,0.502119,0.0,-0.152918
3,-0.510107,0.116808,1.649336,1.043017,0.343326,-0.580319,0.57553,0.264131,0.996984,-0.996984,...,-0.041353,0.0,0.0,0.0,-0.046363,0.0,0.0,0.502119,0.0,-0.152918
4,1.347052,2.432398,0.227756,0.44244,-1.102923,-0.077764,1.171105,0.515499,-1.003025,1.003025,...,-0.041353,0.0,0.0,0.0,-0.046363,0.0,0.0,0.502119,0.0,-0.152918


In [362]:
X_train, X_test, y_train, y_test = train_test_split(train_df, train_target, test_size=0.2, random_state=42)

## 逻辑回归模型0.8725

In [363]:
# from sklearn.linear_model import LogisticRegression

# # 初始化逻辑回归模型
# logistic_regression = LogisticRegression(random_state=42)
# # 在训练数据上训练模型
# logistic_regression.fit(X_train, y_train)

# # 在测试数据上进行预测
# y_pred = logistic_regression.predict(X_test)
# print("Accuracy: ", accuracy_score(y_pred, y_test))
# # 0.87475915

## 支持向量机0.8798

In [364]:
# from sklearn.svm import SVC

# # 初始化支持向量机模型
# svm = SVC(kernel='linear', random_state=42)

# # 在训练数据上训练模型
# svm.fit(X_train, y_train)

# # 在测试数据上进行预测
# print("Accuracy: ", svm.score(X_test, y_test))

## 决策树0.8330

In [365]:
# from sklearn.tree import DecisionTreeClassifier

# # 初始化决策树分类器
# decision_tree = DecisionTreeClassifier(random_state=42)

# # 在训练数据上训练模型
# decision_tree.fit(X_train, y_train)

# # 在测试数据上进行预测
# print("Accuracy:", decision_tree.score(X_test, y_test))

## KNN分类器0.7567

In [366]:
# from sklearn.neighbors import KNeighborsClassifier

# # 初始化KNN分类器
# knn = KNeighborsClassifier()

# # 在训练数据上训练模型
# knn.fit(X_train, y_train)

# # 在测试数据上进行预测
# print("Accuracy:",  knn.score(X_test, y_test))

## 朴素贝叶斯0.7333

In [367]:
# from sklearn.naive_bayes import GaussianNB

# # 初始化朴素贝叶斯分类器
# naive_bayes = GaussianNB()

# # 在训练数据上训练模型
# naive_bayes.fit(X_train, y_train)

# # 在测试数据上进行预测
# print("Accuracy:",  naive_bayes.score(X_test, y_test))


## 极端随机树0.89041

In [368]:
# from sklearn.ensemble import ExtraTreesClassifier

# # 初始化Extra Trees分类器
# extra_trees = ExtraTreesClassifier(random_state=42)

# # 在训练数据上训练模型
# extra_trees.fit(X_train, y_train)

# # 在测试数据上进行预测
# print("Accuracy:",  extra_trees.score(X_test, y_test))


## 梯度提升树 0.897398  6min12s

In [369]:
# from sklearn.ensemble import GradientBoostingClassifier

# # 初始化梯度提升树分类器
# gradient_boosting = GradientBoostingClassifier(random_state=42)

# # 在训练数据上训练模型
# gradient_boosting.fit(X_train, y_train)

# # 在测试数据上进行预测
# print("Accuracy:",  gradient_boosting.score(X_test, y_test))


## 平衡随机森林0.897880

In [370]:
# from imblearn.ensemble import BalancedRandomForestClassifier

# # 初始化平衡随机森林分类器
# balanced_random_forest = BalancedRandomForestClassifier(random_state=42)

# # 在训练数据上训练模型
# balanced_random_forest.fit(X_train, y_train)

# # 在测试数据上进行预测
# print("Accuracy:",  balanced_random_forest.score(X_test, y_test))



## 随机森林0.8993

In [371]:
# rf_classifier = RandomForestClassifier(
#     n_estimators = 550,
#     # min_samples_leaf = 9,
#     random_state=42
# )
# rf_classifier.fit(X_train, y_train)


# # 5. 模型预测
# y_pred = rf_classifier.predict(X_test)
# # 6. 模型评估
# print("Accuracy:", rf_classifier.score(X_test, y_test))
# # 0.8952  # 没有特征融合：0.900529  # 0.8954
# # 0.8966  170特征  0.89643
# # 0.89547  {'min_samples_leaf': 9, 'n_estimators': 550}  0.895472

## 模型Voting融合

In [372]:
# from scipy.stats import mode

# voting_predictions = mode([y_pred, balanced_random_forest.predict(X_test)], axis=0)[0]

# # 最终的预测结果
# final_predictions = voting_predictions.ravel()
# print(final_predictions)

# print(accuracy_score(final_predictions, y_test))

In [373]:
# from xgboost import XGBClassifier

# xgb_2 = XGBClassifier(
#     learning_rate =0.01,
#     n_estimators=2000,
#     early_stopping_rounds=500,
#     max_depth=4,
#     min_child_weight=0.5,
#     gamma=0.2,
#     colsample_bytree=0.7,
#     subsample=0.7,
#     reg_alpha=1,
#     objective= 'multi:softmax', # softmax， softprob
#     nthread=-1,
#     random_state=42
# )
# xgb_2.fit(X_train, y_train, eval_set=[(X_test, y_test)])
# acc_train = accuracy_score(xgb_2.predict(X_train),y_train)
# acc_val = accuracy_score(xgb_2.predict(X_test),y_test)
# print(f"model: xgb_2\ntrain_accuracy: {acc_train:.4f}\ntest_accuracy: {acc_val:.4f}")
# # train_accuracy: 0.9893 test_accuracy: 0.9053  5000
# # train_accuracy: 0.9353 test_accuracy: 0.9082  2000

## XGboost 0.906310

In [374]:
from xgboost import XGBClassifier

# 初始化XGBoost分类器
xgb_classifier = XGBClassifier(objective='multi:softmax', num_class=7, random_state=42)

# 在训练数据上训练模型
xgb_classifier.fit(X_train, y_train)

# 在测试数据上进行预测
print("Test Accuracy:", xgb_classifier.score(X_test, y_test))
print("Train Accuracy:", xgb_classifier.score(X_train, y_train))
# 0.912672357189757  du + 01
# 0.908513898008317 + 通用特征

Test Accuracy: 0.9065441015539505
Train Accuracy: 0.9960604070912672


## LightGBM 0.903901

In [375]:
from lightgbm import LGBMClassifier

# Best parameters obtained from Optuna optimization process

best_params = {
    "objective": "multiclass",          # Objective function for the model
    "metric": "multi_logloss",          # Evaluation metric
    "verbosity": -1,                    # Verbosity level (-1 for silent)
    "boosting_type": "gbdt",            # Gradient boosting type
    "random_state": 42,       # Random state for reproducibility
    "num_class": 7,                     # Number of classes in the dataset
    'learning_rate': 0.030962211546832760,  # Learning rate for gradient boosting
    'n_estimators': 500,                # Number of boosting iterations
    'lambda_l1': 0.009667446568254372,  # L1 regularization term
    'lambda_l2': 0.04018641437301800,   # L2 regularization term
    'max_depth': 10,                    # Maximum depth of the trees
    'colsample_bytree': 0.40977129346872643,  # Fraction of features to consider for each tree
    'subsample': 0.9535797422450176,    # Fraction of samples to consider for each boosting iteration
    'min_child_samples': 26             # Minimum number of data needed in a leaf
}

# 初始化LightGBM分类器
# lgbm_classifier = LGBMClassifier(objective='multiclass', num_class=7, random_state=42)
lgbm_classifier = LGBMClassifier(**best_params)

# 在训练数据上训练模型
lgbm_classifier.fit(X_train, y_train)

# 在测试数据上进行预测
print("Test Accuracy:", lgbm_classifier.score(X_test, y_test))
# print("Train Accuracy:", lgbm_classifier.score(X_train, y_train))
# Test Accuracy: 0.912015758371635 Train Accuracy: 0.9818888159334647
# Test Accuracy: 0.9168308163711972 Train Accuracy: 0.9808492011381046
# 0.9131100897351718

Test Accuracy: 0.9139855548260013


In [179]:
# from scipy.stats import mode

# voting_predictions = mode([xgb_classifier.predict(X_test), lgbm_classifier.predict(X_test)], axis=0)[0]

# # 最终的预测结果
# final_predictions = voting_predictions.ravel()
# print(final_predictions)

# print(accuracy_score(final_predictions, y_test))

[4 5 4 ... 6 3 3]
0.912015758371635


In [208]:
# # 4. 模型训练
# # 使用网格搜索来调整模型参数
# param_grid = {
#     'n_estimators': [90, 100, 110],  # 尝试不同数量的树
# #     'max_depth': [None],  # 尝试不同深度的树
#     'min_samples_leaf': [9, 10],
# #     'min_samples_split': [2, 4],
# }  # {'max_depth': None, 'min_samples_leaf': 10, 'min_samples_split': 2, 'n_estimators': 200}

# rf_classifier = RandomForestClassifier(random_state=42)
# grid_search = GridSearchCV(rf_classifier, param_grid, cv=5, scoring='accuracy')
# grid_search.fit(X_train, y_train)
# print(grid_search.best_params_)
# best_rf_classifier = grid_search.best_estimator_

# # 5. 模型预测
# y_pred = best_rf_classifier.predict(X_test)

# # 6. 模型评估
# accuracy = accuracy_score(y_test, y_pred)
# print("Accuracy:", accuracy)  
# # {'min_samples_leaf': 9, 'n_estimators': 400} Accuracy: 0.894990366088632
# # {'min_samples_leaf': 9, 'n_estimators': 390} Accuracy: 0.894990366088632
# # {'min_samples_leaf': 9, 'n_estimators': 380} Accuracy: 0.8945086705202312
# {'min_samples_leaf': 9, 'n_estimators': 100} Accuracy: 0.8971579961464354

## 模型test

In [240]:
test_ids = test_df['id']
test_df = test_df.drop(columns=['id'])

# test_df = featuring_processing(test_df)
# test_df = generate_poly_feature(test_df)
# test_df = scaled_processing(test_df)
test_df.head(5)

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,Gender_Female,Gender_Male,...,SCC_yes,CALC_Always,CALC_Frequently,CALC_Sometimes,CALC_no,MTRANS_Automobile,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
0,26.899886,1.848294,120.644178,2.938616,3.0,2.825629,0.8554,0.0,False,True,...,False,False,False,True,False,False,False,False,True,False
1,21.0,1.6,66.0,2.0,1.0,3.0,1.0,0.0,True,False,...,False,False,False,True,False,False,False,False,True,False
2,26.0,1.643355,111.600553,3.0,3.0,2.621877,0.0,0.250502,True,False,...,False,False,False,True,False,False,False,False,True,False
3,20.979254,1.553127,103.669116,2.0,2.977909,2.786417,0.094851,0.0,False,True,...,False,False,False,True,False,False,False,False,True,False
4,26.0,1.627396,104.835346,3.0,3.0,2.653531,0.0,0.741069,True,False,...,False,False,False,True,False,False,False,False,True,False


In [210]:
# voting_predictions = mode([xgb_classifier.predict(test_df), lgbm_classifier.predict(test_df)], axis=0)[0]


In [211]:

# 最终的预测结果
# pred = voting_predictions.ravel()
# pred

array([3, 5, 4, ..., 0, 1, 3], dtype=int64)

In [241]:
pred = lgbm_classifier.predict(test_df)
pred

array(['Obesity_Type_II', 'Overweight_Level_I', 'Obesity_Type_III', ...,
       'Insufficient_Weight', 'Normal_Weight', 'Obesity_Type_II'],
      dtype=object)

In [138]:
mapping = {
    0: 'Insufficient_Weight',
    1: 'Normal_Weight',
    2: 'Obesity_Type_I',
    3: 'Obesity_Type_II',
    4: 'Obesity_Type_III',
    5: 'Overweight_Level_I',
    6: 'Overweight_Level_II'
}

# 使用列表推导式将数组中的数字映射为字符串
sub_target = np.array([mapping[num] for num in pred])

print(sub_target)

['Obesity_Type_II' 'Overweight_Level_I' 'Obesity_Type_III' ...
 'Insufficient_Weight' 'Normal_Weight' 'Obesity_Type_II']


In [242]:
sub = pd.DataFrame({'id': test_ids, 'NObeyesdad': pred})

sub.to_csv('data/s4e2/submission_lgbm.csv', index=False)

print(sub)

          id           NObeyesdad
0      20758      Obesity_Type_II
1      20759   Overweight_Level_I
2      20760     Obesity_Type_III
3      20761       Obesity_Type_I
4      20762     Obesity_Type_III
...      ...                  ...
13835  34593  Overweight_Level_II
13836  34594   Overweight_Level_I
13837  34595  Insufficient_Weight
13838  34596        Normal_Weight
13839  34597      Obesity_Type_II

[13840 rows x 2 columns]
