In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import classification_report


In [None]:
# Read the training set from the working directory and preview its first rows.
train_data = pd.read_csv(filepath_or_buffer='train.csv')
train_data.head()


### **The data is clean, so no further cleaning is needed**

### **Let's add some features**

In [None]:
def future_adder(data, height_in_cm=True):
    """Return a copy of ``data`` extended with engineered health features.

    The caller's frame is never modified; every new column is written to a
    copy, so the function can be re-run safely on the same input.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns read below: ``Weight``, ``Height``, ``Age``,
        ``Gender`` ('Male'/'Female'), ``FAF``, ``TUE``, ``CH2O``, ``FCVC``,
        ``SMOKE``/``SCC``/``FAVC`` ('yes'/'no'), ``CALC``
        ('no'/'Sometimes'/'Frequently'/'Always') and ``MTRANS``.
        Labels missing from the mapping dicts below become NaN via ``.map``.
    height_in_cm : bool, default True
        ``True`` keeps the historical behaviour (``Height`` is used as
        centimetres). Pass ``False`` when ``Height`` is stored in metres; it is
        then converted to centimetres before any formula is applied.
        NOTE(review): confirm the unit of ``Height`` in train.csv -- with
        metre-valued heights and the default, BMI is inflated by 100**2.

    Returns
    -------
    pandas.DataFrame
        New frame with ``BMI``, one-hot ``BMI_cat_*`` columns and the derived
        score/ratio columns appended.
    """
    # Work on a copy so the caller's DataFrame is not mutated as a side effect.
    data = data.copy()

    # Every formula below consumes height in centimetres.
    height_cm = data['Height'] if height_in_cm else data['Height'] * 100
    is_male = data['Gender'] == 'Male'

    data['BMI'] = data['Weight'] / (height_cm / 100) ** 2
    bmi_bucket = pd.cut(data['BMI'], bins=[0, 18.5, 25, 30, np.inf],
                        labels=['Underweight', 'Normal weight', 'Overweight', 'Obese'])
    # factorize() numbers the buckets by order of first appearance, so the
    # meaning of BMI_cat_<k> depends on the row order of the input.
    data['BMI_category'], _ = bmi_bucket.factorize()

    data = pd.get_dummies(data, columns=['BMI_category'], prefix='BMI_cat')

    # Only this formula was ever kept: an earlier np.where-based estimate was
    # overwritten unconditionally, so it is no longer computed. The column is
    # still created at this position to keep the original column order.
    gender_map = {'Male':2,'Female':1}
    data['body_fat_percentage'] = 0.8 * data['BMI'] + 1.4 * data['Age'] - 10.4 * data['Gender'].map(gender_map) - 9

    # Gender-specific estimate; rows whose Gender is not 'Male' take the second
    # branch. The "male" expression is also stored, unchanged, as BMR below.
    male_estimate = 88.362 + (13.397 * data['Weight']) + (4.799 * height_cm) - (5.677 * data['Age'])
    female_estimate = 447.593 + (9.247 * data['Weight']) + (3.098 * height_cm) - (4.330 * data['Age'])
    data['caloric_intake'] = np.where(is_male, male_estimate, female_estimate)

    # NOTE(review): BMR applies the male coefficients to every row -- confirm intended.
    data['BMR'] = male_estimate
    data['RMR'] = 10 * data['Weight'] + 6.25 * height_cm - 5 * data['Age'] + 5

    data['TDEE'] = data['BMR'] * 1.2
    data['Metabolic Rate'] = data['BMR'] + data['TDEE']

    bins = [0, 18, 30, 40, 50, np.inf]
    labels = ['0-18', '19-30', '31-40', '41-50', '50+']
    data['Age_Group'] = pd.cut(data['Age'], bins=bins, labels=labels)

    data['Physical_Activity_Level'] = data['FAF'] * data['TUE']
    data['water_intake_ratio'] = data['CH2O'] / data['Weight']

    yes_no = {'yes':1,'no':0}
    data['Overall_Health_Score'] = data['SMOKE'].map(yes_no) + data['SCC'].map(yes_no) + data['CH2O']

    calc = {'no':0,'Sometimes':1,'Frequently':2,'Always':3}
    data['Dietary_Habits_Score'] = data['FAVC'].map(yes_no) + data['FCVC'].round(0) - data['CALC'].map(calc)

    transportation_mapping = {'Public_Transportation': 1, 'Walking': 2, 'Automobile': 3,'Motorbike':4,
       'Bike':5}
    data['Transportation_Score'] = data['MTRANS'].map(transportation_mapping) * data['FAF']

    data['Composite_Health_Score'] = (data['Overall_Health_Score'] + data['Dietary_Habits_Score'] + data['Transportation_Score']) / 3
    data['Age_Physical_Activity_Interaction'] = data['Age'] * data['Physical_Activity_Level']
    data['caloric_intake_per_kg'] = data['caloric_intake'] / data['Weight']

    # NOTE(review): body_fat_percentage is multiplied in as-is (not divided by
    # 100), so LBM is strongly negative for typical values -- confirm intended.
    data['LBM'] = data['Weight'] - (data['body_fat_percentage'] * data['Weight'])

    return data


In [None]:
# Despite its name, test_data_ holds the *training* frame plus the engineered columns.
test_data_ = future_adder(data=train_data)


In [None]:
# Inspect column dtypes to see which columns still need encoding or casting.
test_data_.dtypes

In [None]:
# Cast every BMI one-hot column to float64 (previously only BMI_cat_0 was
# converted, leaving any other BMI_cat_* column with its get_dummies dtype).
bmi_dummy_columns = [col for col in test_data_.columns if str(col).startswith('BMI_cat_')]
test_data_ = test_data_.astype({col: np.float64 for col in bmi_dummy_columns})


In [None]:
# Integer-encode every object/category column in place (this includes the
# target 'NObeyesdad' if it is stored as text) and remember which columns were
# encoded so they can be excluded from scaling later.
catagorical_futures = []
# Column name -> original labels; position k in the index is the label that
# was encoded as integer k. Kept so predictions can be decoded afterwards.
category_levels = {}
for column in test_data_.select_dtypes(['object','category']):
    catagorical_futures.append(column)
    # factorize() assigns codes by order of first appearance (NaN becomes -1).
    test_data_[column], levels = test_data_[column].factorize()
    category_levels[column] = levels

In [None]:
# Separate the encoded target from the feature matrix without touching test_data_.
y = test_data_['NObeyesdad'].copy()
X = test_data_.drop(columns='NObeyesdad')

In [None]:
column1 = X.columns
scaler = StandardScaler()
list = []

for values in column1:
    if values not in catagorical_futures:
        list.append(values)

In [None]:
for x in list:
    X[x] = scaler.fit_transform(X[[x]])

In [None]:
# Pairwise correlations of all (encoded) columns, then each column's
# correlation with the encoded target, strongest positive first.
# NOTE(review): the target codes come from factorize(), so their numeric order
# may not be meaningful for a linear correlation.
corr_matrix = test_data_.corr()
values = corr_matrix.loc[:, 'NObeyesdad'].sort_values(ascending=False)

In [None]:
# Flag the factorized (integer-coded) columns as discrete for the MI estimate.
# The previous `X.dtypes == object` test ran after every object/category
# column had already been factorized, so it marked no column as discrete.
discrete_features = pd.Series(X.columns.isin(catagorical_futures), index=X.columns)


In [None]:
from sklearn.feature_selection import mutual_info_classif
def make_mi_scores(X, y, discrete_features):
    """Score each column of ``X`` by mutual information with ``y``.

    ``discrete_features`` is forwarded unchanged to ``mutual_info_classif``.
    Returns a Series named "MI Scores", indexed by column name and sorted from
    highest to lowest score.
    """
    raw_scores = mutual_info_classif(X, y, discrete_features=discrete_features)
    ranked = pd.Series(raw_scores, index=X.columns, name="MI Scores")
    return ranked.sort_values(ascending=False)

mi_scores = make_mi_scores(X=X, y=y, discrete_features=discrete_features)
# Display every third entry of the ranked scores as a quick overview.
mi_scores.iloc[::3]

In [None]:
# Hold out 10% of the rows for evaluation; the seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [None]:
from imblearn.over_sampling import RandomOverSampler
# Balance the training classes by randomly duplicating minority-class rows.
# Only the training split is resampled; X_test / Y_test stay untouched.
ros = RandomOverSampler(random_state=0)
x_resmapled, y_resampled = ros.fit_resample(X_train, Y_train)


In [None]:
# Sanity check: feature matrix and label vector must have the same number of rows.
x_resmapled.shape,y_resampled.shape

In [None]:
# Print the number of resampled training rows for each encoded class 0..6.
for class_id in range(7):
    n_rows = (y_resampled == class_id).sum()
    print(f'class{class_id}: {n_rows}')

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
# Feed-forward classifier: five ReLU hidden layers, 20% dropout after the
# third and fifth, and a 7-unit softmax output. Labels are integer class ids,
# hence the sparse categorical cross-entropy loss.
model = tf.keras.Sequential([
    Dense(64, input_dim=x_resmapled.shape[1], activation='relu'),
    Dense(64, activation='relu'),
    Dense(72, activation='relu'),
    Dropout(0.2),
    Dense(128, activation='relu'),
    Dense(24, activation='relu'),
    Dropout(0.2),
    Dense(7, activation='softmax'),
])
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)



In [None]:
# Train the network on the oversampled data, holding back 20% for validation.
# NOTE(review): the validation rows are carved out of the oversampled set, so
# they may contain duplicates of training rows -- validation accuracy can be
# optimistic.
model.fit(
    x_resmapled,
    y_resampled,
    batch_size=50,
    epochs=50,
    validation_split=0.2,
    verbose=1,
)


In [None]:
# The network outputs one probability per class; keep the most likely class id.
class_probabilities = model.predict(X_test)
pred = np.argmax(class_probabilities, axis=1)

In [None]:

# Per-class precision / recall / F1 of the predictions held in `pred` on the test split.
print(classification_report(Y_test,pred))

In [None]:
import lightgbm as lgb

# The number of boosting rounds is set through `n_estimators`. The previous
# `iterations=300` keyword is not a LightGBM parameter (nor one of its
# aliases), so the model silently trained with the default number of trees.
model2 = lgb.LGBMClassifier(learning_rate=0.1,n_estimators=300)

# Train the model
model2.fit(x_resmapled,y_resampled)

In [None]:
# Evaluate the LightGBM model on the held-out split.
pred = model2.predict(X_test)
print(classification_report(y_true=Y_test, y_pred=pred))

In [None]:
# Predictions for every row of X (training and test rows alike).
# NOTE(review): `prediction` is not read anywhere else in the visible notebook.
prediction = model2.predict(X)


In [None]:
feature_importances = model2.feature_importances_

# List each training column next to the importance LightGBM assigned to it.
for feature_name, importance in zip(X_train.columns, feature_importances):
    print(f'{feature_name}:  {importance}')

In [None]:
from lightgbm import LGBMClassifier

# Baseline: default LightGBM on all features.
lgbm_model = LGBMClassifier()
lgbm_model.fit(x_resmapled,y_resampled)
feature_importances = lgbm_model.feature_importances_

pred = lgbm_model.predict(X_test)
print(classification_report(Y_test,pred))

# Keep only the features whose importance reaches the cut-off.
threshold = 400
selected_features = []
for feature, importance in zip(X_train.columns, feature_importances):
    if importance >= threshold:
        selected_features.append(feature)

X_train_selected = x_resmapled[selected_features]
X_test_selected = X_test[selected_features]

# Retrain on the reduced feature set and compare against the baseline report.
lgbm_model_selected = LGBMClassifier()
lgbm_model_selected.fit(X_train_selected, y_resampled)

pred2 = lgbm_model_selected.predict(X_test_selected)
print(classification_report(Y_test,pred2))

In [None]:
# Re-evaluates model2 on the test split (same statements as the earlier
# model2 evaluation cell); it also resets `pred` to model2's predictions.
pred = model2.predict(X_test)
print(classification_report(Y_test,pred))

In [None]:
!pip install catboost

In [None]:
from catboost import CatBoostClassifier

# CatBoost with 2000 boosting iterations, trained on the oversampled data.
model3 = CatBoostClassifier(iterations=2000)

model3.fit(x_resmapled,y_resampled)


In [None]:
# Evaluate the CatBoost model on the held-out split.
pred1 = model3.predict(X_test)
print(classification_report(y_true=Y_test, y_pred=pred1))

In [None]:
import xgboost as xgb


model = xgb.XGBClassifier()
model.fit(x_resmapled,y_resampled)



In [None]:
y_pred = model.predict(X_test)
print(classification_report(Y_test, y_pred))

In [None]:
from sklearn.svm import SVC

# Linear-kernel SVM. Unlike the other models in this notebook it is fitted on
# the original training split (X_train / Y_train), not the oversampled one.
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train, Y_train)

svm_predictions = svm_classifier.predict(X_test)


In [None]:
# Per-class metrics of the linear SVM on the test split.
print(classification_report(Y_test,svm_predictions))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Initialize and train the Random Forest classifier
# Random forest with 500 trees, fitted on the oversampled training data.
rf_classifier = RandomForestClassifier(n_estimators=500)
rf_classifier.fit(x_resmapled, y_resampled)

# Class predictions for the held-out rows.
rf_predictions = rf_classifier.predict(X_test)

In [None]:
# Per-class metrics of the random forest on the test split.
print(classification_report(Y_test,rf_predictions))

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with 200 stages, fitted on the oversampled training data.
gb_classifier = GradientBoostingClassifier(n_estimators=200)
gb_classifier.fit(x_resmapled, y_resampled)




In [None]:
# Evaluate the gradient-boosting model on the held-out split.
gb_predictions = gb_classifier.predict(X_test)
print(classification_report(y_true=Y_test, y_pred=gb_predictions))

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

def model_parmetr_geter(model,x_train,y_train,param_distributions=None):
    """Run a randomized hyper-parameter search for ``model``.

    The search uses 10 sampled settings, 5-fold cross-validation and a fixed
    seed (42).

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible estimator used as the search base.
    x_train, y_train
        Training features and labels passed to ``RandomizedSearchCV.fit``.
    param_distributions : dict, optional
        Search space mapping parameter names to candidate lists or
        distributions. When omitted, the built-in candidate space below is
        narrowed to the parameters that ``model.get_params()`` exposes.

    Returns
    -------
    tuple
        ``(best_params, best_score)`` taken from the fitted search.

    Raises
    ------
    ValueError
        If no search space is supplied and none of the built-in candidates
        applies to ``model``.
    """
    if param_distributions is None:
        # The previous body defined a dict of fixed XGBoost settings under the
        # name `params` but passed the undefined `param_grid` to the search
        # (NameError). Scalars cannot be searched over either, so the space is
        # expressed as candidate lists around those values. The binary
        # objective / eval metric and early stopping were dropped: the task is
        # multi-class and the search supplies no evaluation set.
        candidate_space = {
            # boosted-tree / forest style estimators
            'learning_rate': [0.01, 0.05, 0.1, 0.3],
            'max_depth': [3, 5, 7],
            'min_child_weight': [1, 3, 5],
            'subsample': [0.6, 0.8, 1.0],
            'colsample_bytree': [0.6, 0.8, 1.0],
            'reg_lambda': [0, 1, 10],
            'reg_alpha': [0, 0.1, 1],
            'n_estimators': [100, 300, 1000],
            # kernel SVMs
            'C': [0.1, 1, 10, 100],
            'gamma': ['scale', 'auto'],
            'kernel': ['rbf', 'linear'],
        }
        supported = model.get_params()
        param_distributions = {
            name: candidates
            for name, candidates in candidate_space.items()
            if name in supported
        }
        if not param_distributions:
            raise ValueError(
                f"No default search space applies to {type(model).__name__}; "
                "pass param_distributions explicitly."
            )

    # Define the random search
    random_search = RandomizedSearchCV(estimator=model, param_distributions=param_distributions, n_iter=10, cv=5, random_state=42)

    # Perform the random search
    random_search.fit(x_train, y_train)

    # Get the best parameters and best score
    return random_search.best_params_, random_search.best_score_


In [None]:
# Tune an SVC on the oversampled training data and report the winning setting.
best_params, best_score = model_parmetr_geter(
    model=SVC(),
    x_train=x_resmapled,
    y_train=y_resampled,
)

print("Best Parameters:", best_params)
print("Best Score:", best_score)


In [None]:
# Default SVC fitted on the oversampled data, then scored on the test split.
m = SVC()
m.fit(x_resmapled, y_resampled)

valu = m.predict(X_test)
print(classification_report(y_true=Y_test, y_pred=valu))

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees ensemble with 200 trees, fitted on the oversampled training data.
model4 = ExtraTreesClassifier(n_estimators=200)
model4.fit(x_resmapled, y_resampled)


In [None]:
# Evaluate the extra-trees model on the held-out split.
predictions = model4.predict(X_test)
print(classification_report(y_true=Y_test, y_pred=predictions))

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier


# Define base classifiers
clf1 =  lgb.LGBMClassifier()
clf2 = xgb.XGBClassifier()
clf4 = GradientBoostingClassifier(random_state=42)
clf3 = ExtraTreesClassifier(n_estimators=200)


# Create a voting classifier with the individual classifiers.
# Each label now names the estimator it is attached to; the previous labels
# ('rf', 'gb', 'lr') pointed at the wrong model types.
voting_clf = VotingClassifier(estimators=[
    ('lgbm', clf1),
    ('xgb', clf2),
    ('gb', clf4),
    ('et', clf3)
], voting='hard')  # 'hard' = majority vote over the predicted class labels

voting_clf = voting_clf.fit(x_resmapled,y_resampled)


In [None]:
# Evaluate the voting ensemble on the held-out split.
new = voting_clf.predict(X_test)
print(classification_report(y_true=Y_test, y_pred=new))

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression


# Level-0 models. Each label names the estimator it is attached to; the
# previous labels ('rf', 'lr') pointed at the wrong model types.
base_models = [
    ('lgbm', LGBMClassifier()),  # LightGBM
    ('xgb', xgb.XGBClassifier()),  # XGBoost
    ('gb', GradientBoostingClassifier(random_state=42)),  # Gradient Boosting
    ('et', ExtraTreesClassifier(n_estimators=200))  # Extra Trees

]

# A LightGBM meta-learner combines the base models' predictions.
final_estimator = LGBMClassifier()
stacking_model = StackingClassifier(estimators=base_models, final_estimator=final_estimator)

stacking_model.fit(x_resmapled,y_resampled)


In [None]:
# Evaluate the stacked ensemble on the held-out split.
y_pred = stacking_model.predict(X_test)
print(classification_report(y_true=Y_test, y_pred=y_pred))