# In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier

# Load the competition's training and test tables from the Kaggle input mount.
train = pd.read_csv("/kaggle/input/ai-201-b-mse-2-ai-c/train.csv")
test = pd.read_csv("/kaggle/input/ai-201-b-mse-2-ai-c/test.csv")

# Quick data inspection. A bare expression such as `train.head(10)` is only
# displayed when it is the last expression of a notebook cell; anywhere else
# its value is computed and thrown away, so the results are printed
# explicitly. `DataFrame.info()` writes its own report and needs no print.
train.info()
print(train.head(10))
print(train.isnull().sum())
print(test.isnull().sum())
# Keep the test ids for the submission file, then remove them from the test
# feature table so they are not fed to the model.
test_id = test['id']
test = test.drop(columns=['id'])

# Separate the target from the features. The row identifier is removed from
# the training features as well (when the training file has one): it was just
# dropped from `test`, so leaving it in `X` would register it as a numeric
# feature that the test frame no longer provides at prediction time, and a
# row id is not a meaningful predictor anyway.
y = train['NObeyesdad']
X = train.drop(columns=['NObeyesdad']).drop(columns=['id'], errors='ignore')

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Column groups routed to the numeric and categorical preprocessing branches.
numeric_feature = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns
# Numeric branch: fill missing values with the column mean, then standardize.
numeric_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. The encoding step keeps the step name 'scaler';
# handle_unknown='ignore' lets transform accept categories not seen in fit.
categorical_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('scaler', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch.
preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, numeric_feature),
    ('cat', categorical_pipeline, categorical_features),
])

# Gradient-boosted tree classifier with a fixed seed for repeatable results.
model = GradientBoostingClassifier(
    random_state=42,
    # Ensemble size and shrinkage: many boosting stages with a small step each.
    n_estimators=920,
    learning_rate=0.025,
    # Shape constraints on the individual trees.
    max_depth=4,
    min_samples_split=5,
    min_samples_leaf=2,
    # Randomization: each tree is fit on 70% of the samples and each split
    # looks at a sqrt-sized subset of the features.
    subsample=0.7,
    max_features='sqrt',
)

# Chain preprocessing and the classifier so both are fitted and applied
# together through a single estimator.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])

# Fit the preprocessing steps and the classifier on the training split only.
pipeline.fit(X_train, y_train)

# Score on the held-out split. accuracy_score takes (y_true, y_pred), so the
# ground truth is passed first.
y_pred = pipeline.predict(X_test)
accu = accuracy_score(y_test, y_pred)
print(accu)

# Predict labels for the competition test set and show both sets of
# predictions (hold-out first, then test) for a quick visual sanity check.
y_final = pipeline.predict(test)
print(y_pred)
print(y_final)

# Write the submission: one row per test id with its predicted class.
submission = pd.DataFrame({
    'id': test_id,
    'NObeyesdad': y_final,
})
submission.to_csv('submission.csv', index=False)