In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide display settings.
# The pandas option is spelled out in full: option names are resolved by regex,
# and the abbreviation "max_r" matches more than one key on recent pandas
# versions (e.g. "display.max_rows" and "styler.render.max_rows"), which makes
# set_option raise OptionError instead of applying the setting.
pd.set_option("display.max_rows", 20)
# Print NumPy floats with 5 decimals and without scientific notation.
np.set_printoptions(precision=5, suppress=True)

In [None]:
# Read the raw test-score table from the input directory and show it.
df = pd.read_csv(
    filepath_or_buffer='../input/predict-test-scores-of-students/test_scores.csv'
)
df

In [None]:
# Print a concise summary of the frame: column names, non-null counts and dtypes.
df.info()

In [None]:
# Show descriptive statistics (count, mean, std, quantiles, ...) for the frame.
df.describe()

In [None]:
# Feature matrix: every column of the raw frame except the target.
X = df.drop(columns='posttest')
# Regression target: the 'posttest' column.
y = df['posttest']

In [None]:
# Display the feature matrix (the frame without the 'posttest' column).
X

In [None]:
# Display the target series (the 'posttest' column).
y

In [None]:
# Partition the feature columns by dtype: int64/float64 columns are treated as
# numeric, object-dtype columns as categorical.
num_cols = [name for name, dtype in X.dtypes.items() if dtype in ('int64', 'float64')]
cat_cols = [name for name, dtype in X.dtypes.items() if dtype == 'object']

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Column-wise preprocessing applied in front of the regressor.
# ColumnTransformer drops every column not assigned to a transformer
# (remainder='drop' is the default), so with only the one-hot step the numeric
# features collected in `num_cols` never reached the model. They are now
# forwarded unchanged alongside the encoded categoricals.
preprocessor = ColumnTransformer(
    transformers=[
        # One-hot encode categorical columns; categories not seen during fit
        # are encoded as all zeros instead of raising.
        ('ohe', OneHotEncoder(handle_unknown='ignore'), cat_cols),
        # Keep numeric columns as they are.
        ('num', 'passthrough', num_cols),
    ]
)

In [None]:
from xgboost import XGBRegressor

# Baseline pipeline: the shared preprocessing step followed by an XGBoost
# regressor with default hyperparameters.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('model', XGBRegressor()),
])

# List the parameter names the pipeline exposes; nested ones use the
# `<step>__<param>` form that a parameter grid has to address.
model.get_params().keys()

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for the pipeline. Keys are prefixed with `model__` so they
# are routed to the pipeline step named 'model'.
params = {
    'model__n_estimators': list(range(100, 501, 100)),
    'model__learning_rate': [0.01, 0.05, 0.1],
    'model__n_jobs': [4],
    'model__random_state': [0],
}

# The grid search itself is left disabled; uncomment the lines below to run a
# 5-fold search scored by (negated) mean absolute error.
# gs = GridSearchCV(model,param_grid=params,cv=5,scoring='neg_mean_absolute_error')
# gs.fit(X,y)
# gs.best_params_

In [None]:
from sklearn.model_selection import cross_val_score

# Pipeline that is actually evaluated: the shared preprocessing step plus an
# XGBoost regressor with fixed hyperparameters and a fixed seed.
clf = Pipeline([
    ('preprocessor', preprocessor),
    (
        'model',
        XGBRegressor(n_estimators=100, learning_rate=0.1, n_jobs=4, random_state=0),
    ),
])

# The 'neg_mean_absolute_error' scorer returns MAE with its sign flipped, so
# negate the per-fold scores to get positive errors, then average the 5 folds.
score = -cross_val_score(clf, X, y, cv=5, scoring='neg_mean_absolute_error')
score.mean()