In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

In [None]:
# Load, in order: the training data (which carries a `kfold` column used below),
# the competition test set, and the submission template.
df, df_test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/df-train-folds/train_folds.csv",
        "../input/30-days-of-ml/test.csv",
        "../input/30-days-of-ml/sample_submission.csv",
    )
)

In [None]:
# Preview the first five rows of the training frame.
df.head(n=5)

In [None]:
# Model features are every training column except the id, the label and the fold marker.
useful_var = df.columns[~df.columns.isin(['id', 'target', 'kfold'])].tolist()
# Columns whose name contains 'cat' are the ones that get encoded later.
obj_cols = [name for name in useful_var if 'cat' in name]
# Restrict the test frame to the same feature columns, in the same order.
df_test = df_test[useful_var]

In [None]:
# Baseline experiment: ordinal-encode the categorical columns and train one
# XGBoost regressor per fold. Each model's test predictions are stored so they
# can be averaged afterwards; the per-fold validation RMSE is printed and kept.
final_predictions = []
scores = []
for fold in range(5):
    # Rows tagged with the current fold are held out; the rest are used for training.
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_var]
    xvalid = xvalid[useful_var]

    # Fit the encoder on the training rows only, then apply it to validation and test.
    ordinal_encoder = OrdinalEncoder()
    xtrain[obj_cols] = ordinal_encoder.fit_transform(xtrain[obj_cols])
    xvalid[obj_cols] = ordinal_encoder.transform(xvalid[obj_cols])
    xtest[obj_cols] = ordinal_encoder.transform(xtest[obj_cols])

    # NOTE(review): `gpu_hist` / `gpu_id` / `predictor` are deprecated in XGBoost >= 2.0
    # (replaced by device="cuda" with tree_method="hist") -- confirm against the installed version.
    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor')
    model.fit(xtrain, ytrain)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_predictions.append(test_preds)

    # RMSE as sqrt(MSE): the `squared=False` keyword of mean_squared_error was
    # deprecated in scikit-learn 1.4 and later removed, while this form works everywhere.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, rmse)
    scores.append(rmse)

# Mean and spread of the validation RMSE across folds, as in the other experiments.
print(np.mean(scores), np.std(scores))

In [None]:
# One column per fold model; the row-wise mean is the blended test prediction.
fold_matrix = np.column_stack(final_predictions)
preds = fold_matrix.mean(axis=1)


In [None]:
# standardization
#
# Same per-fold XGBoost pipeline as the baseline, with the continuous columns
# additionally standardized by a scaler fitted on the training rows of each fold.

useful_var = [var for var in df.columns if var not in ['id' , 'target' , 'kfold']]
obj_cols = [ c for c in useful_var if 'cat' in c]
numerical_cols = [col for col in useful_var if "cont" in col]
df_test = df_test[useful_var]

final_predictions = []
scores = []
for fold in range(5):
    # Rows tagged with the current fold are held out; the rest are used for training.
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_var]
    xvalid = xvalid[useful_var]

    # Encoder and scaler are both fitted on the training rows only and then
    # applied unchanged to the validation and test frames.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[obj_cols] = ordinal_encoder.fit_transform(xtrain[obj_cols])
    xvalid[obj_cols] = ordinal_encoder.transform(xvalid[obj_cols])
    xtest[obj_cols] = ordinal_encoder.transform(xtest[obj_cols])

    scaler = preprocessing.StandardScaler()
    xtrain[numerical_cols] = scaler.fit_transform(xtrain[numerical_cols])
    xvalid[numerical_cols] = scaler.transform(xvalid[numerical_cols])
    xtest[numerical_cols] = scaler.transform(xtest[numerical_cols])

    # NOTE(review): `gpu_hist` / `gpu_id` / `predictor` are deprecated in XGBoost >= 2.0
    # (replaced by device="cuda" with tree_method="hist") -- confirm against the installed version.
    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor="gpu_predictor")
    model.fit(xtrain, ytrain)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_predictions.append(test_preds)

    # RMSE as sqrt(MSE): the `squared=False` keyword of mean_squared_error was
    # deprecated in scikit-learn 1.4 and later removed, while this form works everywhere.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, rmse)
    scores.append(rmse)

# Mean and spread of the validation RMSE across the five folds.
print(np.mean(scores), np.std(scores))

In [None]:
# polynomial features
#
# Reloads the raw data, appends interaction terms (up to degree 3) of the
# continuous columns as `poly_*` features, then runs the per-fold
# ordinal-encoding + XGBoost pipeline on the widened frames.

df = pd.read_csv("../input/df-train-folds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

useful_var = [var for var in df.columns if var not in ['id' , 'target' , 'kfold']]
obj_cols = [ c for c in useful_var if 'cat' in c]
numerical_cols = [col for col in useful_var if "cont" in col]
df_test = df_test[useful_var]

poly = preprocessing.PolynomialFeatures(degree=3, interaction_only=True, include_bias=False)
train_poly = poly.fit_transform(df[numerical_cols])
# Reuse the transformer fitted on the training data instead of refitting it on the test set.
test_poly = poly.transform(df_test[numerical_cols])

df_poly = pd.DataFrame(train_poly, columns=[f"poly_{i}" for i in range(train_poly.shape[1])])
df_test_poly = pd.DataFrame(test_poly, columns=[f"poly_{i}" for i in range(test_poly.shape[1])])

# Column-wise concat aligns on the index; both sides carry the default index from read_csv / DataFrame().
df = pd.concat([df, df_poly], axis=1)
df_test = pd.concat([df_test, df_test_poly], axis=1)

# Recompute the feature lists so the new `poly_*` columns are part of `useful_var`.
useful_var = [var for var in df.columns if var not in ['id' , 'target' , 'kfold']]
obj_cols = [ c for c in useful_var if 'cat' in c]
numerical_cols = [col for col in useful_var if "cont" in col]
df_test = df_test[useful_var]


final_predictions = []
scores = []
for fold in range(5):
    # Rows tagged with the current fold are held out; the rest are used for training.
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_var]
    xvalid = xvalid[useful_var]

    # Fit the encoder on the training rows only, then apply it to validation and test.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[obj_cols] = ordinal_encoder.fit_transform(xtrain[obj_cols])
    xvalid[obj_cols] = ordinal_encoder.transform(xvalid[obj_cols])
    xtest[obj_cols] = ordinal_encoder.transform(xtest[obj_cols])

    # NOTE(review): `gpu_hist` / `gpu_id` / `predictor` are deprecated in XGBoost >= 2.0
    # (replaced by device="cuda" with tree_method="hist") -- confirm against the installed version.
    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor="gpu_predictor")
    model.fit(xtrain, ytrain)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_predictions.append(test_preds)

    # RMSE as sqrt(MSE): the `squared=False` keyword of mean_squared_error was
    # deprecated in scikit-learn 1.4 and later removed, while this form works everywhere.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, rmse)
    scores.append(rmse)

# Mean and spread of the validation RMSE across the five folds.
print(np.mean(scores), np.std(scores))

In [None]:
# One-Hot Encoding
#
# Reloads the raw data and, per fold, replaces the categorical columns with
# one-hot indicator columns (`ohe_*`) before training the XGBoost regressor.

df = pd.read_csv("../input/df-train-folds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

useful_var = [var for var in df.columns if var not in ['id' , 'target' , 'kfold']]
obj_cols = [ c for c in useful_var if 'cat' in c]
numerical_cols = [col for col in useful_var if "cont" in col]
df_test = df_test[useful_var]

final_predictions = []
scores = []
for fold in range(5):
    # Rows tagged with the current fold are held out; the rest are used for training.
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_var]
    xvalid = xvalid[useful_var]

    # The dense-output keyword was renamed (`sparse` -> `sparse_output`) across
    # scikit-learn releases, so keep the default sparse output and densify with
    # `.toarray()`, which behaves the same regardless of the installed version.
    # Categories unseen during fit are encoded as all-zero rows.
    ohe = preprocessing.OneHotEncoder(handle_unknown="ignore")
    xtrain_ohe = ohe.fit_transform(xtrain[obj_cols]).toarray()
    xvalid_ohe = ohe.transform(xvalid[obj_cols]).toarray()
    xtest_ohe = ohe.transform(xtest[obj_cols]).toarray()

    # All three matrices come from the same fitted encoder, so they share one column layout.
    ohe_names = [f"ohe_{i}" for i in range(xtrain_ohe.shape[1])]
    xtrain_ohe = pd.DataFrame(xtrain_ohe, columns=ohe_names)
    xvalid_ohe = pd.DataFrame(xvalid_ohe, columns=ohe_names)
    xtest_ohe = pd.DataFrame(xtest_ohe, columns=ohe_names)

    xtrain = pd.concat([xtrain, xtrain_ohe], axis=1)
    xvalid = pd.concat([xvalid, xvalid_ohe], axis=1)
    xtest = pd.concat([xtest, xtest_ohe], axis=1)

    # this part is missing in the video:
    # drop the raw categorical columns now that their one-hot versions are attached
    xtrain = xtrain.drop(obj_cols, axis=1)
    xvalid = xvalid.drop(obj_cols, axis=1)
    xtest = xtest.drop(obj_cols, axis=1)
    # missing part ends

    # NOTE(review): `gpu_hist` / `gpu_id` / `predictor` are deprecated in XGBoost >= 2.0
    # (replaced by device="cuda" with tree_method="hist") -- confirm against the installed version.
    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor="gpu_predictor")
    model.fit(xtrain, ytrain)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_predictions.append(test_preds)

    # RMSE as sqrt(MSE): the `squared=False` keyword of mean_squared_error was
    # deprecated in scikit-learn 1.4 and later removed, while this form works everywhere.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, rmse)
    scores.append(rmse)

# Mean and spread of the validation RMSE across the five folds.
print(np.mean(scores), np.std(scores))


In [None]:
# Blend the fold models: one column per fold, averaged row-wise. `final_predictions`
# holds whatever the most recently executed experiment cell produced.
preds = np.mean(np.column_stack(final_predictions), axis=1)

# Item assignment always writes the column; attribute assignment only works when
# a `target` column already exists and otherwise just sets a Python attribute.
sample_submission["target"] = preds
sample_submission.to_csv("submission.csv", index=False)