In [None]:
#Importing useful Libraries 
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import preprocessing
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
# Notebook-wide accumulators: every experiment cell appends the mean and the
# standard deviation of its per-fold RMSE to these two lists.
final_scores_mean, final_scores_std = [], []

XGBoost and Ordinal Encoding

In [None]:
# Baseline experiment: ordinal-encode the categorical columns and train one
# XGBoost regressor per fold, scoring each model on its held-out fold.
data_train = pd.read_csv("../input/30days-folds/train_folds.csv")
data_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

# Everything except the row id, the label and the fold marker is a model input.
useful_features = [c for c in data_train.columns if c not in ('id', 'target', 'kfold')]
obj_cols = [col for col in useful_features if 'cat' in col]

final_predictions = []
scores = []
for fold in range(5):
    train_rows = data_train[data_train.kfold != fold].reset_index(drop=True)
    valid_rows = data_train[data_train.kfold == fold].reset_index(drop=True)

    y_train = train_rows.target
    y_valid = valid_rows.target

    # Explicit copies: the encoded columns are written back into these frames,
    # and assigning into a plain column slice is chained assignment
    # (SettingWithCopyWarning). X_test is rebuilt from the raw test data each
    # fold so no fold sees a previous fold's encoding.
    X_train = train_rows[useful_features].copy()
    X_valid = valid_rows[useful_features].copy()
    X_test = data_test[useful_features].copy()

    # The encoder is fitted on the training folds only and reused as-is for
    # the validation fold and the test set.
    ordinal_encoder = OrdinalEncoder()
    X_train[obj_cols] = ordinal_encoder.fit_transform(X_train[obj_cols])
    X_valid[obj_cols] = ordinal_encoder.transform(X_valid[obj_cols])
    X_test[obj_cols] = ordinal_encoder.transform(X_test[obj_cols])

    # NOTE(review): gpu_hist / gpu_id / predictor look like the pre-2.0 XGBoost
    # GPU flags - confirm they are still accepted by the installed version.
    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor')
    model.fit(X_train, y_train)
    predictions_valid = model.predict(X_valid)
    test_predictions = model.predict(X_test)
    final_predictions.append(test_predictions)

    # RMSE as sqrt(MSE): the `squared=False` shortcut of mean_squared_error was
    # deprecated and later removed from scikit-learn, this form works everywhere.
    rmse = np.sqrt(mean_squared_error(y_valid, predictions_valid))
    scores.append(rmse)
    print("RMSE :", fold, rmse)

print("Final Scores")
final_scores_mean.append(np.mean(scores))
final_scores_std.append(np.std(scores))
print(np.mean(scores), np.std(scores))

Standardization (Using Scaler Method)

In [None]:
#Standardization
# Ordinal-encode the categorical columns, standardise the continuous columns
# with StandardScaler, then train one XGBoost regressor per fold.
data_train = pd.read_csv("../input/30days-folds/train_folds.csv")
data_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

# Everything except the row id, the label and the fold marker is a model input.
useful_features = [c for c in data_train.columns if c not in ('id', 'target', 'kfold')]
obj_cols = [col for col in useful_features if 'cat' in col]
numerical_cols = [col for col in useful_features if 'cont' in col]

final_predictions = []
scores = []
for fold in range(5):
    train_rows = data_train[data_train.kfold != fold].reset_index(drop=True)
    valid_rows = data_train[data_train.kfold == fold].reset_index(drop=True)

    y_train = train_rows.target
    y_valid = valid_rows.target

    # Explicit copies: transformed columns are written back into these frames,
    # and assigning into a plain column slice is chained assignment
    # (SettingWithCopyWarning). X_test starts from the raw test data each fold.
    X_train = train_rows[useful_features].copy()
    X_valid = valid_rows[useful_features].copy()
    X_test = data_test[useful_features].copy()

    # Both transformers are fitted on the training folds only.
    ordinal_encoder = OrdinalEncoder()
    X_train[obj_cols] = ordinal_encoder.fit_transform(X_train[obj_cols])
    X_valid[obj_cols] = ordinal_encoder.transform(X_valid[obj_cols])
    X_test[obj_cols] = ordinal_encoder.transform(X_test[obj_cols])

    scaler = preprocessing.StandardScaler()
    X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
    X_valid[numerical_cols] = scaler.transform(X_valid[numerical_cols])
    X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

    # NOTE(review): gpu_hist / gpu_id / predictor look like the pre-2.0 XGBoost
    # GPU flags - confirm they are still accepted by the installed version.
    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor')
    model.fit(X_train, y_train)
    predictions_valid = model.predict(X_valid)
    test_predictions = model.predict(X_test)
    final_predictions.append(test_predictions)

    # RMSE as sqrt(MSE): the `squared=False` shortcut of mean_squared_error was
    # deprecated and later removed from scikit-learn, this form works everywhere.
    rmse = np.sqrt(mean_squared_error(y_valid, predictions_valid))
    scores.append(rmse)
    print("RMSE :", fold, rmse)

print("Final Scores")
final_scores_mean.append(np.mean(scores))
final_scores_std.append(np.std(scores))
print(np.mean(scores), np.std(scores))

Using Normalizer instead of StandardScaler

In [None]:
#Using Normalizer instead of StandardScaler
# Ordinal-encode the categorical columns, pass the continuous columns through
# preprocessing.Normalizer, then train one XGBoost regressor per fold.
data_train = pd.read_csv("../input/30days-folds/train_folds.csv")
data_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

# Everything except the row id, the label and the fold marker is a model input.
useful_features = [c for c in data_train.columns if c not in ('id', 'target', 'kfold')]
obj_cols = [col for col in useful_features if 'cat' in col]
numerical_cols = [col for col in useful_features if 'cont' in col]

final_predictions = []
scores = []
for fold in range(5):
    train_rows = data_train[data_train.kfold != fold].reset_index(drop=True)
    valid_rows = data_train[data_train.kfold == fold].reset_index(drop=True)

    y_train = train_rows.target
    y_valid = valid_rows.target

    # Explicit copies: transformed columns are written back into these frames,
    # and assigning into a plain column slice is chained assignment
    # (SettingWithCopyWarning). X_test starts from the raw test data each fold.
    X_train = train_rows[useful_features].copy()
    X_valid = valid_rows[useful_features].copy()
    X_test = data_test[useful_features].copy()

    ordinal_encoder = OrdinalEncoder()
    X_train[obj_cols] = ordinal_encoder.fit_transform(X_train[obj_cols])
    X_valid[obj_cols] = ordinal_encoder.transform(X_valid[obj_cols])
    X_test[obj_cols] = ordinal_encoder.transform(X_test[obj_cols])

    # NOTE(review): Normalizer rescales each ROW to unit norm rather than each
    # column, so it is not a drop-in replacement for StandardScaler - confirm
    # that row-wise scaling is what this experiment intends.
    normalizer = preprocessing.Normalizer()
    X_train[numerical_cols] = normalizer.fit_transform(X_train[numerical_cols])
    X_valid[numerical_cols] = normalizer.transform(X_valid[numerical_cols])
    X_test[numerical_cols] = normalizer.transform(X_test[numerical_cols])

    # NOTE(review): gpu_hist / gpu_id / predictor look like the pre-2.0 XGBoost
    # GPU flags - confirm they are still accepted by the installed version.
    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor')
    model.fit(X_train, y_train)
    predictions_valid = model.predict(X_valid)
    test_predictions = model.predict(X_test)
    final_predictions.append(test_predictions)

    # RMSE as sqrt(MSE): the `squared=False` shortcut of mean_squared_error was
    # deprecated and later removed from scikit-learn, this form works everywhere.
    rmse = np.sqrt(mean_squared_error(y_valid, predictions_valid))
    scores.append(rmse)
    print("RMSE :", fold, rmse)

print("Final Scores")
final_scores_mean.append(np.mean(scores))
final_scores_std.append(np.std(scores))
print(np.mean(scores), np.std(scores))

Log Transformation

In [None]:
#Log Transformation
# Apply log1p to the continuous columns up front, then ordinal-encode,
# standardise and train one XGBoost regressor per fold.
data_train = pd.read_csv("../input/30days-folds/train_folds.csv")
data_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

# Everything except the row id, the label and the fold marker is a model input.
useful_features = [c for c in data_train.columns if c not in ('id', 'target', 'kfold')]
obj_cols = [col for col in useful_features if 'cat' in col]
numerical_cols = [col for col in useful_features if 'cont' in col]
# .copy() because columns are overwritten just below; writing into a plain
# column slice is chained assignment (SettingWithCopyWarning).
data_test = data_test[useful_features].copy()

# log1p is element-wise and has no fitted state, so applying it to the whole
# train and test frames before the fold split leaks nothing between folds.
# Assumes every continuous value is greater than -1 - TODO confirm.
data_train[numerical_cols] = np.log1p(data_train[numerical_cols])
data_test[numerical_cols] = np.log1p(data_test[numerical_cols])

final_predictions = []
scores = []
for fold in range(5):
    train_rows = data_train[data_train.kfold != fold].reset_index(drop=True)
    valid_rows = data_train[data_train.kfold == fold].reset_index(drop=True)
    X_test = data_test.copy()

    y_train = train_rows.target
    y_valid = valid_rows.target

    # Explicit copies for the same reason as data_test above.
    X_train = train_rows[useful_features].copy()
    X_valid = valid_rows[useful_features].copy()

    # Both transformers are fitted on the training folds only.
    ordinal_encoder = OrdinalEncoder()
    X_train[obj_cols] = ordinal_encoder.fit_transform(X_train[obj_cols])
    X_valid[obj_cols] = ordinal_encoder.transform(X_valid[obj_cols])
    X_test[obj_cols] = ordinal_encoder.transform(X_test[obj_cols])

    scaler = preprocessing.StandardScaler()
    X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
    X_valid[numerical_cols] = scaler.transform(X_valid[numerical_cols])
    X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

    # NOTE(review): gpu_hist / gpu_id / predictor look like the pre-2.0 XGBoost
    # GPU flags - confirm they are still accepted by the installed version.
    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor')
    model.fit(X_train, y_train)
    predictions_valid = model.predict(X_valid)
    test_predictions = model.predict(X_test)
    final_predictions.append(test_predictions)

    # RMSE as sqrt(MSE): the `squared=False` shortcut of mean_squared_error was
    # deprecated and later removed from scikit-learn, this form works everywhere.
    rmse = np.sqrt(mean_squared_error(y_valid, predictions_valid))
    scores.append(rmse)
    print("RMSE :", fold, rmse)

print("Final Scores")
final_scores_mean.append(np.mean(scores))
final_scores_std.append(np.std(scores))
print(np.mean(scores), np.std(scores))

Polynomial features

In [None]:
#Polynomial features
# Append degree-2 interaction terms of the continuous columns, ordinal-encode
# the categorical columns, then train one XGBoost regressor per fold.
data_train = pd.read_csv("../input/30days-folds/train_folds.csv")
data_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

useful_features = [c for c in data_train.columns if c not in ('id', 'target', 'kfold')]
obj_cols = [col for col in useful_features if 'cat' in col]
numerical_cols = [col for col in useful_features if 'cont' in col]
data_test = data_test[useful_features]

# Fit the transformer once on the training data and only *transform* the test
# data, so both frames are guaranteed to share one feature layout.
# NOTE(review): with include_bias=False the output should still contain the
# degree-1 terms, i.e. the first len(numerical_cols) poly_* columns repeat the
# raw continuous features - confirm whether that duplication is wanted.
poly = preprocessing.PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
train_poly = poly.fit_transform(data_train[numerical_cols])
test_poly = poly.transform(data_test[numerical_cols])

train_poly_columns = [f"poly_{i}" for i in range(train_poly.shape[1])]
test_poly_columns = [f"poly_{i}" for i in range(test_poly.shape[1])]

df_poly_train = pd.DataFrame(train_poly, columns=train_poly_columns)
df_poly_test = pd.DataFrame(test_poly, columns=test_poly_columns)

# Column-wise concat aligns on the index; both sides carry a default
# RangeIndex here (fresh read_csv / fresh DataFrame from an array).
data_train = pd.concat([data_train, df_poly_train], axis=1)
data_test = pd.concat([data_test, df_poly_test], axis=1)

# Recompute the feature list so that it now includes the poly_* columns.
useful_features = [c for c in data_train.columns if c not in ('id', 'target', 'kfold')]
obj_cols = [col for col in useful_features if 'cat' in col]
data_test = data_test[useful_features]

final_predictions = []
scores = []
for fold in range(5):
    train_rows = data_train[data_train.kfold != fold].reset_index(drop=True)
    valid_rows = data_train[data_train.kfold == fold].reset_index(drop=True)
    X_test = data_test.copy()

    y_train = train_rows.target
    y_valid = valid_rows.target

    # Explicit copies: the encoded columns are written back into these frames,
    # and assigning into a plain column slice is chained assignment
    # (SettingWithCopyWarning).
    X_train = train_rows[useful_features].copy()
    X_valid = valid_rows[useful_features].copy()

    # The encoder is fitted on the training folds only.
    ordinal_encoder = OrdinalEncoder()
    X_train[obj_cols] = ordinal_encoder.fit_transform(X_train[obj_cols])
    X_valid[obj_cols] = ordinal_encoder.transform(X_valid[obj_cols])
    X_test[obj_cols] = ordinal_encoder.transform(X_test[obj_cols])

    # NOTE(review): gpu_hist / gpu_id / predictor look like the pre-2.0 XGBoost
    # GPU flags - confirm they are still accepted by the installed version.
    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor')
    model.fit(X_train, y_train)
    predictions_valid = model.predict(X_valid)
    test_predictions = model.predict(X_test)
    final_predictions.append(test_predictions)

    # RMSE as sqrt(MSE): the `squared=False` shortcut of mean_squared_error was
    # deprecated and later removed from scikit-learn, this form works everywhere.
    rmse = np.sqrt(mean_squared_error(y_valid, predictions_valid))
    scores.append(rmse)
    print("RMSE: ", fold, rmse)
print("Final Scores")
final_scores_mean.append(np.mean(scores))
final_scores_std.append(np.std(scores))
print(np.mean(scores), np.std(scores))

Using XGBoost + OneHotEncoding and Pipeline

In [None]:
#Using XGBoost + OneHotEncoding and Pipeline
# Wrap imputation, one-hot encoding and the XGBoost regressor in a single
# scikit-learn Pipeline and fit a fresh pipeline on every fold.
data_train = pd.read_csv("../input/30days-folds/train_folds.csv")
data_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

# Everything except the row id, the label and the fold marker is a model input.
useful_features = [c for c in data_train.columns if c not in ('id', 'target', 'kfold')]
obj_cols = [col for col in useful_features if 'cat' in col]
numerical_cols = [col for col in useful_features if 'cont' in col]
data_test = data_test[useful_features]

final_predictions = []
scores = []
for fold in range(5):
    train_rows = data_train[data_train.kfold != fold].reset_index(drop=True)
    valid_rows = data_train[data_train.kfold == fold].reset_index(drop=True)
    X_test = data_test.copy()

    y_train = train_rows.target
    y_valid = valid_rows.target

    X_train = train_rows[useful_features]
    X_valid = valid_rows[useful_features]

    # Continuous columns: constant-value imputation only (no scaling).
    numerical_transformer = SimpleImputer(strategy='constant')

    # Categorical columns: fill gaps with the most frequent level, then one-hot
    # encode; handle_unknown='ignore' keeps unseen levels from raising at
    # predict time.
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', preprocessing.OneHotEncoder(handle_unknown='ignore')),
    ])

    preprocessor = ColumnTransformer(transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, obj_cols),
    ])

    # NOTE(review): gpu_hist / gpu_id / predictor look like the pre-2.0 XGBoost
    # GPU flags - confirm they are still accepted by the installed version.
    model_xgboost = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor')
    pipeline_xgb = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model_xgboost),
    ])

    # Fitting the pipeline fits the preprocessing on the training folds only.
    pipeline_xgb.fit(X_train, y_train)
    predictions_valid = pipeline_xgb.predict(X_valid)
    test_predictions = pipeline_xgb.predict(X_test)
    final_predictions.append(test_predictions)

    # RMSE as sqrt(MSE): the `squared=False` shortcut of mean_squared_error was
    # deprecated and later removed from scikit-learn, this form works everywhere.
    rmse = np.sqrt(mean_squared_error(y_valid, predictions_valid))
    scores.append(rmse)
    print("RMSE: ", fold, rmse)
print("Final Scores")
final_scores_mean.append(np.mean(scores))
final_scores_std.append(np.std(scores))
print(np.mean(scores), np.std(scores))

In [None]:
# One row per experiment cell that has appended to the accumulators, in the
# order the cells were run.
score_columns = {
    'Mean Scores': final_scores_mean,
    'Standard Deviation': final_scores_std,
}
df_scores = pd.DataFrame(score_columns)
print(df_scores)

In [None]:
# The scores are RMSE values, so the smallest mean is the best experiment.
print("Best Score Possible after applying feature engineering methods:")
best_mean_rmse = min(final_scores_mean)
print(best_mean_rmse)