In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns 
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
# For training random forest model
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error,make_scorer
from xgboost import XGBRegressor

In [None]:
# Load the three competition files in one pass:
#   df          - training rows that already carry a `kfold` assignment column
#   df_test     - rows to predict for the final submission
#   submission  - sample submission frame, later filled with predictions
df, df_test, submission = [
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/train-folds-5/train_folds.csv",
        "../input/30-days-of-ml/test.csv",
        "../input/30-days-of-ml/sample_submission.csv",
    )
]

In [None]:
# PART I: EXPLORATORY DATA ANALYSIS

In [None]:
# Preview the first five training rows.
df.head(n=5)

In [None]:
# Preview the first five test rows.
df_test.head(n=5)

In [None]:
# Show the names of all columns holding at least one missing value.
# (The original analysis notes that no such columns were found.)
has_missing = df.isna().any()
df.columns[has_missing]

In [None]:
# Histograms of every numeric column
# Keep the return value of `DataFrame.hist` so the cell does not echo it.
df_hist = df.hist(figsize=(10, 10), bins=10)

In [None]:
# Correlation heatmap of the numeric columns.
# `df` still contains object-dtype categorical columns at this point. Recent
# pandas versions raise on `.corr()` when non-numeric columns are present
# instead of silently dropping them, so the frame is restricted to numeric
# dtypes first. On older pandas this yields the same matrix as `df.corr()`.
fig, axes = plt.subplots(1, 1, figsize=(16, 14))
numeric_corr = df.select_dtypes(include="number").corr()
# Draw on the Axes created above rather than relying on pyplot's "current axes".
sns.heatmap(numeric_corr, annot=True, cmap="RdYlGn", ax=axes)
axes.set_title("Pairwise correlation of numeric columns")
plt.show()

In [None]:
# Pull the object-dtype (categorical) columns into an independent frame.
df_cat = df.select_dtypes(include=['object']).copy()

# Number of distinct values found in each categorical column.
df_cat.nunique()

In [None]:
# Frequency of every level within every categorical column:
# count the values column by column, then flip and stack the table so the
# result is indexed by (column, level).
level_counts = df_cat.apply(lambda column: column.value_counts())
level_counts.transpose().stack()

In [None]:
# Bookkeeping columns that must never be fed to the model.
excluded = {'id', 'kfold', 'target'}

# Every remaining column of the training frame is a model feature.
kept_features = [name for name in df.columns if name not in excluded]

# Features whose name contains 'cat' are the ones that get ordinal-encoded.
object_cols = [name for name in kept_features if 'cat' in name]

# Features whose name contains 'cont' are the ones that get standardised.
numerical_cols = [name for name in kept_features if 'cont' in name]

# Restrict the test frame to exactly the model features, in the same order.
df_test = df_test[kept_features]

In [None]:
# Keyword arguments forwarded to `XGBRegressor(**xgb_params)`.
#
# Every key is listed exactly once. In a dict literal a repeated key is
# silently overwritten by its last occurrence, which hides the value that is
# really in effect (previously `subsample` appeared as both 1.0 and 0.96, and
# `predictor` was listed twice). The effective values are unchanged.
xgb_params = {
    # --- booster and tree shape ---
    'booster': 'gbtree',
    'n_estimators': 1000,
    'learning_rate': 0.0752111846201471,
    'max_depth': 5,
    'min_child_weight': 6,
    'gamma': 0,

    # --- sampling ---
    'colsample_bytree': 0.1,
    # Subsample ratio of the training instances, re-drawn once per boosting
    # iteration. Values below 1.0 train each round on a random subset of rows,
    # which helps against overfitting.
    'subsample': 0.96,

    # --- regularisation / weighting ---
    'reg_alpha': 1e-09,
    'reg_lambda': 100.0,
    'scale_pos_weight': 1,

    # --- evaluation ---
    'eval_metric': 'rmse',
    # NOTE: early stopping monitors a validation set, so `fit` must be given
    # an `eval_set` for this setting to have any effect.
    'early_stopping_rounds': 5,

    # --- GPU execution ---
    # NOTE(review): these GPU settings are presumably tied to the xgboost
    # version this notebook was written for - confirm before upgrading.
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor',
    'gpu_id': 0,
}

In [None]:
# PART II: FITTING THE MODEL AND MAKING PREDICTIONS

In [None]:
# Cross-validated training: fit one XGBoost model per fold, keep its
# predictions on the test set (averaged later) and its validation RMSE.
final_predictions = []
scores = []

for fold in range(5):
    # Rows whose `kfold` equals `fold` are held out for validation;
    # all other rows are used for training.
    x_train = df[df.kfold != fold].reset_index(drop=True)
    x_valid = df[df.kfold == fold].reset_index(drop=True)
    # Fresh copy per fold: the transforms below overwrite columns in place and
    # must not leak one fold's encoding/scaling into the next.
    x_test = df_test.copy()

    # Separate the targets before narrowing the frames to the feature columns.
    y_train = x_train.target
    y_valid = x_valid.target

    # `.copy()` makes these independent frames, so the column assignments
    # below cannot raise SettingWithCopyWarning or write into a view.
    x_train = x_train[kept_features].copy()
    x_valid = x_valid[kept_features].copy()

    # Ordinal-encode the categorical columns: fit on the training fold only,
    # then apply the learned mapping to the validation and test frames.
    OE = preprocessing.OrdinalEncoder()
    x_train[object_cols] = OE.fit_transform(x_train[object_cols])
    x_valid[object_cols] = OE.transform(x_valid[object_cols])
    x_test[object_cols] = OE.transform(x_test[object_cols])

    # Standardise the numerical columns with statistics learned from the
    # training fold only.
    scaler = preprocessing.StandardScaler()
    x_train[numerical_cols] = scaler.fit_transform(x_train[numerical_cols])
    x_valid[numerical_cols] = scaler.transform(x_valid[numerical_cols])
    x_test[numerical_cols] = scaler.transform(x_test[numerical_cols])

    # XGBRegressor is used instead of a random forest because the latter
    # takes a long time to run.
    model = XGBRegressor(**xgb_params)
    # `xgb_params` requests early stopping, which needs a validation set to
    # monitor; without `eval_set` recent xgboost versions raise at fit time.
    # `verbose=False` suppresses the per-iteration metric log.
    # NOTE(review): the same fold is used for early stopping and for scoring,
    # so the reported RMSE may be slightly optimistic.
    model.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], verbose=False)

    preds_valid = model.predict(x_valid)
    test_preds = model.predict(x_test)
    final_predictions.append(test_preds)

    # RMSE as sqrt(MSE): the `squared=False` argument of `mean_squared_error`
    # is no longer available in recent scikit-learn releases.
    RMSE = np.sqrt(mean_squared_error(y_valid, preds_valid))
    print(fold, RMSE)
    scores.append(RMSE)

# Overall cross-validation summary, printed once after all folds are done.
print(np.mean(scores), np.std(scores))
    

In [None]:
# Blend the fold models: place each fold's test predictions in its own column,
# then average across columns so every test row gets the mean prediction.
fold_matrix = np.column_stack(final_predictions)
preds = fold_matrix.mean(axis=1)
preds

In [None]:
# Fill the sample-submission frame with the averaged predictions and save it.
# Item assignment always targets the `target` column; attribute-style
# assignment would only set a plain Python attribute if the column were absent,
# and the CSV would then be written without the predictions.
submission["target"] = preds
submission.to_csv("submission.csv", index=False)