In [None]:
# Familiar imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from math import ceil # round numbers up
%matplotlib inline
# For ordinal encoding categorical variables, splitting data
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split

# For training the XGBoost (gradient-boosted trees) model and scoring it
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

import warnings
warnings.simplefilter(action='ignore', category=Warning)

### Step 2: Load the data
Next, we'll load the training and test data.

In [None]:
# Load the training and test data; the first CSV column becomes the index
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("../input/30-days-of-ml/train.csv",
                     "../input/30-days-of-ml/test.csv")
)

# Preview the data
train.head()

### EDA
Let's look at the distribution of the target variable.

In [None]:
# Histogram of the target using 0.1-wide bins over [0, 12)
bins = np.arange(0, 12, 0.1)
sns.displot(train['target'], bins=bins, height=5, aspect=2)

In [None]:
plt.figure(figsize=(20, 15))
# Correlate the numeric columns only and compute the matrix once. The frame
# still holds the non-numeric `cat*` columns at this point, and pandas >= 2.0
# raises on them instead of silently skipping them.
corr = train.select_dtypes(include='number').corr()
# Mask the diagonal and the upper-right triangle, which mirror the lower-left one
mask = np.triu(np.ones(corr.shape))
sns.heatmap(corr, annot=True, center=0, cmap='RdBu', mask=mask)

We can see that the target is only weakly correlated with each of the numerical features.

In [None]:
# Continuous features are the columns whose name contains 'cont'
num_cols = list(filter(lambda name: 'cont' in name, train.columns))
num_cols

In [None]:
def plot(data, cols, features_type, nrows, ncols, bins='auto', target=None, figsize=None,
         hspace=None, wspace=None, color=None):
    '''Draw one subplot per column in `cols` on an `nrows` x `ncols` grid.

    Parameters
    ----------
    data : frame-like indexable by column name
    cols : sequence of column names; each one gets its own subplot
    features_type : 'numerical' or 'categorical'; selects the kind of plot.
        Any other value leaves the subplots empty.
    nrows, ncols : grid shape passed to `plt.subplot`
    bins : histogram bins, used only for numerical columns without a target
    target : name of the target column. When given, each feature is plotted
        against it (scatter for numerical, violin for categorical); when
        omitted, the feature's own distribution is plotted (histogram for
        numerical, `countplot_ratio` for categorical).
    figsize : if given, a new figure of this size is created first
    hspace, wspace : subplot spacing forwarded to `plt.subplots_adjust`
    color : colour forwarded to the categorical plots
    '''
    if figsize is not None:
        plt.figure(figsize=figsize)
    # Spacing is a figure-level setting, so apply it once rather than per subplot
    if hspace is not None or wspace is not None:
        plt.subplots_adjust(hspace=hspace, wspace=wspace)

    # Subplot positions are 1-based. Enumerating from 1 visits every column;
    # pairing with range(1, len(cols)) would stop one short and skip the last.
    for plot_num, col in enumerate(cols, start=1):
        plt.subplot(nrows, ncols, plot_num)

        if features_type == 'numerical':
            if target is not None:
                plt.scatter(data[col], data[target])
                plt.title(col)
            else:
                sns.histplot(data[col], bins=bins)

        elif features_type == 'categorical':
            if target is not None:
                sns.violinplot(data=data, y=col, x=target, color=color, inner='quartile')
            else:
                countplot_ratio(x=col, data=data, color=color)

Distributions of numerical features

In [None]:
# Histogram grid for the continuous features, four panels per row
n_cols = 4
n_rows = ceil(len(num_cols) / n_cols)
bins = np.arange(0, 1.3, 0.02)
plot(
    data=train,
    cols=num_cols,
    features_type='numerical',
    nrows=n_rows,
    ncols=n_cols,
    bins=bins,
    figsize=(15, 15),
    hspace=0.3,
    wspace=0.5,
)

Let's plot each continuous feature against the target.

In [None]:
# Scatter grid of every continuous feature against the target, four panels per row
n_cols = 4
n_rows = ceil(len(num_cols) / n_cols)
plot(
    data=train,
    cols=num_cols,
    features_type='numerical',
    target='target',
    nrows=n_rows,
    ncols=n_cols,
    figsize=(15, 15),
    hspace=0.3,
)

In [None]:
# Categorical features are the columns whose name contains 'cat'
object_cols = list(filter(lambda name: 'cat' in name, train.columns))
object_cols

Distributions of categorical features

In [None]:
def countplot_ratio(x=None, data=None, hue=None, ax=None, color=None):
    """Draw a count plot of column `x` and label each bar with its share of rows.

    `sns.countplot` only shows the number of observations per category; this
    wrapper additionally writes each bar's percentage of `len(data)` above it.

    Parameters
    ----------
    x : str
        Name of the column to count; also used to build the plot title.
    data : frame-like
        Forwarded to `sns.countplot`; `len(data)` is the percentage denominator.
    hue, ax, color : optional
        Forwarded unchanged to `sns.countplot`.

    Returns nothing; the plot is drawn on the axes `sns.countplot` returns.
    """
    # Pass `x` by keyword: a positional first argument is bound to `data` by
    # newer seaborn releases and then collides with the `data=` keyword.
    ax = sns.countplot(x=x, data=data, hue=hue, ax=ax, color=color)
    ax.set_title(x + " Distribution")

    # Denominator for turning bar heights (counts) into percentages
    total = float(len(data))
    for patch in ax.patches:
        # The bar height is the number of observations in that category
        height = patch.get_height()
        # Centre the label horizontally on the bar, slightly above its top
        ax.text(patch.get_x() + patch.get_width() / 2, height + 4,
                '{:.2f}%'.format((height / total) * 100),
                weight='bold', fontsize=12, ha='center')

In [None]:
# Count plots (with percentage labels) for the categorical features, two per row
n_cols = 2
n_rows = ceil(len(object_cols) / n_cols)
# Second colour of the default palette, so every bar shares one colour
base_color = sns.color_palette(n_colors=2)[1]
plot(
    data=train,
    cols=object_cols,
    features_type='categorical',
    nrows=n_rows,
    ncols=n_cols,
    figsize=(15, 20),
    hspace=0.5,
    color=base_color,
)

In [None]:
# Violin plots of the target for each categorical feature, three per row
n_cols = 3
n_rows = ceil(len(object_cols) / n_cols)
plot(
    data=train,
    cols=object_cols,
    features_type='categorical',
    target='target',
    nrows=n_rows,
    ncols=n_cols,
    figsize=(15, 20),
    hspace=0.5,
    color=base_color,
)

The next code cell separates the target (which we assign to y) from the training features (which we assign to features).

In [None]:
# Separate target from features
y = train['target']
features = train.drop(['target'], axis=1)

# Preview features
features.head()

### Step 3: Prepare the data
Next, we'll need to handle the categorical columns (cat0, cat1, ... cat9).

In [None]:
# ordinal-encode categorical columns
X = features.copy()
X_test = test.copy()
ordinal_encoder = OrdinalEncoder()
X[object_cols] = ordinal_encoder.fit_transform(features[object_cols])
X_test[object_cols] = ordinal_encoder.transform(test[object_cols])

# Preview the ordinal-encoded features
X.head()

Next, we break off a validation set from the training data.

In [None]:
# Hold out part of the training rows for validation; the fixed seed keeps the split reproducible
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    random_state=0,
)

### Step 4: Train a model
Now that the data is prepared, the next step is to train a model.

In [None]:
# Define the model
model = XGBRegressor(n_estimators=1000, learning_rate=0.03, random_state=1, n_jobs=2)

# Train the model, using the validation split as the early-stopping evaluation set.
# NOTE(review): newer xgboost releases expect `early_stopping_rounds` on the
# constructor rather than in fit() — confirm against the installed version.
model.fit(X_train, y_train, early_stopping_rounds=20, eval_set=[(X_valid, y_valid)], verbose=False)
preds_valid = model.predict(X_valid)

# RMSE is the square root of the MSE. Taking the root explicitly gives the same
# value as mean_squared_error(..., squared=False) and also works on scikit-learn
# versions that no longer accept the `squared` argument.
print(np.sqrt(mean_squared_error(y_valid, preds_valid)))

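The value printed by the code cell above is the root mean squared error (RMSE) on the validation data: the square root of the mean squared error, which is what passing `squared=False` to `mean_squared_error` computes.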

### Step 5: Submit to the competition
We'll begin by using the trained model to generate predictions, which we'll save to a CSV file.

In [None]:
# Predict the target for the encoded test features
predictions = model.predict(X_test)

# Write the submission file: one row per test index, without the frame's own index
output = pd.DataFrame({'Id': X_test.index, 'target': predictions})
output.to_csv('submission.csv', index=False)