In [None]:
import numpy as np
import pandas as pd
import seaborn as sns 
pd.set_option('display.max_columns',500)
pd.set_option('display.max_rows',500)
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

In [None]:
# Read the competition files in one pass: training table, test table and the
# sample submission template, in that order.
df_train, df_test, submission = map(
    pd.read_csv,
    (
        "../input/30-days-of-ml/train.csv",
        "../input/30-days-of-ml/test.csv",
        "../input/30-days-of-ml/sample_submission.csv",
    ),
)

In [None]:
# Dimensions of the training frame as (rows, columns).
df_train.shape

In [None]:
# Dimensions of the test frame as (rows, columns).
df_test.shape

There are 300000 rows in training data and 200000 rows in test data.

**Checking for null values**

In [None]:
# Number of missing entries in each training column.
df_train.isna().sum()

In [None]:
# Number of missing entries in each test column.
df_test.isna().sum()

We can clearly see that there are no missing values in either dataset (train or test).

In [None]:
# Summary statistics for the training frame.
df_train.describe()

In [None]:
# Categorical columns are identified by the substring 'cat' in their name.
cat_col = list(filter(lambda name: 'cat' in name, df_train.columns))
print(cat_col)
print(len(cat_col))


In [None]:
# Everything whose name does not contain 'cat' is treated as numerical.
num_col = list(filter(lambda name: 'cat' not in name, df_train.columns))
print(num_col)
print(len(num_col))

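There are 26 columns in total, of which 10 are categorical and 16 are numerical. Note that the numerical count covers every column without 'cat' in its name, so it also includes non-feature columns such as `id` and `target`.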

In [None]:
# Histogram of the target variable.
df_train["target"].hist()

In [None]:
# Kernel density estimate of the target variable.
sns.kdeplot(df_train["target"])

In [None]:
# Figure-level distribution plot of the target variable.
sns.displot(df_train["target"])

In [None]:
# Frequency of each distinct target value.
df_train["target"].value_counts()

As we can see, most of the target values lie between 6 and 10.

In [None]:
# Pairwise correlation of the numeric columns only.
# The frame also holds string-valued categorical columns; since pandas 2.0
# `DataFrame.corr()` no longer drops those silently and raises instead, so the
# numeric columns are selected explicitly (this works on older pandas as well).
corr = df_train.select_dtypes(include="number").corr()

In [None]:
# Annotated heatmap of the correlation matrix.
plt.figure(figsize=(14, 12))
# `cmap` selects the colour palette; `cbar` is only the on/off switch for the
# colour bar, so passing the palette name there never applied 'viridis'.
sns.heatmap(corr, annot=True, cmap='viridis')

**Conclusion**

None of the numerical columns are highly correlated with each other.

I am using the data from Abhishek Thakur's datasets with K-Folds.

In [None]:
# Training data with a precomputed `kfold` column (used for cross-validation below).
df=pd.read_csv("../input/30days-folds/train_folds.csv")

In [None]:
# Preview the first rows of the fold-annotated training frame.
df.head()

The last column, `kfold`, holds the fold assigned to each row.


In [None]:
# Number of rows assigned to each fold.
df["kfold"].value_counts()

So there are 5 folds in total, with 60000 rows in each fold.

Segregate the categorical and numerical columns.

In [None]:
# Feature columns: everything except the row id, the label and the fold marker.
imp_col = list(filter(lambda name: name not in ('id', 'target', 'kfold'), df.columns))
print(imp_col)

# Split the features by name: 'cat' in the name means categorical, the rest are numerical.
categorical_col = [feature for feature in imp_col if 'cat' in feature]
numerical_col = [feature for feature in imp_col if feature not in categorical_col]
print(categorical_col)
print(numerical_col)

# Restrict the test frame to the same feature columns, in the same order.
df_test = df_test[imp_col]

In [None]:
# Ordinal encoder + standardisation, evaluated over the precomputed folds.
# For each fold: hold that fold out for validation, fit the preprocessing on the
# remaining rows only, train an XGBoost regressor, then score the held-out fold
# and predict the test set.
final_pred=[]   # one array of test-set predictions per fold
scores=[]       # validation RMSE per fold
for folds in range(5):
    # drop=True: the old row labels are not needed, so do not materialise them
    # as an extra 'index' column.
    train_rows=df[df.kfold!=folds].reset_index(drop=True)
    valid_rows=df[df.kfold==folds].reset_index(drop=True)
    y_train=train_rows.target
    y_valid=valid_rows.target

    # Explicit copies: the column assignments below must modify independent
    # frames, never a slice of `df` or the shared `df_test`.
    X_train=train_rows[imp_col].copy()
    X_valid=valid_rows[imp_col].copy()
    X_test=df_test.copy()

    # Encode categories as numbers; the mapping is learned from the training
    # rows of this fold and reused for validation and test.
    ordinal_encoder = OrdinalEncoder()
    X_train[categorical_col] = ordinal_encoder.fit_transform(X_train[categorical_col])
    X_valid[categorical_col] = ordinal_encoder.transform(X_valid[categorical_col])
    X_test[categorical_col] = ordinal_encoder.transform(X_test[categorical_col])

    # Standardise numerical features with statistics from the training rows only.
    scaler= StandardScaler()
    X_train[numerical_col] = scaler.fit_transform(X_train[numerical_col])
    X_valid[numerical_col] = scaler.transform(X_valid[numerical_col])
    X_test[numerical_col] = scaler.transform(X_test[numerical_col])

    # GPU histogram tree method: this cell needs a CUDA-enabled runtime.
    # NOTE(review): XGBoost 2.0+ deprecates tree_method='gpu_hist', gpu_id and
    # predictor in favour of device='cuda' -- confirm the installed version
    # before changing these arguments.
    model1=XGBRegressor(n_estimators=1000, learning_rate=0.05, random_state=folds, tree_method='gpu_hist', gpu_id=0, predictor="gpu_predictor")
    model1.fit(X_train,y_train)
    valid_pred=model1.predict(X_valid)
    test_pred= model1.predict(X_test)
    final_pred.append(test_pred)
    # RMSE as sqrt(MSE): the `squared=False` argument is deprecated since
    # scikit-learn 1.4 and removed in 1.6, while this form works on every version.
    rmse = np.sqrt(mean_squared_error(y_valid, valid_pred))
    print(folds, rmse)
    scores.append(rmse)
# Mean and spread of the per-fold validation RMSE.
print(np.mean(scores), np.std(scores))

***Conclusion***

Here we obtained a mean validation RMSE of 0.722109 across the folds, with a standard deviation of 0.001005.

Beginners can use this notebook as a reference. If you liked my work, please upvote!