# 1. Loading libraries

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# 2. Loading datasets

In [3]:
train_df = pd.read_csv('train.csv')
train_df

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,8405,107.39,ak,s,as,c,d,aa,d,q,...,1,0,0,0,0,0,0,0,0,0
4205,8406,108.77,j,o,t,d,d,aa,h,h,...,0,1,0,0,0,0,0,0,0,0
4206,8412,109.22,ak,v,r,a,d,aa,g,e,...,0,0,1,0,0,0,0,0,0,0
4207,8415,87.48,al,r,e,f,d,aa,l,u,...,0,0,0,0,0,0,0,0,0,0


In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4209 entries, 0 to 4208
Columns: 378 entries, ID to X385
dtypes: float64(1), int64(369), object(8)
memory usage: 12.1+ MB


In [5]:
test_df = pd.read_csv('test.csv')
test_df

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,1,az,v,n,f,d,t,a,w,0,...,0,0,0,1,0,0,0,0,0,0
1,2,t,b,ai,a,d,b,g,y,0,...,0,0,1,0,0,0,0,0,0,0
2,3,az,v,as,f,d,a,j,j,0,...,0,0,0,1,0,0,0,0,0,0
3,4,az,l,n,f,d,z,l,n,0,...,0,0,0,1,0,0,0,0,0,0
4,5,w,s,as,c,d,y,i,m,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,8410,aj,h,as,f,d,aa,j,e,0,...,0,0,0,0,0,0,0,0,0,0
4205,8411,t,aa,ai,d,d,aa,j,y,0,...,0,1,0,0,0,0,0,0,0,0
4206,8413,y,v,as,f,d,aa,d,w,0,...,0,0,0,0,0,0,0,0,0,0
4207,8414,ak,v,as,a,d,aa,c,q,0,...,0,0,1,0,0,0,0,0,0,0


In [6]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4209 entries, 0 to 4208
Columns: 377 entries, ID to X385
dtypes: int64(369), object(8)
memory usage: 12.1+ MB


# 3. Checking for null values

In [7]:
train_df.isnull().sum()

ID      0
y       0
X0      0
X1      0
X2      0
       ..
X380    0
X382    0
X383    0
X384    0
X385    0
Length: 378, dtype: int64

In [8]:
test_df.isnull().sum()

ID      0
X0      0
X1      0
X2      0
X3      0
       ..
X380    0
X382    0
X383    0
X384    0
X385    0
Length: 377, dtype: int64

- We can see that there are no null values.
- We can drop the 'ID' column as it is not a feature relevant for regression.


In [9]:
train_df.drop('ID', inplace = True, axis = 1)
test_df.drop('ID', inplace = True, axis = 1)

In [10]:
#Finding columns with zero variance
zero_var=train_df.var()[train_df.var()==0].index.values
zero_var

array(['X11', 'X93', 'X107', 'X233', 'X235', 'X268', 'X289', 'X290',
       'X293', 'X297', 'X330', 'X347'], dtype=object)

In [11]:
#Drop zero variance columns as they are not relevant for PCA
train_df.drop(zero_var, inplace=True, axis=1)
test_df.drop(zero_var, inplace=True, axis=1)

We can check for the number of remaining feature columns with respective data types.

In [123]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4209 entries, 0 to 4208
Columns: 365 entries, y to X385
dtypes: float64(1), int64(356), object(8)
memory usage: 11.7+ MB


In [124]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4209 entries, 0 to 4208
Columns: 364 entries, X0 to X385
dtypes: int64(356), object(8)
memory usage: 11.7+ MB


# 4. Checking for unique values

In [125]:
for i in train_df.columns:
    print(f"Column { i}",train_df[i].unique())

Column y [130.81  88.53  76.26 ...  85.71 108.77  87.48]
Column X0 ['k' 'az' 't' 'al' 'o' 'w' 'j' 'h' 's' 'n' 'ay' 'f' 'x' 'y' 'aj' 'ak' 'am'
 'z' 'q' 'at' 'ap' 'v' 'af' 'a' 'e' 'ai' 'd' 'aq' 'c' 'aa' 'ba' 'as' 'i'
 'r' 'b' 'ax' 'bc' 'u' 'ad' 'au' 'm' 'l' 'aw' 'ao' 'ac' 'g' 'ab']
Column X1 ['v' 't' 'w' 'b' 'r' 'l' 's' 'aa' 'c' 'a' 'e' 'h' 'z' 'j' 'o' 'u' 'p' 'n'
 'i' 'y' 'd' 'f' 'm' 'k' 'g' 'q' 'ab']
Column X2 ['at' 'av' 'n' 'e' 'as' 'aq' 'r' 'ai' 'ak' 'm' 'a' 'k' 'ae' 's' 'f' 'd'
 'ag' 'ay' 'ac' 'ap' 'g' 'i' 'aw' 'y' 'b' 'ao' 'al' 'h' 'x' 'au' 't' 'an'
 'z' 'ah' 'p' 'am' 'j' 'q' 'af' 'l' 'aa' 'c' 'o' 'ar']
Column X3 ['a' 'e' 'c' 'f' 'd' 'b' 'g']
Column X4 ['d' 'b' 'c' 'a']
Column X5 ['u' 'y' 'x' 'h' 'g' 'f' 'j' 'i' 'd' 'c' 'af' 'ag' 'ab' 'ac' 'ad' 'ae'
 'ah' 'l' 'k' 'n' 'm' 'p' 'q' 's' 'r' 'v' 'w' 'o' 'aa']
Column X6 ['j' 'l' 'd' 'h' 'i' 'a' 'g' 'c' 'k' 'e' 'f' 'b']
Column X8 ['o' 'x' 'e' 'n' 's' 'a' 'h' 'p' 'm' 'k' 'd' 'i' 'v' 'j' 'b' 'q' 'w' 'g'
 'y' 'l' 'f' 'u' 'r' 't' 'c']
Column 

From the output above we see that except for first 9 columns, all contain 0 and 1.

# 5. Import Encoder
There are certain advantages of using `OrdinalEncoder()`:  
- There are categorical variables in test data not present in training data, `OrdinalEncoder()` provides way to handle them via `handle_unknown` argument.  
- One can encode multidimensional dataframe object thus avoiding looping over all columns successively.

In [126]:
from sklearn.preprocessing import OrdinalEncoder
enc=OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# 6. Create index of object type columns

In [127]:
encode_col = train_df.select_dtypes(include=['object', 'category']).columns
encode_col

Index(['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8'], dtype='object')

# 7. Fit and Transform categorical features

In [128]:
# Only need to fit object type features
enc.fit(train_df[encode_col])

train_df[encode_col]=enc.transform(train_df[encode_col])
# transform test data from the mappings learned during fitting training data.
test_df[encode_col]=enc.transform(test_df[encode_col])


In [129]:
#convert back to DataFrame as OrdinalEncoder outputs array object.
train_df=pd.DataFrame(train_df)
train_df

Unnamed: 0,y,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,130.81,32.0,23.0,17.0,0.0,3.0,24.0,9.0,14.0,0,...,0,0,1,0,0,0,0,0,0,0
1,88.53,32.0,21.0,19.0,4.0,3.0,28.0,11.0,14.0,0,...,1,0,0,0,0,0,0,0,0,0
2,76.26,20.0,24.0,34.0,2.0,3.0,27.0,9.0,23.0,0,...,0,0,0,0,0,0,1,0,0,0
3,80.62,20.0,21.0,34.0,5.0,3.0,27.0,11.0,4.0,0,...,0,0,0,0,0,0,0,0,0,0
4,78.02,20.0,23.0,34.0,5.0,3.0,12.0,3.0,13.0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,107.39,8.0,20.0,16.0,2.0,3.0,0.0,3.0,16.0,0,...,1,0,0,0,0,0,0,0,0,0
4205,108.77,31.0,16.0,40.0,3.0,3.0,0.0,7.0,7.0,0,...,0,1,0,0,0,0,0,0,0,0
4206,109.22,8.0,23.0,38.0,0.0,3.0,0.0,6.0,4.0,0,...,0,0,1,0,0,0,0,0,0,0
4207,87.48,9.0,19.0,25.0,5.0,3.0,0.0,11.0,20.0,0,...,0,0,0,0,0,0,0,0,0,0


In [130]:
test_df=pd.DataFrame(test_df)
test_df

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8,X10,X12,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,20.0,23.0,34.0,5.0,3.0,-1.0,0.0,22.0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,40.0,3.0,7.0,0.0,3.0,-1.0,6.0,24.0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,20.0,23.0,16.0,5.0,3.0,-1.0,9.0,9.0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,20.0,13.0,34.0,5.0,3.0,-1.0,11.0,13.0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,43.0,20.0,16.0,2.0,3.0,28.0,8.0,12.0,0,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,7.0,9.0,16.0,5.0,3.0,0.0,9.0,4.0,0,0,...,0,0,0,0,0,0,0,0,0,0
4205,40.0,1.0,7.0,3.0,3.0,0.0,9.0,24.0,0,0,...,0,1,0,0,0,0,0,0,0,0
4206,45.0,23.0,16.0,5.0,3.0,0.0,3.0,22.0,0,0,...,0,0,0,0,0,0,0,0,0,0
4207,8.0,23.0,16.0,0.0,3.0,0.0,2.0,16.0,0,0,...,0,0,1,0,0,0,0,0,0,0


# 8. Import PCA

In [140]:
from sklearn.decomposition import PCA
pca=PCA(n_components = 0.98,svd_solver='full')

In [141]:
#create feature and target variables
X= train_df.drop('y',axis=1)
y=train_df['y']

In [142]:
#split into training and validation set
X_train , X_val , y_train , y_val = train_test_split(X,y,test_size=0.2,random_state=42)

In [143]:
#fit PCA
pca.fit(X)

PCA(n_components=0.98, svd_solver='full')

In [145]:
pca.n_components_

14

We obtain 14 principal components. We can now observe the proportion of the dataset's variance that each principal component accounts for.

In [146]:
pca.explained_variance_ratio_

array([0.38334782, 0.21388033, 0.13261866, 0.11826642, 0.09206008,
       0.01590604, 0.0074454 , 0.00433701, 0.00294021, 0.00241796,
       0.00236488, 0.00203229, 0.00167204, 0.00148111])

In [147]:
#Perform dimensionality reduction
pca_X_train = pd.DataFrame(pca.transform(X_train))
pca_X_val = pd.DataFrame(pca.transform(X_val))
pca_test = pd.DataFrame(pca.transform(test_df))

In [148]:
#Create XGBoost Regression model
model = xgb.XGBRegressor(objective='reg:linear',learning_rate=0.1)

In [149]:
#Fit the model
model.fit(pca_X_train,y_train)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.1, max_delta_step=0,
             max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=2,
             num_parallel_tree=1, objective='reg:linear', predictor='auto',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             subsample=1, tree_method='exact', validate_parameters=1,
             verbosity=None)

In [154]:
#Obtaining predicted target variable
pred_y_val = model.predict(pca_X_val)
Predicted_Data = pd.DataFrame()
Predicted_Data['y'] = pred_y_val
Predicted_Data.head()

Unnamed: 0,y
0,93.836418
1,96.475525
2,107.569084
3,78.740082
4,107.925537


In [151]:
#Final Loss value
mse_score = mean_squared_error(y_val,pred_y_val)

In [152]:
#print loss value
print(mse_score)

82.3270553095433
