#### IMPORTING DATA SET AND FINDING THE DATA DETAILS

In [1]:
import sklearn
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf

In [2]:
# importing the data
data_train = pd.read_csv("train.csv")
data_test = pd.read_csv("test.csv")

In [3]:
data_train.shape

(4209, 378)

In [4]:
data_test.shape

(4209, 377)

In [5]:
data_train.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [6]:
data_test.head()

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,1,az,v,n,f,d,t,a,w,0,...,0,0,0,1,0,0,0,0,0,0
1,2,t,b,ai,a,d,b,g,y,0,...,0,0,1,0,0,0,0,0,0,0
2,3,az,v,as,f,d,a,j,j,0,...,0,0,0,1,0,0,0,0,0,0
3,4,az,l,n,f,d,z,l,n,0,...,0,0,0,1,0,0,0,0,0,0
4,5,w,s,as,c,d,y,i,m,0,...,1,0,0,0,0,0,0,0,0,0


In [7]:
x_columns = [c for c in data_train.columns if 'X' in c]
print('Number of features: {}'.format(len(x_columns)))

Number of features: 376


In [8]:
print('Feature types:')
data_train[x_columns].dtypes.value_counts()

Feature types:


int64     368
object      8
dtype: int64

#### FINDING THE COUNT OF DIFFERENT FEATURES

In [9]:
counts = [[], [], []]
for columns in x_columns:
    typ = data_train[columns].dtype
    uniq = len(np.unique(data_train[columns]))
    if uniq == 1:
        counts[0].append(columns)
    elif uniq == 2 and typ == np.int64:
        counts[1].append(columns)
    else:
        counts[2].append(columns)

In [10]:
print('Constant features: {} Binary features: {} Categorical features: {}\n'
      .format(*[len(columns) for columns in counts]))
print('Constant features: ',counts[0])
print('Categorical features:', counts[2])

Constant features: 12 Binary features: 356 Categorical features: 8

Constant features:  ['X11', 'X93', 'X107', 'X233', 'X235', 'X268', 'X289', 'X290', 'X293', 'X297', 'X330', 'X347']
Categorical features: ['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8']


#### DROPPING THE COLUMNS NOT REQUIRED FOR ANALYSIS

In [48]:
x_tr = data_train.drop(['ID','y'], axis = 1)
x_tr.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8,X10,X11,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,k,v,at,a,d,u,j,o,0,0,...,0,0,1,0,0,0,0,0,0,0
1,k,t,av,e,d,y,l,o,0,0,...,1,0,0,0,0,0,0,0,0,0
2,az,w,n,c,d,x,j,x,0,0,...,0,0,0,0,0,0,1,0,0,0
3,az,t,n,f,d,x,l,e,0,0,...,0,0,0,0,0,0,0,0,0,0
4,az,v,n,f,d,h,d,n,0,0,...,0,0,0,0,0,0,0,0,0,0


In [49]:
y = data_train['y']
y.head()

0    130.81
1     88.53
2     76.26
3     80.62
4     78.02
Name: y, dtype: float64

### Q1. Removing Columns with zero Variance ie., with constant features columns

In [50]:
x_tr = x_tr.drop(['X11', 'X93', 'X107', 'X233', 'X235',
                         'X268', 'X289', 'X290', 'X293', 'X297', 
                         'X330', 'X347'], axis = 1)

In [51]:
x_tr.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8,X10,X12,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,k,v,at,a,d,u,j,o,0,0,...,0,0,1,0,0,0,0,0,0,0
1,k,t,av,e,d,y,l,o,0,0,...,1,0,0,0,0,0,0,0,0,0
2,az,w,n,c,d,x,j,x,0,0,...,0,0,0,0,0,0,1,0,0,0
3,az,t,n,f,d,x,l,e,0,0,...,0,0,0,0,0,0,0,0,0,0
4,az,v,n,f,d,h,d,n,0,0,...,0,0,0,0,0,0,0,0,0,0


In [54]:
x_te = data_test.drop(['ID', 'X11', 'X93', 'X107', 'X233', 'X235',
                         'X268', 'X289', 'X290', 'X293', 'X297', 
                         'X330', 'X347'], axis = 1)

In [55]:
x_te.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8,X10,X12,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,az,v,n,f,d,t,a,w,0,0,...,0,0,0,1,0,0,0,0,0,0
1,t,b,ai,a,d,b,g,y,0,0,...,0,0,1,0,0,0,0,0,0,0
2,az,v,as,f,d,a,j,j,0,0,...,0,0,0,1,0,0,0,0,0,0
3,az,l,n,f,d,z,l,n,0,0,...,0,0,0,1,0,0,0,0,0,0
4,w,s,as,c,d,y,i,m,0,0,...,1,0,0,0,0,0,0,0,0,0


### Q2. Checking for Null values and Unique values

In [15]:
def check_missing_values(data_train):
    if data_train.isnull().any().any():
        print("There are missing values in the data_train dataframe")
    else:
        print("There are no missing values in the data_train dataframe")
check_missing_values(data_train)     

There are no missing values in the data_train dataframe


In [16]:
categorical_columns = ['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8']
#Print frequency of categories
for col in categorical_columns:
    print ('\nFrequency of Categories for varible %s'%col)
    print (data_train[col].value_counts())


Frequency of Categories for varible X0
z     360
ak    349
y     324
ay    313
t     306
x     300
o     269
f     227
n     195
w     182
j     181
az    175
aj    151
s     106
ap    103
h      75
d      73
al     67
v      36
af     35
ai     34
m      34
e      32
ba     27
at     25
a      21
ax     19
i      18
aq     18
am     18
u      17
aw     16
l      16
ad     14
b      11
k      11
au     11
as     10
r      10
bc      6
ao      4
c       3
q       2
aa      2
ac      1
g       1
ab      1
Name: X0, dtype: int64

Frequency of Categories for varible X1
aa    833
s     598
b     592
l     590
v     408
r     251
i     203
a     143
c     121
o      82
w      52
z      46
u      37
e      33
m      32
t      31
h      29
f      23
y      23
j      22
n      19
k      17
p       9
g       6
d       3
q       3
ab      3
Name: X1, dtype: int64

Frequency of Categories for varible X2
as    1659
ae     496
ai     415
m      367
ak     265
r      153
n      137
s       94
f     

In [17]:
def check_missing_values(data_test):
    if data_test.isnull().any().any():
        print("There are missing values in the data_test dataframe")
    else:
        print("There are no missing values in the data_test dataframe")
check_missing_values(data_test)

There are no missing values in the data_test dataframe


In [18]:
categorical_columns = ['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8']
#Print frequency of categories
for col in categorical_columns:
    print ('\nFrequency of Categories for varible %s'%col)
    print (data_test[col].value_counts())


Frequency of Categories for varible X0
ak    432
y     348
z     335
x     302
ay    299
t     293
o     246
f     213
w     198
j     171
n     167
aj    162
az    161
s     116
ap    108
al     88
h      64
d      61
e      48
v      40
ai     38
m      34
af     34
am     28
i      25
at     21
u      20
ba     19
a      18
b      13
k      12
ad     12
aw     11
aq     11
r      10
ax      8
l       6
as      6
bc      6
c       6
au      5
ao      5
g       3
ag      1
ae      1
an      1
av      1
p       1
bb      1
Name: X0, dtype: int64

Frequency of Categories for varible X1
aa    826
s     602
l     599
b     596
v     436
r     252
i     189
a     153
c     142
o      81
w      50
u      40
z      31
e      29
h      27
m      27
j      22
y      21
t      18
n      16
k      12
f      12
p      10
g       9
ab      5
q       3
d       1
Name: X1, dtype: int64

Frequency of Categories for varible X2
as    1658
ae     478
ai     462
m      348
ak     260
r      155
n      1

### Q3. Applying Label Encoder for both Train and Test Dataset

In [19]:
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [56]:
le = LabelEncoder()
x_tr['X0'] = le.fit_transform(x_tr['X0'])
x_tr['X1'] = le.fit_transform(x_tr['X1'])
x_tr['X2'] = le.fit_transform(x_tr['X2'])
x_tr['X3'] = le.fit_transform(x_tr['X3'])
x_tr['X4'] = le.fit_transform(x_tr['X4'])
x_tr['X5'] = le.fit_transform(x_tr['X5'])
x_tr['X6'] = le.fit_transform(x_tr['X6'])
x_tr['X8'] = le.fit_transform(x_tr['X8'])

In [58]:
x_tr.dtypes.head(10)

X0     int32
X1     int32
X2     int32
X3     int32
X4     int32
X5     int32
X6     int32
X8     int32
X10    int64
X12    int64
dtype: object

In [59]:
x_tr.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8,X10,X12,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,32,23,17,0,3,24,9,14,0,0,...,0,0,1,0,0,0,0,0,0,0
1,32,21,19,4,3,28,11,14,0,0,...,1,0,0,0,0,0,0,0,0,0
2,20,24,34,2,3,27,9,23,0,0,...,0,0,0,0,0,0,1,0,0,0
3,20,21,34,5,3,27,11,4,0,0,...,0,0,0,0,0,0,0,0,0,0
4,20,23,34,5,3,12,3,13,0,0,...,0,0,0,0,0,0,0,0,0,0


In [60]:
le = LabelEncoder()
x_te['X0'] = le.fit_transform(x_te['X0'])
x_te['X1'] = le.fit_transform(x_te['X1'])
x_te['X2'] = le.fit_transform(x_te['X2'])
x_te['X3'] = le.fit_transform(x_te['X3'])
x_te['X4'] = le.fit_transform(x_te['X4'])
x_te['X5'] = le.fit_transform(x_te['X5'])
x_te['X6'] = le.fit_transform(x_te['X6'])
x_te['X8'] = le.fit_transform(x_te['X8'])

In [61]:
x_te.dtypes.head(10)

X0     int32
X1     int32
X2     int32
X3     int32
X4     int32
X5     int32
X6     int32
X8     int32
X10    int64
X12    int64
dtype: object

In [62]:
x_te.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8,X10,X12,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,21,23,34,5,3,26,0,22,0,0,...,0,0,0,1,0,0,0,0,0,0
1,42,3,8,0,3,9,6,24,0,0,...,0,0,1,0,0,0,0,0,0,0
2,21,23,17,5,3,0,9,9,0,0,...,0,0,0,1,0,0,0,0,0,0
3,21,13,34,5,3,31,11,13,0,0,...,0,0,0,1,0,0,0,0,0,0
4,45,20,17,2,3,30,8,12,0,0,...,1,0,0,0,0,0,0,0,0,0


### Q4.  Performing Dimensionality Reduction using PCA

#### Dropping the categorical columns

In [100]:
pca_x_tr = x_tr.drop(['X0', 'X1', 'X2', 'X3', 'X4','X5', 'X6', 'X8'], axis = 1)
                         
pca_x_tr.shape

(4209, 356)

In [119]:
pca_x_te = x_te.drop(['X0', 'X1', 'X2', 'X3', 'X4','X5', 'X6', 'X8'], axis = 1)
                         
pca_x_te.shape

(4209, 356)

In [101]:
from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(pca_x_tr, y, test_size=0.2, random_state=1)
print(x_train.shape)
print(x_val.shape)
print(y_train.shape)
print(y_val.shape)

(3367, 356)
(842, 356)
(3367,)
(842,)


In [109]:
from sklearn.decomposition import PCA
sklearn_pca = PCA(n_components= 0.95)
sklearn_pca.fit(x_train)

PCA(n_components=0.95)

In [121]:
pca.n_components_

12

In [122]:
pca.explained_variance_ratio_

array([0.13642787, 0.09702585, 0.09065984, 0.07072106, 0.06016394,
       0.04951177, 0.04145982, 0.03467996, 0.02906683, 0.02591684,
       0.02549303, 0.02083841])

sklearn_pca.explained_variance_ratio_

In [120]:
x_train_transformed = sklearn_pca.transform(x_train)

print(x_train.shape)
print(x_train_transformed.shape)
print(x_val.shape)


x_val_transformed =sklearn_pca.transform(x_val)
print(x_val_transformed.shape)

x_test_transformed =sklearn_pca.transform(pca_x_te)
print(pca_x_te.shape)
print(x_test_transformed.shape)

(3367, 356)
(3367, 71)
(842, 356)
(842, 71)
(4209, 356)
(4209, 71)


# Hence we reduced the dimensions of 
### Training data from 356 to 71 components 
### Validation data from 356 to 71 components for 95% explained variance

   

### Q5. Predicting the test_df values using XGBoost

In [31]:
import xgboost as xgb
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

In [126]:
model = xgb.XGBRegressor(objective = 'reg:linear', learning_rate = 0.1)
model.fit(x_train_transformed,y_train)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=2, num_parallel_tree=1,
             objective='reg:linear', random_state=0, reg_alpha=0, reg_lambda=1,
             scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [131]:
pred_y_val = model.predict(x_val_transformed)
print(pred_y_val)

[ 78.03309  113.05126  109.71403  101.68544   78.215294  93.654884
  90.83585   93.17285   91.546005  92.464096  80.143265  91.84404
 101.107605  93.66145   91.33606   94.66493  110.263    100.276505
 103.33485  109.81357  112.06824  103.780754  97.823     94.36754
 103.88336  116.18018   97.03719  116.87933   97.38296   92.71837
 114.81182  116.74172   95.08109  111.943825  96.37832   91.115135
  93.316574  78.66856   95.91227  112.433685 117.06427  109.24644
 109.976166  92.71908   94.66493  104.967224  94.10756  108.74544
  98.101326 109.5145    94.82296  111.02166   99.27299   96.382935
  95.838844 103.60258   91.115616  94.14897  100.831535  97.823
  93.94997   95.51436   99.965996 113.7451    95.03686   94.71697
  95.26091  113.827034  91.48711   91.33606  112.801     95.422134
  93.08083   95.03507  111.12442   94.59778  136.37294  104.967224
 113.42054   90.72338   91.75875   90.322945  95.79287   97.60802
  94.0827    94.599396  91.19249  108.59565  110.03333  112.21324
 110.2

### END OF THE PROJECT

##  THANK YOU Mr.MOHAMMED WAJAHAT and the Simplilearn team for the extraordinary guidance and support