### Following actions should be performed:

1. If for any column(s), the variance is equal to zero, then you need to remove those variable(s).
2. Check for null and unique values for test and train sets.
3. Apply label encoder.
4. Perform dimensionality reduction.
5. Predict your test_df values using XGBoost.

#### Step1: Import the required libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

#### Step2: Reading data from csv file (creating dataframe)

In [2]:
training_dataset = pd.read_csv('train.csv', index_col = 'ID')
testing_dataset = pd.read_csv('test.csv', index_col = 'ID')

In [3]:
# fetching the shape of data

print (training_dataset.shape)
print (testing_dataset.shape)

(4209, 377)
(4209, 376)


In [4]:
# view how the data looks by printing first 5 rows

training_dataset.head()

Unnamed: 0_level_0,y,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,130.81,k,v,at,a,d,u,j,o,0,...,0,0,1,0,0,0,0,0,0,0
6,88.53,k,t,av,e,d,y,l,o,0,...,1,0,0,0,0,0,0,0,0,0
7,76.26,az,w,n,c,d,x,j,x,0,...,0,0,0,0,0,0,1,0,0,0
9,80.62,az,t,n,f,d,x,l,e,0,...,0,0,0,0,0,0,0,0,0,0
13,78.02,az,v,n,f,d,h,d,n,0,...,0,0,0,0,0,0,0,0,0,0


#### Step3: Preparing X_train, y_train, X_test

In [5]:
X_train = training_dataset.drop(['y'], axis=1)
y_train = training_dataset['y']

X_test = testing_dataset

In [6]:
print(X_train.shape)
print(X_test.shape)

(4209, 376)
(4209, 376)


In [7]:
X_train.head()

Unnamed: 0_level_0,X0,X1,X2,X3,X4,X5,X6,X8,X10,X11,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,k,v,at,a,d,u,j,o,0,0,...,0,0,1,0,0,0,0,0,0,0
6,k,t,av,e,d,y,l,o,0,0,...,1,0,0,0,0,0,0,0,0,0
7,az,w,n,c,d,x,j,x,0,0,...,0,0,0,0,0,0,1,0,0,0
9,az,t,n,f,d,x,l,e,0,0,...,0,0,0,0,0,0,0,0,0,0
13,az,v,n,f,d,h,d,n,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Step4: Data Preprocessing

In [8]:
X_train.dtypes.value_counts()

int64     368
object      8
dtype: int64

In [9]:
#find columns having different types of features

constant_features = []
binary_features = []
categroial_features = []
for c in X_train.columns:
    typ = X_train[c].dtype
    uniq = len(np.unique(X_train[c]))
    if uniq == 1:
        constant_features.append(c)
    elif uniq == 2 and typ == np.int64:
        binary_features.append(c)
    else:
        categroial_features.append(c)

print('Constant features:', constant_features)
print('Categorical features:', categroial_features)


Constant features: ['X11', 'X93', 'X107', 'X233', 'X235', 'X268', 'X289', 'X290', 'X293', 'X297', 'X330', 'X347']
Categorical features: ['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8']


In [10]:
# Dropping the columns having variance Zero

for col in constant_features:
    X_train.drop(col,inplace=True,axis=1)
    X_test.drop(col,inplace=True,axis=1)

In [11]:
print(X_train.shape)
print(X_test.shape)

(4209, 364)
(4209, 364)


In [12]:
# number of independent variables (features) = total columns - 1
# exluding id

total_features = X_train.shape[1]-1
total_features

363

In [13]:
# Apply label encoder

for column in categroial_features:
    mapper = lambda x: sum([ord(digit) for digit in x])
    X_train[column] = X_train[column].apply(mapper)
    X_test[column] = X_test[column].apply(mapper)
    
X_train.head()

Unnamed: 0_level_0,X0,X1,X2,X3,X4,X5,X6,X8,X10,X12,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,107,118,213,97,100,117,106,111,0,0,...,0,0,1,0,0,0,0,0,0,0
6,107,116,215,101,100,121,108,111,0,0,...,1,0,0,0,0,0,0,0,0,0
7,219,119,110,99,100,120,106,120,0,0,...,0,0,0,0,0,0,1,0,0,0
9,219,116,110,102,100,120,108,101,0,0,...,0,0,0,0,0,0,0,0,0,0
13,219,118,110,102,100,104,100,110,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
X_train.dtypes.value_counts()

int64    364
dtype: int64