In [9]:
# Importing libraries
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA


# Importing train.csv
df_train = pd.read_csv('train.csv')

print('Size of training set: {} rows and {} columns'.format(*df_train.shape))
print('-------------------------------------------------------------------------')

df_train.head()


# Collecting Y values in one array
y_train = df_train['y'].values


# Iterating through columns with X in their name
cols = [c for c in df_train.columns if 'X' in c]
print('Number of features: {}'.format(len(cols)))

print('Feature types:')
df_train[cols].dtypes.value_counts()

# Counting data in each columns
count = [[],[],[]]
for c in cols:
    typp = df_train[c].dtype
    unq = len(np.unique(df_train[c]))
    if unq == 1:
        count[0].append(c)
    elif unq == 2 and typp == np.int64:
        count[1].append(c)
    else:
        count[2].append(c)
        
print('Constant features: {} | Binary features: {} | Categorical features: {}\n'
      .format(*[len(c) for c in count]))
print('Constant features:', count[0]) #constant features can also be called as useless features
print('Categorical features:', count[2])
print('--------------------------------------------------------------------------')


# Importing test.csv
df_test = pd.read_csv('test.csv')

useful_columns = list(set(df_train.columns) - set(['ID', 'y']))
y_train = df_train['y'].values
id_test = df_test['ID'].values

x_train = df_train[useful_columns]
x_test = df_test[useful_columns]


# Checking for null and unique values in train and test datasets
def missing_values(df):
    if df.isnull().any().any():
        print('Missing values present')
    else:
        print('NO missing values present')

missing_values(x_train)
missing_values(x_test)


# For any columns, if variance is zero, remove that columns
# Apply LabelEncoder
for column in useful_columns:
    vari = len(np.unique(x_train[column]))
    if vari == 1:
        x_train.drop(column, axis=1)
        x_test.drop(column, axis=1)
    if vari > 2:
        map = lambda x: sum([ord(digit) for digit in x])
        x_train[column] = x_train[column].apply(map)
        x_test[column] = x_test[column].apply(map)
x_train.head

print('Feature types:')
x_train[cols].dtypes.value_counts()


# Perform Dimensionality Reduction
n_comp = 12
pca = PCA(n_components=n_comp, random_state=420)
pcat_results_train = pca.fit_transform(x_train)
pcat_results_test = pca.transform(x_test)


# XGBoost
import xgboost as xgb
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

x_train, x_validation, y_train, y_validation = train_test_split(pcat_results_train, y_train, test_size=0.2, random_state=4242)

d_train = xgb.DMatrix(x_train, label=y_train)
d_validation = xgb.DMatrix(x_validation, label=y_validation)
d_test = xgb.DMatrix(pcat_results_test)

params = {}
params['objective'] = 'reg:linear'
params['eta'] = 0.02
params['max_depth'] = 4

def xgb_r2_score(preds, dtrain):
    labels = dtrain.get_label()
    return 'r2', r2_score(labels, preds)

watchlist = [(d_train, 'train'), (d_validation, 'validation')]

clf = xgb.train(params, d_train,
                1000, watchlist, early_stopping_rounds=50,
                feval=xgb_r2_score, maximize=True, verbose_eval=10)



# Predict your df_test values using XGBoost
p_test = clf.predict(d_test)

sub = pd.DataFrame()
sub['ID'] = id_test
sub['y'] = p_test
sub.to_csv('xgb.csv', index=False)

sub.head()

Size of training set: 4209 rows and 378 columns
-------------------------------------------------------------------------
Number of features: 376
Feature types:
Constant features: 12 | Binary features: 356 | Categorical features: 8

Constant features: ['X11', 'X93', 'X107', 'X233', 'X235', 'X268', 'X289', 'X290', 'X293', 'X297', 'X330', 'X347']
Categorical features: ['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8']
--------------------------------------------------------------------------
NO missing values present
NO missing values present


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_train[column] = x_train[column].apply(map)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_test[column] = x_test[column].apply(map)


Feature types:
[0]	train-rmse:99.14834	validation-rmse:98.26296	train-r2:-58.35295	validation-r2:-67.63754
Multiple eval metrics have been passed: 'validation-r2' will be used for early stopping.

Will train until validation-r2 hasn't improved in 50 rounds.
[10]	train-rmse:81.27653	validation-rmse:80.36433	train-r2:-38.88428	validation-r2:-44.91014
[20]	train-rmse:66.71610	validation-rmse:65.77334	train-r2:-25.87403	validation-r2:-29.75260
[30]	train-rmse:54.86956	validation-rmse:53.88963	train-r2:-17.17751	validation-r2:-19.64393
[40]	train-rmse:45.24492	validation-rmse:44.21995	train-r2:-11.35979	validation-r2:-12.90012
[50]	train-rmse:37.44736	validation-rmse:36.37245	train-r2:-7.46669	validation-r2:-8.40431
[60]	train-rmse:31.14760	validation-rmse:30.01883	train-r2:-4.85761	validation-r2:-5.40575
[70]	train-rmse:26.08679	validation-rmse:24.90901	train-r2:-3.10878	validation-r2:-3.41057
[80]	train-rmse:22.04667	validation-rmse:20.83109	train-r2:-1.93466	validation-r2:-2.08465
[90]	t

Unnamed: 0,ID,y
0,1,82.515434
1,2,97.188736
2,3,82.554169
3,4,76.55217
4,5,112.846581
