#Import required libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import r2_score

#Read train data

In [2]:
df_train = pd.read_csv('train.csv')
print(f'Size of training set: {df_train.shape[0]} rows and {df_train.shape[1]} columns')
print(df_train.head())

Size of training set: 4209 rows and 378 columns
   ID       y  X0 X1  X2 X3 X4 X5 X6 X8  ...  X375  X376  X377  X378  X379  \
0   0  130.81   k  v  at  a  d  u  j  o  ...     0     0     1     0     0   
1   6   88.53   k  t  av  e  d  y  l  o  ...     1     0     0     0     0   
2   7   76.26  az  w   n  c  d  x  j  x  ...     0     0     0     0     0   
3   9   80.62  az  t   n  f  d  x  l  e  ...     0     0     0     0     0   
4  13   78.02  az  v   n  f  d  h  d  n  ...     0     0     0     0     0   

   X380  X382  X383  X384  X385  
0     0     0     0     0     0  
1     0     0     0     0     0  
2     0     1     0     0     0  
3     0     0     0     0     0  
4     0     0     0     0     0  

[5 rows x 378 columns]


#Collect Y values into an array

In [3]:
y_train = df_train['y'].values

#Understand data types

In [4]:
cols = [c for c in df_train.columns if 'X' in c]
print(f'Number of features: {len(cols)}')
print('Feature types:')
print(df_train[cols].dtypes.value_counts())

Number of features: 376
Feature types:
int64     368
object      8
dtype: int64


#Count data in each column

In [5]:
counts = [[], [], []]
for c in cols:
    typ = df_train[c].dtype
    uniq = len(np.unique(df_train[c]))
    if uniq == 1:
        counts[0].append(c)
    elif uniq == 2 and typ == np.int64:
        counts[1].append(c)
    else:
        counts[2].append(c)

print(f'Constant features: {len(counts[0])} Binary features: {len(counts[1])} Categorical features: {len(counts[2])}\n')
print('Constant features:', counts[0])
print('Categorical features:', counts[2])

Constant features: 12 Binary features: 356 Categorical features: 8

Constant features: ['X11', 'X93', 'X107', 'X233', 'X235', 'X268', 'X289', 'X290', 'X293', 'X297', 'X330', 'X347']
Categorical features: ['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8']


#Read test data and preprocess

In [6]:
df_test = pd.read_csv('test.csv')
usable_columns = list(set(df_train.columns) - set(['ID', 'y']))
y_train = df_train['y'].values
id_test = df_test['ID'].values
x_train = df_train[usable_columns]
x_test = df_test[usable_columns]

#Check for null values

In [7]:
def check_missing_values(df):
    if df.isnull().any().any():
        print("There are missing values in the dataframe")
    else:
        print("There are no missing values in the dataframe")

check_missing_values(x_train)
check_missing_values(x_test)

There are no missing values in the dataframe
There are no missing values in the dataframe


#Remove columns with zero variance and apply label encoder

In [8]:
# Identify columns with non-zero variance
non_zero_variance_cols = x_train.columns[x_train.apply(pd.Series.nunique) != 1]
non_zero_variance_cols = list(set(non_zero_variance_cols) & set(x_test.columns))

# Check if any columns have non-zero variance
if len(non_zero_variance_cols) == 0:
    print("No columns with non-zero variance found.")
else:
    # Remove columns with zero variance
    x_train = x_train[non_zero_variance_cols]
    x_test = x_test[non_zero_variance_cols]

    # Apply label encoding to categorical columns
    for column in non_zero_variance_cols:
        cardinality = len(np.unique(x_train[column]))
        if cardinality > 2:  # Column is categorical
            mapper = lambda x: sum([ord(digit) for digit in str(x)])
            x_train[column] = x_train[column].apply(mapper)
            x_test[column] = x_test[column].apply(mapper)

# Display the modified datasets
print("Updated training set:")
print(x_train.head())

print("\nUpdated test set:")
print(x_test.head())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_train[column] = x_train[column].apply(mapper)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_test[column] = x_test[column].apply(mapper)


Updated training set:
   X196  X259  X74  X240  X96  X37  X360  X154  X168  X21  ...  X385  X354  \
0     0     0    1     0    0    1     0     0     0    1  ...     0     1   
1     0     0    1     0    1    1     0     0     0    0  ...     0     0   
2     0     0    1     0    1    1     0     0     0    0  ...     0     1   
3     0     0    1     0    1    1     0     0     0    0  ...     0     0   
4     0     0    1     0    1    1     0     0     0    0  ...     0     0   

   X344  X318  X169  X126  X279  X116  X300  X23  
0     0     0     0     0     0     1     0    0  
1     0     0     0     0     0     0     0    0  
2     0     0     1     0     1     0     0    0  
3     0     0     0     0     1     0     0    0  
4     0     0     0     0     1     0     0    0  

[5 rows x 364 columns]

Updated test set:
   X196  X259  X74  X240  X96  X37  X360  X154  X168  X21  ...  X385  X354  \
0     0     0    1     0    1    1     0     0     0    0  ...     0     0   
1   

#Confirm data is now numerical

In [9]:
print('Feature types:')
print(x_train[non_zero_variance_cols].dtypes.value_counts())

Feature types:
int64    364
dtype: int64


#Perform dimensionality reduction

In [10]:
n_comp = 12
pca = PCA(n_components=n_comp, random_state=420)
pca2_results_train = pca.fit_transform(x_train)
pca2_results_test = pca.transform(x_test)

#Training using xgboost

In [11]:
x_train, x_valid, y_train, y_valid = train_test_split(
    pca2_results_train, y_train, test_size=0.2, random_state=4242)
d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)
d_test = xgb.DMatrix(pca2_results_test)

params = {'objective': 'reg:linear', 'eta': 0.02, 'max_depth': 4}

def xgb_r2_score(preds, dtrain):
    labels = dtrain.get_label()
    return 'r2', r2_score(labels, preds)

watchlist = [(d_train, 'train'), (d_valid, 'valid')]

clf = xgb.train(params, d_train, 1000, watchlist, early_stopping_rounds=50,
                feval=xgb_r2_score, maximize=True, verbose_eval=10)

[0]	train-rmse:12.78419	train-r2:0.01323	valid-rmse:11.78154	valid-r2:0.01330
[10]	train-rmse:12.06996	train-r2:0.12040	valid-rmse:10.91823	valid-r2:0.15260




[20]	train-rmse:11.54919	train-r2:0.19467	valid-rmse:10.30543	valid-r2:0.24506
[30]	train-rmse:11.15053	train-r2:0.24931	valid-rmse:9.85657	valid-r2:0.30939
[40]	train-rmse:10.85069	train-r2:0.28914	valid-rmse:9.54286	valid-r2:0.35265
[50]	train-rmse:10.61450	train-r2:0.31975	valid-rmse:9.31130	valid-r2:0.38368
[60]	train-rmse:10.42328	train-r2:0.34404	valid-rmse:9.12627	valid-r2:0.40794
[70]	train-rmse:10.18339	train-r2:0.37388	valid-rmse:8.94679	valid-r2:0.43100
[80]	train-rmse:9.98003	train-r2:0.39864	valid-rmse:8.80931	valid-r2:0.44835
[90]	train-rmse:9.80387	train-r2:0.41968	valid-rmse:8.70424	valid-r2:0.46143
[100]	train-rmse:9.66411	train-r2:0.43611	valid-rmse:8.62534	valid-r2:0.47115
[110]	train-rmse:9.53922	train-r2:0.45059	valid-rmse:8.55633	valid-r2:0.47958
[120]	train-rmse:9.43250	train-r2:0.46281	valid-rmse:8.50798	valid-r2:0.48544
[130]	train-rmse:9.33805	train-r2:0.47352	valid-rmse:8.47704	valid-r2:0.48918
[140]	train-rmse:9.25453	train-r2:0.48289	valid-rmse:8.44551	vali

#Predict test values using xgboost

In [12]:
p_test = clf.predict(d_test)

sub = pd.DataFrame({'ID': id_test, 'y': p_test})
sub.to_csv('xgb.csv', index=False)
sub.head()

Unnamed: 0,ID,y
0,1,82.513329
1,2,95.007858
2,3,81.978683
3,4,77.199409
4,5,113.002701
