## Step 1: Import Required Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import TruncatedSVD
import xgboost as xgb

## Step 2: Read the Data from train.csv

In [2]:
train_df = pd.read_csv(r"C:\Users\sapan\Downloads\train.csv")
print(train_df.head())

   ID       y  X0 X1  X2 X3 X4 X5 X6 X8  ...  X375  X376  X377  X378  X379  \
0   0  130.81   k  v  at  a  d  u  j  o  ...     0     0     1     0     0   
1   6   88.53   k  t  av  e  d  y  l  o  ...     1     0     0     0     0   
2   7   76.26  az  w   n  c  d  x  j  x  ...     0     0     0     0     0   
3   9   80.62  az  t   n  f  d  x  l  e  ...     0     0     0     0     0   
4  13   78.02  az  v   n  f  d  h  d  n  ...     0     0     0     0     0   

   X380  X382  X383  X384  X385  
0     0     0     0     0     0  
1     0     0     0     0     0  
2     0     1     0     0     0  
3     0     0     0     0     0  
4     0     0     0     0     0  

[5 rows x 378 columns]


## Step 3: Separate the Target Variable

In [3]:
# Separating X and y from the dataframe
y = train_df['y'].values
X = train_df.drop(['ID', 'y'], axis=1)

## Step 4: Understand Data Types

In [4]:
print(X.dtypes)

X0      object
X1      object
X2      object
X3      object
X4      object
         ...  
X380     int64
X382     int64
X383     int64
X384     int64
X385     int64
Length: 376, dtype: object


## Step 5: Count Data in Columns

In [5]:
print(X.apply(pd.Series.nunique))

X0      47
X1      27
X2      44
X3       7
X4       4
        ..
X380     2
X382     2
X383     2
X384     2
X385     2
Length: 376, dtype: int64


## Step 6: Read test.csv Data

In [6]:
test_df = pd.read_csv(r"C:\Users\sapan\Downloads\test.csv")
# Dropping the ID column from the test dataframe
test_df = test_df.drop(['ID'], axis=1)

## Step 7: Check for Null Values

In [7]:
print(train_df.isnull().sum())
print(test_df.isnull().sum())

ID      0
y       0
X0      0
X1      0
X2      0
       ..
X380    0
X382    0
X383    0
X384    0
X385    0
Length: 378, dtype: int64
X0      0
X1      0
X2      0
X3      0
X4      0
       ..
X380    0
X382    0
X383    0
X384    0
X385    0
Length: 376, dtype: int64


## Step 8: Remove Zero-Variance Columns and Apply Label Encoder

In [8]:
# Combining train and test data for preprocessing
train_labels = train_df['y']
train_df.drop(['y'], axis=1, inplace=True)
df = pd.concat([train_df, test_df])

# Apply label encoding and remove zero-variance features on combined data
for column in df.columns:
    if df[column].dtype == type(object):
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column].astype(str))

df = df.loc[:, df.var() != 0]

# Split the combined dataset back into train and test
train_df = df.iloc[:train_df.shape[0]]
test_df = df.iloc[train_df.shape[0]:]

# Add the target variable back to train_df
train_df['y'] = train_labels

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['y'] = train_labels


## Step 9: Checking if the data is in numerical format

In [9]:
print(train_df.dtypes)
print(test_df.dtypes)

ID      float64
X0        int32
X1        int32
X2        int32
X3        int32
         ...   
X382      int64
X383      int64
X384      int64
X385      int64
y       float64
Length: 378, dtype: object
ID      float64
X0        int32
X1        int32
X2        int32
X3        int32
         ...   
X380      int64
X382      int64
X383      int64
X384      int64
X385      int64
Length: 377, dtype: object


## Step 10: Dimensionality Reduction using SVD

In [10]:
svd = TruncatedSVD(n_components=20, random_state=42)
X_dim_red = svd.fit_transform(train_df.drop(['ID', 'y'], axis=1))
test_dim_red = svd.transform(test_df.drop(['ID'], axis=1))

## Step 11: Train the Model using XGBoost

In [11]:
model = xgb.XGBRegressor(objective ='reg:squarederror')
model.fit(X_dim_red, y)

## Step 12: Predict test_df Values

In [12]:
predictions = model.predict(test_dim_red)

## Step 13: Evaluate the performance

In [14]:
from sklearn.metrics import mean_squared_error, r2_score

# Predict on the training set
train_predictions = model.predict(X_dim_red)

# Calculate MSE and R² Score
mse = mean_squared_error(train_df['y'], train_predictions)
r2 = r2_score(train_df['y'], train_predictions)

print("Mean Squared Error on Training Set:", mse)
print("R² Score on Training Set:", r2)

Mean Squared Error on Training Set: 9.239454067854
R² Score on Training Set: 0.9425151527087171


## Hence, R² = 0.94 indicates a high level of correlation between our model's predictions and the actual values.