In [1]:
!pip install xgboost



In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

Train shape: (4209, 378)
Test shape: (4209, 377)


In [7]:
# Step 1: Remove Columns with Zero Variance
zero_var_cols = train_df.columns[train_df.nunique() == 1]
print("Zero Variance Columns:", list(zero_var_cols))

train_df.drop(columns=zero_var_cols, inplace=True)
test_df.drop(columns=zero_var_cols, inplace=True)

Zero Variance Columns: ['X11', 'X93', 'X107', 'X233', 'X235', 'X268', 'X289', 'X290', 'X293', 'X297', 'X330', 'X347']


In [9]:
# Step 2: Check for Null Values
print("Train Null Values:\n", train_df.isnull().sum().sum())
print("Test Null Values:\n", test_df.isnull().sum().sum())



Train Null Values:
 0
Test Null Values:
 0


In [11]:
# Step 3: Check Unique Values
print("Train Unique Values:\n", train_df.nunique().sort_values())

Train Unique Values:
 X190       2
X259       2
X258       2
X257       2
X256       2
        ... 
X5        29
X2        44
X0        47
y       2545
ID      4209
Length: 366, dtype: int64


In [13]:
# Step 4: Separate Features and Target
X = train_df.drop(['y'], axis=1)  # Assuming 'y' is the target
y = train_df['y']

# Make sure test_df and X have the same columns
assert set(X.columns) == set(test_df.columns)

In [15]:
# Step 5: Safe Label Encoding for Categorical Features

from sklearn.preprocessing import LabelEncoder

# Combine train and test data for consistent encoding
combined_df = pd.concat([X, test_df], axis=0)

# Loop through categorical columns only
categorical_cols = combined_df.select_dtypes(include=['object']).columns

for col in categorical_cols:
    le = LabelEncoder()
    combined_df[col] = combined_df[col].astype(str)  # Ensure string type
    combined_df[col] = le.fit_transform(combined_df[col])

# Split back to X and test_df
X = combined_df.iloc[:len(X)]
test_df = combined_df.iloc[len(X):]

In [17]:
# Step 6: Dimensionality Reduction with PCA
pca = PCA(n_components=0.95)  # Retain 95% of variance
X_pca = pca.fit_transform(X)
test_pca = pca.transform(test_df)

print("Original shape:", X.shape)
print("Reduced shape:", X_pca.shape)

Original shape: (4209, 365)
Reduced shape: (4209, 1)


In [21]:
# Step 7: Train XGBoost Model
from xgboost import XGBRegressor

# Define the model and set hyperparameters
xgb = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
    verbosity=1  # Can be set to 0 to suppress output
)

# Fit the model
xgb.fit(X_pca, y)

In [23]:
# Step 8: Make predictions
predictions = xgb.predict(test_pca)

In [25]:
#  Step 9: Predict on Test Data
predictions = xgb.predict(test_pca)

#  Step 10: Export Results
output = pd.DataFrame({
    'ID': test_df['ID'],  # Adjust if test set has an 'ID' column
    'y': predictions
})
output.to_csv('submission.csv', index=False)
print("✅ Submission file saved as 'submission.csv'")

✅ Submission file saved as 'submission.csv'
