In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import xgboost as xgb
from sklearn.metrics import accuracy_score
from itertools import combinations

In [22]:
# Load the training data (it carries the BuyEV label) into the working frame.
df = pd.read_csv(filepath_or_buffer='EV_train.csv')

In [23]:
# Winsorise numeric columns with the 1.5 * IQR rule: values beyond a fence are
# pulled back to that fence rather than being dropped.
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns

# The label must never be capped. For a 0/1 target whose rarer class is below
# 25%, both quartiles coincide, the IQR is 0, and every label would collapse to
# a single value. `errors='ignore'` keeps this safe if BuyEV is not numeric.
# NOTE(review): rare-class binary *features* are still flattened by the same
# rule — confirm that is intended.
cols_to_clip = numerical_cols.drop('BuyEV', errors='ignore')

for col in cols_to_clip:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    # Cast to float64 so every capped column ends up with one uniform dtype,
    # whether or not a fractional fence was actually applied to it.
    df[col] = df[col].clip(lower=lower_bound, upper=upper_bound).astype('float64')

In [24]:
# Target vector.
y = df['BuyEV']

# Feature matrix: every column except the target and four fields left out of the model.
X = df.drop(
    columns=["BuyEV", "state_precip", "state_high_temp", "race", "license"],
)

# Names of the numeric features that remain after the drop.
numerical_cols_updated = X.select_dtypes(include=['int64', 'float64']).columns


In [25]:

# Object-dtype features are treated as categoricals.
categorical_cols = X.select_dtypes(include=['object']).columns

# Numeric branch: standardise, then expand to degree-2 polynomial terms
# (no constant/bias column).
numerical_transformer = Pipeline([
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
])

# Categorical branch: one-hot encode, tolerating categories not seen during fit.
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its branch.
preprocessor = ColumnTransformer(
    [
        ('num', numerical_transformer, numerical_cols_updated),
        ('cat', categorical_transformer, categorical_cols),
    ]
)

In [26]:
# Hold out 20% of the rows for evaluation; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the preprocessing on the training rows only and wrap the result for XGBoost.
X_train_transformed = preprocessor.fit_transform(X_train)
dtrain = xgb.DMatrix(X_train_transformed, label=y_train)

# Reuse the already-fitted preprocessing on the held-out rows.
X_test_transformed = preprocessor.transform(X_test)
dtest = xgb.DMatrix(X_test_transformed, label=y_test)

In [27]:
# Booster settings; the same dict is passed to cross-validation and to the final fit.
params = dict(
    reg_lambda=0,
    objective='binary:logistic',
    max_depth=6,
    learning_rate=0.01,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='logloss',
    seed=42,
)

# 5-fold cross-validation on the training matrix: at most 1100 boosting rounds,
# with early stopping after 50 rounds without improvement in logloss.
cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=1100,
    nfold=5,
    metrics='logloss',
    early_stopping_rounds=50,
    seed=42,
)

# Report the lowest mean validation logloss and the row label at which it occurs.
print(f"Best logloss: {cv_results['test-logloss-mean'].min()}, at round: {cv_results['test-logloss-mean'].idxmin()}")


Best logloss: 0.5133258093215258, at round: 1050


In [28]:
# `idxmin()` gives the 0-based row of the best cross-validation round, whereas
# `num_boost_round` is a count of trees. Add one so the final model is trained
# for exactly the number of rounds that cross-validation selected.
best_num_rounds = int(cv_results['test-logloss-mean'].idxmin()) + 1

# Refit on the full training matrix with the selected number of rounds.
final_model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=best_num_rounds
)

# Predicted probabilities -> hard 0/1 labels at a 0.5 threshold (kept as a plain list).
preds_prob = final_model.predict(dtest)
predictions = (preds_prob > 0.5).astype(int).tolist()

# Accuracy on the 20% hold-out split.
accuracy = accuracy_score(y_test, predictions)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Test Accuracy: 79.03%


In [29]:
# Score EV_X_test.csv with the trained model and write one 0/1 prediction per row.
test_df = pd.read_csv('EV_X_test.csv')

# Drop the same four fields that were excluded from the training features.
X_test_final = test_df.drop(
    columns=["state_precip", "state_high_temp", "race", "license"],
)

# NOTE(review): the training frame was IQR-capped before the preprocessor was
# fitted, but no capping is applied to this file — confirm that is intended.

# Apply the already-fitted preprocessing (transform only, never re-fit here).
X_test_final_transformed = preprocessor.transform(X_test_final)
dtest_final = xgb.DMatrix(X_test_final_transformed)

# Probabilities -> hard labels with the same 0.5 threshold used for evaluation.
final_preds_prob = final_model.predict(dtest_final)
final_predictions = (final_preds_prob > 0.5).astype(int).tolist()

# Single column, written without header or index.
predictions_df = pd.DataFrame(final_predictions, columns=['BuyEV_Prediction'])
predictions_df.to_csv('EV_test_pred_qc7205.csv', index=False, header=False)

print("Predictions saved to EV_test_pred_qc7205.csv")

Predictions saved to EV_test_pred_qc7205.csv
