In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score
from scipy.stats import randint

In [8]:
# Load datasets
train_path = './data/train.csv'
test_path = './data/test.csv'
sample_submission_path = './data/sample_submission.csv'

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
sample_submission = pd.read_csv(sample_submission_path)
sample_submission_df = pd.read_csv(sample_submission_path)

In [9]:
# Fill missing values in 'PER' and 'ROE' columns with the median value of each column
train_df['PER'].fillna(train_df['PER'].median(), inplace=True)
train_df['ROE'].fillna(train_df['ROE'].median(), inplace=True)
test_df['PER'].fillna(test_df['PER'].median(), inplace=True)
test_df['ROE'].fillna(test_df['ROE'].median(), inplace=True)

# Features and target variable
X = train_df.drop(columns=['id', 'label'])
y = train_df['label']

# Standardize the feature data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
test_df_scaled = scaler.transform(test_df.drop(columns=['id']))

# Split the training data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize the models with RandomizedSearchCV for hyperparameter tuning
rf_model = RandomForestClassifier(random_state=42)
rf_param_dist = {
    'n_estimators': randint(100, 500),
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': randint(2, 11),
    'min_samples_leaf': randint(1, 11)
}

rf_random = RandomizedSearchCV(estimator=rf_model, param_distributions=rf_param_dist, n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)

# Fit the RandomizedSearchCV model
rf_random.fit(X_train, y_train)

# Extract the best model from RandomizedSearchCV
best_rf_model = rf_random.best_estimator_

# Initialize and fit GradientBoostingClassifier
gb_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gb_model.fit(X_train, y_train)

# Initialize and fit ExtraTreesClassifier
et_model = ExtraTreesClassifier(n_estimators=200, max_depth=20, random_state=42)
et_model.fit(X_train, y_train)

# Create a VotingClassifier with the best model from RandomizedSearchCV, GradientBoosting, and ExtraTrees
voting_model = VotingClassifier(estimators=[('rf', best_rf_model), ('gb', gb_model), ('et', et_model)], voting='hard')

# Train the VotingClassifier
voting_model.fit(X_train, y_train)

# Validate the model
y_pred = voting_model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {accuracy}")

# Make predictions on the test set
test_predictions = voting_model.predict(test_df_scaled)

# Prepare the submission file
sample_submission['label'] = test_predictions
sample_submission.to_csv('submission.csv', index=False)

print("Submission file has been created.")

Fitting 3 folds for each of 100 candidates, totalling 300 fits


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['PER'].fillna(train_df['PER'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['ROE'].fillna(train_df['ROE'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate objec

[CV] END max_depth=30, min_samples_leaf=8, min_samples_split=6, n_estimators=202; total time=   0.6s
[CV] END max_depth=20, min_samples_leaf=8, min_samples_split=6, n_estimators=199; total time=   0.6s
[CV] END max_depth=30, min_samples_leaf=8, min_samples_split=6, n_estimators=202; total time=   0.6s
[CV] END max_depth=20, min_samples_leaf=8, min_samples_split=6, n_estimators=199; total time=   0.6s
[CV] END max_depth=30, min_samples_leaf=8, min_samples_split=6, n_estimators=202; total time=   0.6s
[CV] END max_depth=20, min_samples_leaf=8, min_samples_split=6, n_estimators=199; total time=   0.5s
[CV] END max_depth=10, min_samples_leaf=3, min_samples_split=8, n_estimators=430; total time=   1.2s
[CV] END max_depth=10, min_samples_leaf=3, min_samples_split=8, n_estimators=430; total time=   1.3s
[CV] END max_depth=10, min_samples_leaf=3, min_samples_split=8, n_estimators=430; total time=   1.3s
[CV] END max_depth=30, min_samples_leaf=6, min_samples_split=3, n_estimators=291; total tim