In [3]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

# Impute missing values and label encoding for categorical features
def preprocess_data(df, is_train=True):
    # Impute missing values for numeric columns
    imputer = SimpleImputer(strategy='mean')
    df_numeric = df.select_dtypes(include=['float64', 'int64']).copy()
    df_numeric = pd.DataFrame(imputer.fit_transform(df_numeric), columns=df_numeric.columns)
    
    # Fill missing values for categorical features
    df_categorical = df.select_dtypes(include=['object']).copy()
    df_categorical = df_categorical.fillna('Unknown')

    # Label encoding for categorical variables
    label_encoders = {}
    for column in df_categorical.columns:
        label_encoders[column] = LabelEncoder()
        df_categorical[column] = label_encoders[column].fit_transform(df_categorical[column])
    
    # Combine numeric and categorical features
    df_preprocessed = pd.concat([df_numeric, df_categorical], axis=1)

    return df_preprocessed

# Load the datasets
train_data = pd.read_csv('Train_Dataset.csv', encoding='ISO-8859-1')
test_data = pd.read_csv('Test_Dataset.csv', encoding='ISO-8859-1')

# Preprocess the train and test data
X_train = preprocess_data(train_data.drop(columns=['Attrition', 'EmployeeID']))
y_train = train_data['Attrition']
X_test = preprocess_data(test_data.drop(columns=['EmployeeID']))

# Drop any rows where the target variable is NaN
train_data_cleaned = train_data.dropna(subset=['Attrition'])
X_train_cleaned = preprocess_data(train_data_cleaned.drop(columns=['Attrition', 'EmployeeID']))
y_train_cleaned = train_data_cleaned['Attrition']

# Scale the data
scaler = StandardScaler()
X_train_scaled_cleaned = scaler.fit_transform(X_train_cleaned)
X_test_scaled = scaler.transform(X_test)

# Model setup with max_depth for Gradient Boosting Classifier
log_clf = LogisticRegression(random_state=42)
rf_clf = RandomForestClassifier(random_state=42)
gb_clf = GradientBoostingClassifier(random_state=42, max_depth=10)
xgb_clf = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')

# Combine models in a Voting Classifier
voting_clf_advanced = VotingClassifier(estimators=[
    ('lr', log_clf),
    ('rf', rf_clf),
    ('gb', gb_clf),  # Updated GBC with max_depth=10
    ('xgb', xgb_clf)
], voting='soft')

# Optimized parameter grid including max_depth for GBC
param_grid_advanced = {
    'rf__n_estimators': [100],
    'gb__learning_rate': [0.1],
    'gb__n_estimators': [100],
    'xgb__learning_rate': [0.1],
    'xgb__n_estimators': [100],
    'xgb__max_depth': [5]
}

# Perform Grid Search with the updated parameter grid
grid_search_faster = GridSearchCV(voting_clf_advanced, param_grid_advanced, cv=5, scoring='accuracy')
grid_search_faster.fit(X_train_scaled_cleaned, y_train_cleaned)

# Evaluate the model
best_model_faster = grid_search_faster.best_estimator_
y_train_pred_faster = best_model_faster.predict(X_train_scaled_cleaned)

# Calculate accuracy on the training set
accuracy_faster = accuracy_score(y_train_cleaned, y_train_pred_faster)
print(f'Optimized Training Accuracy: {accuracy_faster}')

# Use the best model to predict Attrition for the test set and create the submission file
y_test_pred = best_model_faster.predict(X_test_scaled)

# Prepare the submission DataFrame
submission_df = pd.DataFrame({
    'EmployeeID': test_data['EmployeeID'],
    'Attrition': y_test_pred
})

# Ensure the file has exactly 2630 entries
submission_df_final = submission_df.head(2630)



Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Optimized Training Accuracy: 0.9998069498069498


TypeError: can only concatenate str (not "float") to str

In [7]:
# Save the DataFrame to a CSV file
submission_df_final.to_csv("submission_v19.csv", index=False)