In [2]:
pip install -U imbalanced-learn

Note: you may need to restart the kernel to use updated packages.


In [4]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, fbeta_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

In [8]:
test_data = pd.read_csv(r"D:\PDFs\Edvancer Eduventures\Certified Machine Learning Expert\Projects\Marketing\carvan_test.csv") 
train_data = pd.read_csv(r"D:\PDFs\Edvancer Eduventures\Certified Machine Learning Expert\Projects\Marketing\carvan_train.csv")

In [10]:
X = train_data.drop(columns=['V86'])
y = train_data['V86']

In [12]:
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)
test_data = imputer.transform(test_data)

In [14]:
scaler = StandardScaler()
X = scaler.fit_transform(X)
test_data = scaler.transform(test_data) 

In [16]:
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

In [18]:
X_train, X_val, y_train, y_val = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

In [20]:
rf_model = RandomForestClassifier(random_state=42) 

In [22]:
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [24]:
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2) 

In [26]:
grid_search.fit(X_train, y_train) 

Fitting 5 folds for each of 81 candidates, totalling 405 fits


In [28]:
best_rf_model = grid_search.best_estimator_ 

In [30]:
y_val_pred_proba = best_rf_model.predict_proba(X_val)[:, 1]
threshold = 0.3  # Adjusting the threshold
y_val_pred = (y_val_pred_proba >= threshold).astype(int)

In [32]:
# List of thresholds to evaluate
thresholds = np.arange(0.1, 0.9, 0.05)

In [34]:
# Initialize lists to store metrics
accuracy_scores = []
f1_scores = []
fbeta_scores = []
roc_auc_scores = []

# Evaluate the model at each threshold
for threshold in thresholds:
    # Generate predictions based on the current threshold
    y_val_pred = (y_val_pred_proba >= threshold).astype(int)
    
    # Calculate accuracy, F1-score, ROC AUC, and F-beta score
    accuracy = accuracy_score(y_val, y_val_pred)
    f1 = fbeta_score(y_val, y_val_pred, beta=1, zero_division=0)  # F1-score (beta=1)
    fbeta = fbeta_score(y_val, y_val_pred, beta=0.5, zero_division=0)  # Adjust beta as needed
    roc_auc = roc_auc_score(y_val, y_val_pred_proba)
    
    # Append metrics to respective lists
    accuracy_scores.append(accuracy)
    f1_scores.append(f1)
    fbeta_scores.append(fbeta)
    roc_auc_scores.append(roc_auc)

# Find the threshold that maximizes the F1-score (or F-beta score if preferred)
optimal_threshold_index = np.argmax(f1_scores)  # You can change this to fbeta_scores or another metric
optimal_threshold = thresholds[optimal_threshold_index]

In [36]:
print(f"Optimal Threshold (based on F1-score): {optimal_threshold}")
print(f"Validation Accuracy at Optimal Threshold: {accuracy_scores[optimal_threshold_index]}")
print(f"Validation F1-score at Optimal Threshold: {f1_scores[optimal_threshold_index]}")
print(f"Validation ROC AUC Score at Optimal Threshold: {roc_auc_scores[optimal_threshold_index]}")
print(f"Validation F-beta Score (beta=0.5) at Optimal Threshold: {fbeta_scores[optimal_threshold_index]}")

Optimal Threshold (based on F1-score): 0.5500000000000002
Validation Accuracy at Optimal Threshold: 0.9593607305936073
Validation F1-score at Optimal Threshold: 0.9580386610089581
Validation ROC AUC Score at Optimal Threshold: 0.9869753889198003
Validation F-beta Score (beta=0.5) at Optimal Threshold: 0.9681722889270059


In [38]:
y_test_pred_proba = best_rf_model.predict_proba(test_data)[:, 1]

In [40]:
# Predict on the test set using the optimal threshold
y_test_pred = (y_test_pred_proba >= optimal_threshold).astype(int)

In [42]:
# Save the predictions to a CSV file
submission = pd.DataFrame({'V86': y_test_pred})
submission.to_csv('submission.csv', index=False)