# Importing Libraries

In [1]:
pip install -U imbalanced-learn

Collecting imbalanced-learn
  Downloading imbalanced_learn-0.13.0-py3-none-any.whl.metadata (8.8 kB)
Collecting sklearn-compat<1,>=0.1 (from imbalanced-learn)
  Downloading sklearn_compat-0.1.3-py3-none-any.whl.metadata (18 kB)
Downloading imbalanced_learn-0.13.0-py3-none-any.whl (238 kB)
Downloading sklearn_compat-0.1.3-py3-none-any.whl (18 kB)
Installing collected packages: sklearn-compat, imbalanced-learn
  Attempting uninstall: imbalanced-learn
    Found existing installation: imbalanced-learn 0.12.3
    Uninstalling imbalanced-learn-0.12.3:
      Successfully uninstalled imbalanced-learn-0.12.3
Successfully installed imbalanced-learn-0.13.0 sklearn-compat-0.1.3
Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, fbeta_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

# Load Datasets

In [3]:
test_data = pd.read_csv(r"D:\PDFs\Edvancer Eduventures\Certified Machine Learning Expert\Projects\Marketing\carvan_test.csv") 
train_data = pd.read_csv(r"D:\PDFs\Edvancer Eduventures\Certified Machine Learning Expert\Projects\Marketing\carvan_train.csv")

# Data Preprocessing

In [4]:
X = train_data.drop(columns=['V86'])

In [5]:
y = train_data['V86']

# Handling Missing Values

In [6]:
imputer = SimpleImputer(strategy='mean')

In [7]:
X = imputer.fit_transform(X)

In [8]:
test_data = imputer.transform(test_data)

# Feature Scaling

In [9]:
scaler = StandardScaler()

In [10]:
X = scaler.fit_transform(X)

In [11]:
test_data = scaler.transform(test_data) 

In [12]:
test_data

array([[ 0.68090623, -0.27257995,  1.67289335, ..., -0.15062046,
        -0.08734772, -0.11881647],
       [-1.42098041, -0.27257995,  0.40669664, ..., -0.15062046,
        -0.08734772, -0.11881647],
       [ 1.14799215, -0.27257995,  0.40669664, ..., -0.15062046,
        -0.08734772, -0.11881647],
       ...,
       [ 0.91444919, -0.27257995, -0.85950007, ..., -0.15062046,
        10.96783552, -0.11881647],
       [ 0.68090623, -0.27257995,  0.40669664, ..., -0.15062046,
        -0.08734772, -0.11881647],
       [-1.26528511, -0.27257995, -0.85950007, ..., -0.15062046,
        -0.08734772, -0.11881647]])

# Handle Imbalanced Dataset using SMOTE

In [13]:
smote = SMOTE(random_state=42)

In [14]:
X_res, y_res = smote.fit_resample(X, y)

# Train-Test Split

In [15]:
X_train, X_val, y_train, y_val = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

# Applying Random Trees Algorithm

In [16]:
rf_model = RandomForestClassifier(random_state=42) 

In [17]:
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [18]:
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2) 

In [19]:
grid_search.fit(X_train, y_train) 

Fitting 5 folds for each of 81 candidates, totalling 405 fits


In [20]:
best_rf_model = grid_search.best_estimator_ 

In [23]:
y_val_pred_proba = best_rf_model.predict_proba(X_val)[:, 1]

In [24]:
threshold = 0.3  # Adjusting the threshold
y_val_pred = (y_val_pred_proba >= threshold).astype(int)

In [25]:
# List of thresholds to evaluate
thresholds = np.arange(0.1, 0.9, 0.05)

# Initialize lists to store metrics

In [26]:
accuracy_scores = []
f1_scores = []
fbeta_scores = []
roc_auc_scores = []

In [27]:
for threshold in thresholds:
    y_val_pred = (y_val_pred_proba >= threshold).astype(int)

    # Calculate accuracy, F1-score, ROC AUC, and F-beta score
    accuracy = accuracy_score(y_val, y_val_pred)
    f1 = fbeta_score(y_val, y_val_pred, beta=1, zero_division=0)  # F1-score (beta=1)
    fbeta = fbeta_score(y_val, y_val_pred, beta=0.5, zero_division=0)  # Adjust beta as needed
    roc_auc = roc_auc_score(y_val, y_val_pred_proba)
    
    # Append metrics to respective lists
    accuracy_scores.append(accuracy)
    f1_scores.append(f1)
    fbeta_scores.append(fbeta)
    roc_auc_scores.append(roc_auc)

# Find the threshold that maximizes the F1-score (or F-beta score if preferred)
optimal_threshold_index = np.argmax(f1_scores)  # You can change this to fbeta_scores or another metric
optimal_threshold = thresholds[optimal_threshold_index]

In [28]:
print(f"Optimal Threshold (based on F1-score): {optimal_threshold}")
print(f"Validation Accuracy at Optimal Threshold: {accuracy_scores[optimal_threshold_index]}")
print(f"Validation F1-score at Optimal Threshold: {f1_scores[optimal_threshold_index]}")
print(f"Validation ROC AUC Score at Optimal Threshold: {roc_auc_scores[optimal_threshold_index]}")
print(f"Validation F-beta Score (beta=0.5) at Optimal Threshold: {fbeta_scores[optimal_threshold_index]}")

Optimal Threshold (based on F1-score): 0.5500000000000002
Validation Accuracy at Optimal Threshold: 0.9593607305936073
Validation F1-score at Optimal Threshold: 0.9580386610089581
Validation ROC AUC Score at Optimal Threshold: 0.9869753889198003
Validation F-beta Score (beta=0.5) at Optimal Threshold: 0.9681722889270059


In [29]:
y_test_pred_proba = best_rf_model.predict_proba(test_data)[:, 1]

In [30]:
# Predict on the test set using the optimal threshold
y_test_pred = (y_test_pred_proba >= optimal_threshold).astype(int)

In [None]:
# Save the predictions to a CSV file
submission = pd.DataFrame({'V86': y_test_pred})
submission.to_csv('submission.csv', index=False)