In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix

# Load dataset
df = pd.read_csv("Lead-V3.csv")

# Split features and labels
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

# Feature selection using SelectKBest
best_features = []
best_accuracy = 0
for k in range(1, 1000):
    skb = SelectKBest(score_func=mutual_info_classif, k=k)
    X_new = skb.fit_transform(X, y)
    X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=42)
    
    # Random Forest Classifier
    rf_clf = RandomForestClassifier(n_estimators=100, max_depth=5, min_samples_split=2, min_samples_leaf=1, random_state=42)
    rf_clf.fit(X_train, y_train)
    y_pred_rf = rf_clf.predict(X_test)
    accuracy_rf = accuracy_score(y_test, y_pred_rf)
    if accuracy_rf > best_accuracy:
        best_accuracy = accuracy_rf
        best_features = skb.get_support()
    print(f"Random Forest Classifier with k={k} selected features: {accuracy_rf}")
    
    # Logistic Regression
    lr_clf = LogisticRegression(max_iter=5000, random_state=42)
    lr_clf.fit(X_train, y_train)
    y_pred_lr = lr_clf.predict(X_test)
    accuracy_lr = accuracy_score(y_test, y_pred_lr)
    print(f"Logistic Regression with k={k} selected features: {accuracy_lr}")
    
    # Support Vector Machine
    svm_clf = SVC(kernel='rbf', random_state=42)
    svm_clf.fit(X_train, y_train)
    y_pred_svm = svm_clf.predict(X_test)
    accuracy_svm = accuracy_score(y_test, y_pred_svm)
    print(f"SVM with k={k} selected features: {accuracy_svm}")
    
    # Gradient Boosting Machine
    gbm_clf = GradientBoostingClassifier(n_estimators=100, max_depth=5, min_samples_split=2, min_samples_leaf=1, random_state=42)
    gbm_clf.fit(X_train, y_train)
    y_pred_gbm = gbm_clf.predict(X_test)
    accuracy_gbm = accuracy_score(y_test, y_pred_gbm)
    print(f"GBM with k={k} selected features: {accuracy_gbm}")
    
    print()

# Print best features
print(f"Best Features: {[i for i, x in enumerate(best_features) if x]}")
print('Done Boss !!')

Random Forest Classifier with k=1 selected features: 0.6271186440677966
Logistic Regression with k=1 selected features: 0.635593220338983
SVM with k=1 selected features: 0.635593220338983
GBM with k=1 selected features: 0.6610169491525424

Random Forest Classifier with k=2 selected features: 0.635593220338983
Logistic Regression with k=2 selected features: 0.635593220338983
SVM with k=2 selected features: 0.635593220338983
GBM with k=2 selected features: 0.652542372881356

Random Forest Classifier with k=3 selected features: 0.6271186440677966
Logistic Regression with k=3 selected features: 0.635593220338983
SVM with k=3 selected features: 0.635593220338983
GBM with k=3 selected features: 0.5932203389830508

Random Forest Classifier with k=4 selected features: 0.6440677966101694
Logistic Regression with k=4 selected features: 0.635593220338983
SVM with k=4 selected features: 0.6271186440677966
GBM with k=4 selected features: 0.6271186440677966

Random Forest Classifier with k=5 selecte