<a href="https://colab.research.google.com/github/punam-gwachha/logistic-regression-RanndomForest-SVM/blob/main/Task2_ml.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Import required Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, f1_score, confusion_matrix


#Load training and testing datasets

In [2]:
# Load your  Training dataset
train_data = pd.read_csv('/content/train_set.csv')
train_data.head()

# Load your  Testing dataset
test_data = pd.read_csv('/content/test_set.csv')

# Load your blind test set dataset
blind_test_data = pd.read_csv('/content/blinded_test_set.csv')


# preprocessing and Feature Engineering


In [3]:
#1. Separate features and target i.e CLASS for training set
#and also non informative colum i.e. ID
X_train = train_data.drop(columns=['CLASS','ID'])
y_train = train_data['CLASS']

# Separate features and target for testing set
X_test = test_data.drop(columns=['CLASS','ID'])
y_test = test_data['CLASS']

# X_train.head()

In [4]:
# 2. check wherther there is any missing value or not.
# 0 i.e there is no any missing value
X_train.isnull().sum()

# # check for infinite values of train dataset
np.isinf(X_train).sum()

# check for infinite and missing values for test datset
X_test.isnull().sum()
np.isinf(X_test).sum()

Unnamed: 0,0
Feature_1,0
Feature_2,0
Feature_3,0
Feature_4,0
Feature_5,0
...,...
Feature_3234,0
Feature_3235,0
Feature_3236,0
Feature_3237,0


fill the infinite value and NAN values

In [5]:
# Even though the above code checks for missing values it's possible that this check didn't identify all forms of missing data
# or that missing values were introduced or not handled correctly during subsequent steps.
# Clean train data: replace infs(infinite value)  and fill NaNs(Not a number)
X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
X_train.fillna(X_train.mean(), inplace=True)

# Clean and prepare test set
X_test.replace([np.inf, -np.inf], np.nan)
X_test.fillna(X_test.mean(), inplace=True)


In [6]:
# 3. Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train )

# Transform test data using the same scaler
X_test_scaled = scaler.transform(X_test)

# 4. Convert scaled data back to DataFrame
X_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test.columns)

Feature selection

In [7]:
# Step 1: Select top K features using ANOVA F-score
k = 2500
selector = SelectKBest(score_func=f_classif, k=k)
# Use X_train and y_train for feature selection
X_selected = selector.fit_transform(X_train_scaled, y_train) # Apply feature selection on scaled training data

# Step 2: Get the selected feature names
selected_mask = selector.get_support()  # Boolean mask
# Get feature names from the original scaled training features
# Note: We need to get column names from the scaled DataFrame
selected_features = X_scaled_df.columns[selected_mask]

# Step 3: Create new DataFrame with selected features for TRAIN
X_selected_df_train = pd.DataFrame(X_selected, columns=selected_features)

# Step 4: Apply the same feature selection to the SCALED TEST data
X_test_selected = selector.transform(X_test_scaled) # Use the *same* selector fitted on training data
X_selected_df_test = pd.DataFrame(X_test_selected, columns=selected_features)

X_selected_df_train.head()

 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933
 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122
 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2248 2249 2255
 2259 2376 2377 2383 2387 2504 2505 2511 2515 2632 2633 2639 2640 2643
 2758 2759 2760 2761 2762 2763 2764 2765 2767 2769 2771 2772 2773 2886
 2887 2888 2889 2890 2891 2892 2893 2895 2897 2899 2900 2901 2937 2938
 2939 2940 2941 2977 2978 2979 2980 2981 3096 3097 3103 3106 3224 3225
 3231 3234] are constant.
  f = msb / msw


Unnamed: 0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,Feature_10,...,Feature_3224,Feature_3227,Feature_3228,Feature_3229,Feature_3230,Feature_3231,Feature_3233,Feature_3234,Feature_3237,Feature_3238
0,-0.756859,-0.756281,-0.768458,1.790937,-0.703377,-0.622357,0.75783,-0.722168,0.75783,0.038485,...,-1.903318,1.608494,1.608494,-1.903318,-1.903318,-0.795461,-0.38625,-1.156615,1.487482,-1.156615
1,-0.68466,-0.68669,-0.980408,0.279318,0.567037,0.393269,-0.768013,0.617284,-0.768013,-0.059407,...,-0.133632,-0.045529,-0.045529,-0.133632,-0.133632,-0.683039,-0.309647,-0.493571,0.54265,-0.493571
2,-0.381832,-0.380581,-0.235997,0.738313,-0.658969,-0.592319,0.689108,-0.67279,0.689108,-0.062952,...,-0.972867,0.509137,0.509137,-0.972867,-0.972867,-0.396858,-0.308825,-0.995803,1.137855,-0.995803
3,0.231255,0.232764,0.851293,0.408853,-1.186714,-0.927422,1.599928,-1.272312,1.599928,-0.063502,...,-0.486524,0.354945,0.354945,-0.486524,-0.486524,0.28339,-0.154057,-0.7636,0.852824,-0.7636
4,0.203521,0.203601,0.171085,-0.825213,0.093786,-0.031715,-0.308667,0.138226,-0.308667,-0.058745,...,0.680798,-0.574604,-0.574604,0.680798,0.680798,0.173152,0.084921,0.703383,-0.859371,0.703383



# 1.Logistic Regression with Hyperparameter Tuning


In [8]:
parameter = {
    'C': [0.01, 0.1, 1, 10, 100],  # Regularization strength
    'penalty': ['l2'],
  }

log_reg = LogisticRegression()
# Fit the GridSearchCV on the feature-selected and scaled training data
grid_search = GridSearchCV(log_reg, parameter, cv=5, scoring='roc_auc')
grid_search.fit(X_selected_df_train, y_train) # Fit on feature-selected TRAIN data

# best_model gives LogisticRegression model with the best hyperparameters found from the grid search
best_model = grid_search.best_estimator_

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


# Predict and Evaluate for logistic regression


In [9]:
# Use the feature-selected and scaled test data for prediction
y_test_pred = best_model.predict(X_selected_df_test)
y_test_proba = best_model.predict_proba(X_selected_df_test)[:, 1]

# Confusion Matrix for Specificity Calculation
tn, fp, fn, tp = confusion_matrix(y_test, y_test_pred).ravel()

# Metrics calculation and Display metrics
print(f"Accuracy: {accuracy_score(y_test, y_test_pred):.4f}")
print(f"AUROC:{roc_auc_score(y_test, y_test_proba):.4f}")
print(f"Sensitivity (Recall):{recall_score(y_test, y_test_pred):.4f}")
print(f"specificity:{ tn / (tn + fp):.4f}")
print(f"f1: {f1_score(y_test, y_test_pred):.4f}")

Accuracy: 0.6300
AUROC:0.6576
Sensitivity (Recall):0.4762
specificity:0.7414
f1: 0.5195


# Blinded test for logistic regression

In [10]:
# Clean the blind test data first
X_blind = blind_test_data.drop(columns=["ID"]).replace([np.inf, -np.inf], np.nan)
X_blind.fillna(X_blind.mean(), inplace=True)

# Apply the same scaler fitted on the training data to the blind test data
X_blind_scaled = scaler.transform(X_blind)

# Convert scaled blind data back to DataFrame to retain feature names
X_blind_scaled_df = pd.DataFrame(X_blind_scaled, columns=X_blind.columns)

# Use the selected_features list to ensure correct column order and names
X_blind_selected_df = X_blind_scaled_df[selected_features]

# Now predict using the feature-selected blind data
blind_probs = best_model.predict_proba(X_blind_selected_df)

# Save submission
submission = pd.DataFrame({
    # Use the original blind_test_data for ID as X_blind_selected_df might have a different index
    "ID": blind_test_data["ID"],
    "class_0": blind_probs[:, 0],
    "class_1": blind_probs[:, 1]
})
submission.to_csv("logreg_blinded_predictions.csv", index=False)

#2.Random Forest

In [11]:
#  Random Forest model implementation
rf = RandomForestClassifier(random_state=42)

# Define parameter grid for GridSearch
param_rf = {
        'n_estimators': [100, 200],
        'max_depth': [None, 10, 20]
}

# Perform grid search with 5-fold cross-validation
grid_search = GridSearchCV(rf, param_rf, cv=5, scoring="roc_auc")
grid_search.fit(X_selected_df_train, y_train)

# Best model
best_rf= grid_search.best_estimator_

# Predict on the scaled and feature-selected test data
rf_pred = best_rf.predict(X_selected_df_test)
rf_proba = best_rf.predict_proba(X_selected_df_test)[:, 1]

# Metrics calculation and Display metrics
print(f"Random Forest Accuracy: {accuracy_score(y_test, rf_pred):.4f}")
print(f"Random Forest AUROC:{roc_auc_score(y_test, rf_proba):.4f}")
print(f"Random Forest Sensitivity (Recall):{recall_score(y_test, rf_pred):.4f}")
print(f"Random Forest specificity:{ tn / (tn + fp):.4f}")
print(f"Random Forest f1: {f1_score(y_test, rf_pred):.4f}")

Random Forest Accuracy: 0.6300
Random Forest AUROC:0.6539
Random Forest Sensitivity (Recall):0.3333
Random Forest specificity:0.7414
Random Forest f1: 0.4308


# Blinded test for random forest

In [12]:
# Now predict using the feature-selected blind data
blind_probs = best_rf.predict_proba(X_blind_selected_df)

# Save submission
submission = pd.DataFrame({
    # Use the original blind_test_data for ID as X_blind_selected_df might have a different index
    "ID": blind_test_data["ID"],
    "class_0": blind_probs[:, 0],
    "class_1": blind_probs[:, 1]
})
submission.to_csv("randomforest_blinded_predictions.csv", index=False)

#3. Support Vector machine

In [13]:
svm = SVC(kernel='rbf', probability=True, class_weight="balanced", random_state=42)

# Define parameter grid for GridSearch
param_svm = {
    "C": [0.1, 1, 10],
    "gamma": ['scale', 0.01, 0.001]
}

# Perform grid search with 5-fold cross-validation
grid_search = GridSearchCV(svm, param_svm, cv=5, scoring="roc_auc")
grid_search.fit(X_selected_df_train, y_train)

# Best model
best_svm = grid_search.best_estimator_

# Predict on the scaled and feature-selected test data
svm_pred = best_svm.predict(X_selected_df_test)
svm_proba = best_svm.predict_proba(X_selected_df_test)[:, 1]

# Metrics calculation and Display metrics
print(f"svm Accuracy: {accuracy_score(y_test, svm_pred):.4f}")
print(f"svm AUROC:{roc_auc_score(y_test, svm_proba ):.4f}")
print(f"svm Sensitivity (Recall):{recall_score(y_test, svm_pred):.4f}")
print(f"svm specificity:{ tn / (tn + fp):.4f}")
print(f"svm f1: {f1_score(y_test,svm_pred):.4f}")

svm Accuracy: 0.6100
svm AUROC:0.3087
svm Sensitivity (Recall):0.8810
svm specificity:0.7414
svm f1: 0.6549


#Blind test for svm

In [14]:
# Predict on blinded test set
svm_blind_probs = best_svm.predict_proba(X_blind_selected_df)
svm_output = pd.DataFrame({
    "ID": blind_test_data["ID"],
    "class_0": svm_blind_probs[:, 0],
    "class_1": svm_blind_probs[:, 1]
})
svm_output.to_csv("svm_blinded_predictions.csv", index=False)
