### D_3. Logistic classification RFE

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import cohen_kappa_score, make_scorer
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import pickle
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Read the four train/test split CSVs stored under re_smote/ in a fixed order
# (X_train, y_train, X_test, y_test); each one becomes a DataFrame.
X_train, y_train, X_test, y_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/x_y_data/re_smote/2X_resampled.csv',
        '../data/x_y_data/re_smote/2y_resampled.csv',
        '../data/x_y_data/re_smote/2X_test.csv',
        '../data/x_y_data/re_smote/2y_test.csv',
    )
)

# The cleaned dataset is loaded alongside the split files.
data = pd.read_csv('../data/cleaned_data/cleaned_data.csv')

In [3]:
# Sanity check: (rows, columns) of the training feature matrix.
X_train.shape

(8260, 42)

### RFE

In [4]:
from sklearn.feature_selection import RFE
from sklearn.metrics import confusion_matrix

# Recursive feature elimination (RFE) with a logistic-regression base
# estimator: keep the `num_features_to_select` best-ranked columns, refit a
# fresh logistic regression on that subset, and evaluate it on the test split.
num_features_to_select = 35  # Adjust this to the number of features you want to keep

# scikit-learn estimators expect a 1-D target. y_train is read from CSV as a
# DataFrame, so flatten it explicitly instead of relying on the implicit
# conversion (its DataConversionWarning is hidden by the notebook-wide
# warnings filter).
y_train_1d = y_train.values.ravel()

# The RFE base estimator gets its own name so that `model` below always
# refers to the final classifier fitted on the selected features.
rfe_estimator = LogisticRegression(penalty='l2')
rfe = RFE(rfe_estimator, n_features_to_select=num_features_to_select)
rfe.fit(X_train, y_train_1d)  # X_train is the feature matrix, y_train_1d the target

# Boolean mask -> names of the retained columns; ranking_ holds every
# feature's elimination rank.
selected_features = X_train.columns[rfe.support_]
feature_ranking = rfe.ranking_

X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

# Create and fit the model on the selected features
model = LogisticRegression()
model.fit(X_train_selected, y_train_1d)

# Predict using the model on the selected features
y_pred = model.predict(X_test_selected)

# Calculate the confusion matrix
cm = confusion_matrix(y_test, y_pred)

print("Confusion Matrix:")
display(cm)

Confusion Matrix:


array([[834, 199],
       [128, 246]])

In [5]:
from sklearn.metrics import (
    accuracy_score,
    cohen_kappa_score,
    f1_score,
    precision_score,
    recall_score,
)

# Score the current test-set predictions. 'Yes' is passed as the positive
# label for the precision / recall / F1 calculations.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    metric(y_test, y_pred, pos_label='Yes')
    for metric in (precision_score, recall_score, f1_score)
)
kappa = cohen_kappa_score(y_test, y_pred)

# One (name, value) row per metric.
metrics_df = pd.DataFrame(
    [
        ("Accuracy", accuracy),
        ("Precision", precision),
        ("Recall", recall),
        ("F1-Score", f1),
        ("Kappa", kappa),
    ],
    columns=["Metric", "Score"],
)

display(metrics_df)

Unnamed: 0,Metric,Score
0,Accuracy,0.767591
1,Precision,0.552809
2,Recall,0.657754
3,F1-Score,0.600733
4,Kappa,0.438555


### Variable Importance 

In [None]:
from sklearn.inspection import permutation_importance

# Fit a logistic regression on every column of X_train and estimate each
# feature's importance by permutation.

# scikit-learn expects a 1-D target; y_train is a DataFrame read from CSV, so
# flatten it rather than relying on the implicit (warning-emitting) conversion.
y_train_1d = y_train.values.ravel()

lg1 = LogisticRegression(penalty='l2')

# fit() returns the estimator itself, so lg_tuned is the same object as lg1
# (no hyper-parameter tuning happens here); the name is kept for compatibility.
lg_tuned = lg1.fit(X_train, y_train_1d)

# Calculate permutation feature importances on the training data:
# 10 shuffles per feature, seeded for reproducibility.
result = permutation_importance(lg_tuned, X_train, y_train_1d, n_repeats=10, random_state=42)

# Sort the mean importances, largest first, labelled by column name
feature_importances = pd.Series(
    result.importances_mean, index=X_train.columns
).sort_values(ascending=False)

# Plot the feature importances as a horizontal bar chart
plt.figure(figsize=(10, 8))
sns.barplot(x=feature_importances, y=feature_importances.index)
plt.xlabel('Significance Score Of Variables')
plt.ylabel('Variables')
plt.title("Variable Importance for Logistic Regression  Model")
plt.show()


In [None]:
# Save the model: refit lg1 on the full training matrix, then pickle it.
# The target is flattened to 1-D because scikit-learn expects y of shape
# (n_samples,), while y_train is a DataFrame read from CSV.
lg1.fit(X_train, y_train.values.ravel())

filename = "../models/re_smote_lg.pickle"  # Path with filename

# The context manager closes the file even if pickling raises.
with open(filename, "wb") as file:
    pickle.dump(lg1, file)

In [None]:
# Promote the sorted importance Series to a single-column DataFrame
# (features stay on the index) and preview the first rows.
feature_importances_df = feature_importances.to_frame()
feature_importances_df.head()

In [None]:
# Transpose so each feature becomes a column, then write the frame to CSV
# without the row index.
feature_importances_df.transpose().to_csv(
    '../data/Re_Significant_Score/LG_Significance_Score.csv',
    index=False,
)

### Classification report for the full-feature model

In [None]:
from sklearn.metrics import confusion_matrix

# Predict the test split with lg1, then tabulate actual vs. predicted labels.
y_pred = lg1.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

print("Confusion Matrix:")
display(cm)

In [None]:
# Wrap the predictions in a DataFrame so value_counts() can tally how often
# each label was predicted.
y_pred_df = pd.DataFrame(y_pred)

y_pred_df.value_counts()

In [None]:
# Tally the test labels the same way, for comparison with the predicted counts.
y_test_df = pd.DataFrame(y_test)

y_test_df.value_counts()

In [None]:
from sklearn.metrics import (
    accuracy_score,
    cohen_kappa_score,
    f1_score,
    precision_score,
    recall_score,
)

# Re-predict the test split with lg1 so this cell does not depend on a
# y_pred left over from another cell.
y_pred = lg1.predict(X_test)

# 'Yes' is passed as the positive label for precision / recall / F1.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    metric(y_test, y_pred, pos_label='Yes')
    for metric in (precision_score, recall_score, f1_score)
)
kappa = cohen_kappa_score(y_test, y_pred)

# One (name, value) row per metric.
metrics_df = pd.DataFrame(
    [
        ("Accuracy", accuracy),
        ("Precision", precision),
        ("Recall", recall),
        ("F1-Score", f1),
        ("Kappa", kappa),
    ],
    columns=["Metric", "Score"],
)

display(metrics_df)

In [None]:
# Write the metrics table to CSV, omitting the row index.
metrics_df.to_csv('../data/metrics/RE_SMOTE_lg_metrics.csv', index=False)