In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import joblib

# Load dataset
data = pd.read_csv('training_data.csv')
X = data.drop(columns=['Label'])
y = LabelEncoder().fit_transform(data['Label'])  # Encode labels as 0 and 1

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Dictionary of algorithms to train
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=42)
}

# Train each model, evaluate accuracy, and save all models
best_model = None
best_accuracy = 0
results = {}

for name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy
    
    print(f"{name} Accuracy: {accuracy:.4f}")
    
    # Save each model
    model_filename = f"{name.replace(' ', '_')}_model.joblib"
    joblib.dump(model, model_filename)
    print(f"Model saved as {model_filename}")
    
    # Track the best model
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = model

# Save the best model separately for clarity
joblib.dump(best_model, 'best_model.joblib')
print(f"\nBest Model: {best_model.__class__.__name__} with Accuracy: {best_accuracy:.4f}")


Logistic Regression Accuracy: 0.7550
Model saved as Logistic_Regression_model.joblib
Random Forest Accuracy: 0.7450
Model saved as Random_Forest_model.joblib
SVM Accuracy: 0.7550
Model saved as SVM_model.joblib
K-Nearest Neighbors Accuracy: 0.7250
Model saved as K-Nearest_Neighbors_model.joblib
Gradient Boosting Accuracy: 0.7300
Model saved as Gradient_Boosting_model.joblib
Decision Tree Accuracy: 0.6500
Model saved as Decision_Tree_model.joblib

Best Model: LogisticRegression with Accuracy: 0.7550


In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import joblib

# Apply SMOTE to balance classes
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Train Logistic Regression with class weights
model = LogisticRegression(max_iter=1000, class_weight='balanced')
model.fit(X_resampled, y_resampled)

# Save the re-trained model
joblib.dump(model, 'balanced_logistic_regression_model.joblib')

# Evaluate on the test data
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.6
              precision    recall  f1-score   support

           0       0.31      0.53      0.39        49
           1       0.80      0.62      0.70       151

    accuracy                           0.60       200
   macro avg       0.56      0.58      0.55       200
weighted avg       0.68      0.60      0.63       200



In [15]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.model_selection import GridSearchCV

# Define models and parameters for tuning
models = {
    'Logistic Regression': LogisticRegression(class_weight='balanced', max_iter=1000),
    'Random Forest': RandomForestClassifier(class_weight='balanced', random_state=42),
    'Gradient Boosting': GradientBoostingClassifier()
}

params = {
    'Logistic Regression': {'C': [0.1, 1, 10]},
    'Random Forest': {'n_estimators': [50, 100, 150], 'max_depth': [5, 10, None]},
    'Gradient Boosting': {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.1, 0.5]}
}

# Tune each model with GridSearchCV
best_models = {}
for model_name in models:
    grid = GridSearchCV(models[model_name], params[model_name], scoring='f1_weighted', cv=5)
    grid.fit(X_resampled, y_resampled)
    best_models[model_name] = grid.best_estimator_
    print(f"{model_name} best parameters: {grid.best_params_}")

# Create an ensemble using the best tuned models
ensemble = VotingClassifier(estimators=[
    ('lr', best_models['Logistic Regression']),
    ('rf', best_models['Random Forest']),
    ('gb', best_models['Gradient Boosting'])
], voting='soft')

# Train and evaluate the ensemble model
ensemble.fit(X_resampled, y_resampled)
y_pred = ensemble.predict(X_test)

print("Ensemble Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Logistic Regression best parameters: {'C': 1}
Random Forest best parameters: {'max_depth': None, 'n_estimators': 100}
Gradient Boosting best parameters: {'learning_rate': 0.5, 'n_estimators': 150}
Ensemble Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        49
           1       1.00      1.00      1.00       151

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200



In [16]:
import joblib

# Save the ensemble model
joblib.dump(ensemble, 'ensemble_model.joblib')


['ensemble_model.joblib']

In [19]:
import joblib
import pandas as pd

# Load the saved ensemble model
ensemble = joblib.load('ensemble_model.joblib')

# New test data
data = [
    [3169, 84.6247798635863, 41.67349939111806, 0.6800232494089303, 47.40725543010246, 58.96466103059585, 35.933455880744816, 0.9638054077939433],
    [1466, 61.217332224917286, 39.97989646458853, 0.8378180244862548, 8.366954233773786, 77.61810607172299, 68.40641417892604, 0.7085416001081007],
    [2238, 15.71606466310012, 67.04110544477845, 0.40006658994319944, 24.730552186276018, 88.50369293824137, 35.10840596305289, 0.916975720217114],
    [1330, 13.313968087239225, 61.38508347490708, 0.759746070873994, 28.844656748983677, 76.69878516339955, 69.42478327704802, 0.7567934604163131]
]

# Convert to a DataFrame (update column names to match training feature names)
columns = ['Feature1', 'Feature2', 'Feature3', 'Feature4', 'Feature5', 'Feature6', 'Feature7', 'Feature8']
df_test = pd.DataFrame(data, columns=columns)

# Predict using the ensemble model
y_pred = ensemble.predict(df_test)

# Print the predictions
for i, pred in enumerate(y_pred):
    print(f"Sample {i+1} Prediction: {pred}")


Sample 1 Prediction: 0
Sample 2 Prediction: 0
Sample 3 Prediction: 1
Sample 4 Prediction: 1
