# Evaluating ML Models after Hyperparameter Tuning

In [1]:
import os

import pandas as pd

# File paths
# The CSV location can be overridden with the GHC_TRAIN_CSV environment variable
# so the notebook is not tied to a single machine; when the variable is unset,
# the original local path is used unchanged.
train_file = os.environ.get(
    "GHC_TRAIN_CSV",
    r"C:\Users\prits\Downloads\Data\clean_ghc_train.csv",
)
# Load the training CSV into a pandas DataFrame
train_df = pd.read_csv(train_file)

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import RandomOverSampler

# Keep only the rows whose 'text' value is present (drops NaN text rows)
train_df = train_df[train_df['text'].notna()]

# Separate the inputs (raw text) from the targets (class labels)
X, y = train_df['text'], train_df['label']

In [3]:
# Hold out 20% of the rows as a test set. stratify=y keeps the label proportions
# the same in both splits, and random_state=42 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y)

# TF-IDF over unigrams and bigrams (ngram_range=(1, 2)), capped at 5000 features
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))

# Fit the vectorizer on the training text only, and transform that text
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Transform the test text with the already-fitted vectorizer (it is not refit on test data)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Balance the training data by random oversampling; only the training split is
# resampled, the test split is left as it was drawn.
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled= ros.fit_resample(X_train_tfidf, y_train)

# HYPERPARAMETER TUNING OF DIFFERENT ML MODELS

In [4]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

def _build_search_space(random_state):
    """Return the candidate estimators and their parameter grids, keyed by model name."""
    return {
        'LogisticRegression': {
            'model': LogisticRegression(max_iter=1000, random_state=random_state),
            'params': {
                'C': [0.1, 10],
                'solver': ['liblinear'],
                'penalty': ['l2']
            }
        },
        'SVC': {
            'model': SVC(),
            'params': {
                'C': [0.1, 1, 10],
                'gamma': ['scale', 'auto'],
                'kernel': ['rbf']
            }
        },
        'MultinomialNB': {
            'model': MultinomialNB(),
            'params': {
                'alpha': [0.01, 0.1, 0.5, 1.0]
            }
        },
        'RandomForest': {
            # Seeded so that repeated runs grow the same forest
            'model': RandomForestClassifier(random_state=random_state),
            'params': {
                'n_estimators': [50, 100, 200],
                'max_depth': [None, 10, 20],
                'min_samples_split': [2, 5, 10]
            }
        },
        'GradientBoosting': {
            # Seeded for the same reason as the random forest
            'model': GradientBoostingClassifier(random_state=random_state),
            'params': {
                'n_estimators': [100],
                'learning_rate': [0.1, 0.5],
                'max_depth': [3, 5]
            }
        }
    }


def train_best_model(X_resampled, y_resampled, cv=3, scoring='accuracy', n_jobs=-1, random_state=42):
    """Grid-search every candidate classifier and return the best estimator of each.

    Parameters
    ----------
    X_resampled : feature matrix passed to ``GridSearchCV.fit``.
    y_resampled : labels aligned with ``X_resampled``.
    cv : cross-validation setting forwarded to ``GridSearchCV`` (default 3).
    scoring : metric used to rank parameter combinations (default 'accuracy').
        Another scorer name such as 'recall' or 'f1' can be passed when
        minority-class performance matters more than overall accuracy.
    n_jobs : parallelism forwarded to ``GridSearchCV`` (default -1).
    random_state : seed given to the estimators that accept one here
        (logistic regression, random forest, gradient boosting). The default
        of 42 matches the seed used elsewhere in this notebook.

    Returns
    -------
    dict
        Maps each model name ('LogisticRegression', 'SVC', 'MultinomialNB',
        'RandomForest', 'GradientBoosting') to the ``best_estimator_`` found
        by its grid search.

    Notes
    -----
    NOTE(review): the notebook passes data that was already oversampled with
    RandomOverSampler. If that duplicates rows, copies of one sample can land
    in both the training and validation folds, which would make the
    cross-validation scores optimistic. Consider resampling inside each fold
    (e.g. with an imbalanced-learn pipeline) - TODO confirm and revisit.
    """
    best_models = {}

    # Perform hyperparameter tuning, one independent grid search per model
    for name, cfg in _build_search_space(random_state).items():
        grid_search = GridSearchCV(cfg['model'], cfg['params'], cv=cv, scoring=scoring, n_jobs=n_jobs)
        grid_search.fit(X_resampled, y_resampled)

        best_models[name] = grid_search.best_estimator_

    return best_models

# Tune every candidate model on the oversampled training features
best_models = train_best_model(X_resampled, y_resampled)


# Printing the accuracy, recall, and confusion matrix of each model

In [5]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, confusion_matrix

def predict_and_evaluate(best_models, X_test_tfidf, y_test):
    """Score every fitted model on the test features.

    Parameters
    ----------
    best_models : mapping of model name -> fitted estimator exposing ``predict``.
    X_test_tfidf : test features passed to each estimator's ``predict``.
    y_test : true labels the predictions are compared against.

    Returns
    -------
    dict
        For each model name, a dict with the keys 'accuracy', 'recall' and
        'confusion_matrix'. ``recall_score`` is called with its default
        arguments - NOTE(review): that presumably means binary recall for the
        positive label; confirm against the label encoding.
    """
    def _score(estimator):
        # One prediction pass per model, reused by all three metrics
        predicted = estimator.predict(X_test_tfidf)
        return {
            'accuracy': accuracy_score(y_test, predicted),
            'recall': recall_score(y_test, predicted),
            'confusion_matrix': confusion_matrix(y_test, predicted),
        }

    return {model_name: _score(estimator) for model_name, estimator in best_models.items()}

def print_results(results):
    """Print a plain-text report for each model in ``results``.

    ``results`` maps a model name to a dict holding 'accuracy', 'recall' and
    'confusion_matrix'. Accuracy and recall are shown with three decimals, the
    confusion matrix is printed as-is, and each report ends with blank lines.
    """
    for model_name, scores in results.items():
        # Look up all three entries before printing anything for this model
        acc, rec, matrix = (
            scores[key] for key in ('accuracy', 'recall', 'confusion_matrix')
        )

        print(f'===== Results for {model_name} =====')
        print(f'Accuracy: {acc:.3f}')
        print(f'Recall: {rec:.3f}')
        print('Confusion Matrix:')
        print(matrix)
        print('\n')

# Score each tuned model on the held-out TF-IDF test features and print the report
results = predict_and_evaluate(best_models, X_test_tfidf, y_test)
print_results(results)


===== Results for LogisticRegression =====
Accuracy: 0.821
Recall: 0.542
Confusion Matrix:
[[3274  526]
 [ 254  301]]


===== Results for SVC =====
Accuracy: 0.886
Recall: 0.213
Confusion Matrix:
[[3742   58]
 [ 437  118]]


===== Results for MultinomialNB =====
Accuracy: 0.754
Recall: 0.573
Confusion Matrix:
[[2966  834]
 [ 237  318]]


===== Results for RandomForest =====
Accuracy: 0.878
Recall: 0.187
Confusion Matrix:
[[3721   79]
 [ 451  104]]


===== Results for GradientBoosting =====
Accuracy: 0.850
Recall: 0.501
Confusion Matrix:
[[3422  378]
 [ 277  278]]




# Conclusion

Gradient Boosting, with 85% accuracy, performs best overall compared to the other models.
Random Forest and SVM achieve higher accuracy than Gradient Boosting (87.8% and 88.6%), but they barely detect the minority class (recall of about 0.19 and 0.21, versus 0.50 for Gradient Boosting); they are biased toward the majority class.