LOGISTIC REGRESSION

In [1]:
import pandas as pd
import os

# Directory holding the GHC CSV files. The default is the original local
# folder; set the GHC_DATA_DIR environment variable to run this notebook on
# another machine without editing the cell.
DATA_DIR = os.environ.get("GHC_DATA_DIR", r"C:\Users\BHARGAVI\Downloads\project_data")

# File paths (kept as plain strings, as before)
train_file = os.path.join(DATA_DIR, "ghc_train.csv")
test_file = os.path.join(DATA_DIR, "ghc_test.csv")  # defined here but not read in this cell

# Load the training CSV into a pandas DataFrame
train_df = pd.read_csv(train_file)

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import RandomOverSampler

# Keep only the rows whose 'text' value is present (rows with NaN text are discarded)
train_df = train_df[train_df['text'].notna()]

# Separate the input text (X) from the target labels (y)
X, y = train_df['text'], train_df['label']

In [3]:
# Hold out 20% of the rows for evaluation, stratifying the split on the labels
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# TF-IDF over unigrams and bigrams, limited to at most 5000 features
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)

# Fit the vectorizer on the training text only, then reuse the fitted
# vectorizer to transform the held-out text
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Oversample the training split only; the test split is left as it is
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train_tfidf, y_train)


HYPERPARAMETER TUNING OF LOGISTIC REGRESSION

In [4]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

def train_best_logistic_regression(X_resampled, y_resampled, param_grid=None, cv=3,
                                   scoring='accuracy', n_jobs=-1):
    """Grid-search a LogisticRegression and return the best fitted estimator.

    Parameters
    ----------
    X_resampled
        Feature matrix handed to ``GridSearchCV.fit``.
    y_resampled
        Target labels aligned with ``X_resampled``.
    param_grid : dict, optional
        Hyperparameter grid to search. When omitted, the grid is
        ``C`` in ``[0.1, 10]`` with the ``liblinear`` solver and ``l2`` penalty.
    cv : optional
        Cross-validation setting forwarded to ``GridSearchCV`` (default 3).
    scoring : optional
        Scoring setting forwarded to ``GridSearchCV`` (default ``'accuracy'``).
    n_jobs : optional
        Parallelism setting forwarded to ``GridSearchCV`` (default -1).

    Returns
    -------
    The ``best_estimator_`` of the fitted grid search.
    """
    # NOTE(review): if the inputs were oversampled before this call, duplicated
    # rows can end up in both the fitting and the validation folds, which may
    # make the cross-validated scores optimistic. Consider resampling inside an
    # imblearn Pipeline so it happens per fold -- TODO confirm with the author.
    if param_grid is None:
        param_grid = {
            'C': [0.1, 10],
            'solver': ['liblinear'],
            'penalty': ['l2']
        }

    # Base estimator; max_iter is raised to 1000 as in the original code
    model = LogisticRegression(max_iter=1000)

    # Search the grid with cross-validation and fit on the supplied data
    grid_search = GridSearchCV(model, param_grid, cv=cv, scoring=scoring, n_jobs=n_jobs)
    grid_search.fit(X_resampled, y_resampled)

    return grid_search.best_estimator_

# Run the grid search on the oversampled training features and keep the best estimator
best_logistic_regression_model = train_best_logistic_regression(X_resampled, y_resampled)


In [6]:
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix

def predict_and_evaluate_logistic_regression(best_logistic_regression_model, X_test_tfidf, y_test):
    """Predict on the given features and score the predictions against ``y_test``.

    Parameters
    ----------
    best_logistic_regression_model
        Object exposing ``predict``; it is called once with ``X_test_tfidf``.
    X_test_tfidf
        Features to predict on.
    y_test
        True labels compared with the predictions.

    Returns
    -------
    dict
        One entry keyed ``'LogisticRegression'`` whose value is a dict with
        ``'accuracy'``, ``'recall'`` and ``'confusion_matrix'``.
    """
    predictions = best_logistic_regression_model.predict(X_test_tfidf)

    # All three metrics are computed with the metric functions' default arguments
    metrics = {
        'accuracy': accuracy_score(y_test, predictions),
        'recall': recall_score(y_test, predictions),
        'confusion_matrix': confusion_matrix(y_test, predictions),
    }

    return {'LogisticRegression': metrics}

def print_logistic_regression_results(results):
    """Print a short text report for every entry in ``results``.

    ``results`` maps a model name to a dict holding ``'accuracy'``,
    ``'recall'`` and ``'confusion_matrix'``. Accuracy and recall are shown
    with three decimals; the confusion matrix is printed as is. Nothing is
    returned.
    """
    metric_keys = ('accuracy', 'recall', 'confusion_matrix')

    for model_label, scores in results.items():
        # Look up all three metrics before anything is printed
        acc, rec, matrix = (scores[key] for key in metric_keys)

        print(f'===== Results for {model_label} =====')
        print(f'Accuracy: {acc:.3f}')
        print(f'Recall: {rec:.3f}')
        print('Confusion Matrix:')
        print(matrix)
        print('\n')

# Evaluate the tuned model on the held-out TF-IDF test features and print the metrics
logistic_regression_results = predict_and_evaluate_logistic_regression(best_logistic_regression_model, X_test_tfidf, y_test)
print_logistic_regression_results(logistic_regression_results)


===== Results for LogisticRegression =====
Accuracy: 0.826
Recall: 0.555
Confusion Matrix:
[[3304  521]
 [ 236  294]]


