GRADIENT BOOSTING

In [2]:
import os

import pandas as pd

# Directory that holds the project CSVs. It defaults to the original local
# location; set the GHC_DATA_DIR environment variable to run the notebook on
# another machine without editing this cell.
DATA_DIR = os.environ.get("GHC_DATA_DIR", r"C:\Users\BHARGAVI\Downloads\project_data")

# File paths
train_file = os.path.join(DATA_DIR, "ghc_train.csv")
test_file = os.path.join(DATA_DIR, "ghc_test.csv")  # path only; not read in this cell

# Load the training CSV into a pandas DataFrame
train_df = pd.read_csv(train_file)

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import RandomOverSampler

# Discard rows whose 'text' value is missing
train_df = train_df.dropna(subset=['text'])

# Separate the model input (X) from the target (y)
X, y = train_df['text'], train_df['label']

In [4]:
# Hold out 20% of the rows for evaluation, stratifying on the label
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# TF-IDF over unigrams and bigrams, limited to 5000 features
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)

# Fit the vectorizer on the training text only, then reuse the fitted
# vectorizer to encode the held-out text
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Randomly oversample the training split only; the test split is left untouched
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train_tfidf, y_train)


HYPERPARAMETER TUNING OF GRADIENT BOOSTING

In [5]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

def train_best_gradient_boosting(X_resampled, y_resampled, param_grid=None, cv=3,
                                 scoring='accuracy', random_state=42):
    """Grid-search a GradientBoostingClassifier and return the best fitted model.

    Parameters
    ----------
    X_resampled : array-like or sparse matrix
        Training features passed to ``GridSearchCV.fit``.
    y_resampled : array-like
        Training labels aligned with ``X_resampled``.
    param_grid : dict, optional
        Hyperparameter grid to search. When omitted, the grid is
        ``n_estimators=[100]``, ``learning_rate=[0.1, 0.5]``, ``max_depth=[3, 5]``.
    cv : int, default 3
        Cross-validation setting forwarded to ``GridSearchCV``.
    scoring : str, default 'accuracy'
        Scoring metric forwarded to ``GridSearchCV``.
    random_state : int or None, default 42
        Seed for the gradient boosting model so that repeated runs select and
        fit the same model. Pass ``None`` for an unseeded model.

    Returns
    -------
    GradientBoostingClassifier
        ``best_estimator_`` of the finished grid search.
    """
    if param_grid is None:
        # Default search space for Gradient Boosting
        param_grid = {
            'n_estimators': [100],
            'learning_rate': [0.1, 0.5],
            'max_depth': [3, 5]
        }

    # Seed the estimator: an unseeded model can give different trees (and a
    # different "best" configuration) on every run.
    model = GradientBoostingClassifier(random_state=random_state)

    # NOTE(review): if the input was randomly oversampled before this call,
    # duplicated rows can land in both the training and validation part of a
    # fold, so the cross-validation scores may be optimistic.
    grid_search = GridSearchCV(model, param_grid, cv=cv, scoring=scoring, n_jobs=-1)
    grid_search.fit(X_resampled, y_resampled)

    # Best configuration found by the search
    return grid_search.best_estimator_

# Tune and fit Gradient Boosting on the oversampled training data
best_gradient_boosting_model = train_best_gradient_boosting(
    X_resampled=X_resampled,
    y_resampled=y_resampled,
)


In [6]:
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix

def predict_and_evaluate_gradient_boosting(best_gradient_boosting_model, X_test_tfidf, y_test):
    """Predict on the held-out features and collect evaluation metrics.

    Parameters
    ----------
    best_gradient_boosting_model : fitted estimator exposing ``predict``
    X_test_tfidf : features handed to ``predict``
    y_test : true labels compared against the predictions

    Returns
    -------
    dict
        ``{'GradientBoosting': {'accuracy': ..., 'recall': ..., 'confusion_matrix': ...}}``
        where recall comes from ``recall_score`` called with its default settings.
    """
    predictions = best_gradient_boosting_model.predict(X_test_tfidf)

    # Metrics are computed in the order accuracy, recall, confusion matrix
    metrics = {
        'accuracy': accuracy_score(y_test, predictions),
        'recall': recall_score(y_test, predictions),
        'confusion_matrix': confusion_matrix(y_test, predictions),
    }

    return {'GradientBoosting': metrics}

def print_gradient_boosting_results(results):
    """Print a short text report for every model entry in ``results``.

    Parameters
    ----------
    results : dict
        Maps a model name to a dict holding the keys ``'accuracy'``,
        ``'recall'`` and ``'confusion_matrix'``.

    Each report shows accuracy and recall with three decimals, followed by the
    confusion matrix and a blank separator.
    """
    for model_name, scores in results.items():
        # Read all three metrics first so a missing key fails before anything is printed
        acc, rec, matrix = (
            scores[key] for key in ('accuracy', 'recall', 'confusion_matrix')
        )

        print(f'===== Results for {model_name} =====')
        print(f'Accuracy: {acc:.3f}')
        print(f'Recall: {rec:.3f}')
        print('Confusion Matrix:')
        print(matrix)
        print('\n')

# Score the tuned model on the held-out split and print the report
gradient_boosting_results = predict_and_evaluate_gradient_boosting(
    best_gradient_boosting_model=best_gradient_boosting_model,
    X_test_tfidf=X_test_tfidf,
    y_test=y_test,
)
print_gradient_boosting_results(results=gradient_boosting_results)


===== Results for GradientBoosting =====
Accuracy: 0.855
Recall: 0.540
Confusion Matrix:
[[3436  389]
 [ 244  286]]


