In [8]:
# Basic Python packages
import os
import re
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [9]:
# Data location. Defaults to the original local folder, but can be redirected
# by setting the GHC_DATA_DIR environment variable so the notebook is not tied
# to one machine's directory layout.
DATA_DIR = os.environ.get("GHC_DATA_DIR", r"C:\Users\prits\Downloads\Data")

# File paths (plain strings, built with the platform's path separator)
train_file = os.path.join(DATA_DIR, "cleaned_ghc_train.csv")
test_file = os.path.join(DATA_DIR, "cleaned_ghc_test.csv")  # defined here but not loaded in this cell

# Load the training CSV into a pandas DataFrame
train_df = pd.read_csv(train_file)

In [10]:
import numpy as np


# Drop every row that has a missing value in any column, then separate the
# raw text (model input) from the label (prediction target).
train_df_cleaned = train_df.dropna()
X = train_df_cleaned['text']
y = train_df_cleaned['label']

# Sanity check: both Series should report the same number of rows.
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)  # label fixed: this line previously also said "X"

Shape of X: (21772,)
Shape of X: (21772,)


In [11]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the label
# proportions similar in both partitions; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over unigrams and bigrams, with the vocabulary capped at
# 10,000 terms. The vocabulary and IDF weights are learned from the training
# split only and then reused unchanged for the held-out split.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=10000,
)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [13]:
from imblearn.over_sampling import SMOTE

# Balance the classes by oversampling with SMOTE. Only the training features
# are resampled; the held-out split is left untouched.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_tfidf, y_train)

# Show how many rows each class has after resampling.
resampled_class_counts = y_resampled.value_counts()
print(resampled_class_counts)

0    15297
1    15297
Name: label, dtype: int64


In [14]:
from sklearn.linear_model import LogisticRegression
# Baseline classifier: logistic regression with default hyperparameters,
# trained on the SMOTE-resampled training features.
model = LogisticRegression()
model.fit(X=X_resampled, y=y_resampled)

LogisticRegression()

In [15]:
from sklearn.ensemble import RandomForestClassifier


In [16]:
# Random forest with default hyperparameters (seeded for repeatability),
# trained on the same SMOTE-resampled training features.
rf = RandomForestClassifier(random_state=42)
rf.fit(X=X_resampled, y=y_resampled)

RandomForestClassifier(random_state=42)

In [17]:
# Accuracy of the untuned forest on the held-out split. The test labels are
# heavily imbalanced, so read this together with the per-class metrics.
rf.score(X=X_test_tfidf, y=y_test)

0.8810562571756602

In [18]:
from sklearn.metrics import confusion_matrix, classification_report
# Predict labels for the held-out split and tabulate them against the truth
# (rows = true class, columns = predicted class).
y_pred = rf.predict(X_test_tfidf)
rf_confusion = confusion_matrix(y_test, y_pred)
print(rf_confusion)

[[3774   51]
 [ 467   63]]


In [19]:
# Per-class precision / recall / F1 for the untuned forest's predictions.
class_report = classification_report(y_true=y_test, y_pred=y_pred)
print("Classification Report:")
print(class_report)

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.99      0.94      3825
           1       0.55      0.12      0.20       530

    accuracy                           0.88      4355
   macro avg       0.72      0.55      0.57      4355
weighted avg       0.85      0.88      0.85      4355



In [23]:
# Search space for the random forest grid search:
# 6 * 5 * 5 * 5 = 750 parameter combinations.
param_grid = dict(
    n_estimators=[100, 150, 200, 300, 350, 400],
    max_features=[1, 2, 'sqrt', 'log2', None],
    max_depth=[4, 6, 10, 15, 20],
    max_leaf_nodes=[2, 4, 6, 12, 20],
)

In [24]:
from sklearn.model_selection import GridSearchCV
# Exhaustive 5-fold cross-validated search over param_grid, using all CPU
# cores and verbose progress logging. No `scoring` is given, so candidates are
# ranked by the estimator's own default score.
rf_grid = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)

In [25]:
# Run the grid search on the SMOTE-resampled training data.
# NOTE(review): the CV folds are cut from already-resampled rows, so synthetic
# samples can end up in validation folds and the CV score is likely optimistic.
# Consider resampling inside each fold (e.g. an imblearn Pipeline) - confirm.
rf_grid.fit(X=X_resampled, y=y_resampled)

Fitting 5 folds for each of 750 candidates, totalling 3750 fits


GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42), n_jobs=-1,
             param_grid={'max_depth': [4, 6, 10, 15, 20],
                         'max_features': [1, 2, 'sqrt', 'log2', None],
                         'max_leaf_nodes': [2, 4, 6, 12, 20],
                         'n_estimators': [100, 150, 200, 300, 350, 400]},
             verbose=2)

In [26]:
# Mean cross-validated score of the best parameter combination. No `scoring`
# was passed to GridSearchCV, so this should be accuracy, and it is measured on
# folds of the SMOTE-resampled training data - it is not directly comparable
# to metrics computed on the untouched held-out split.
rf_grid.best_score_

0.8350667573243987

In [27]:
# Evaluate the grid search's selected model on the held-out split.
# Note: this rebinds y_pred, replacing the untuned forest's predictions.
y_pred = rf_grid.predict(X_test_tfidf)

# Per-class precision / recall / F1
print("Classification Report:")
tuned_report = classification_report(y_test, y_pred)
print(tuned_report)

# Rows = true class, columns = predicted class
print("Confusion Matrix:")
tuned_confusion = confusion_matrix(y_test, y_pred)
print(tuned_confusion)

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.84      0.87      3825
           1       0.25      0.39      0.30       530

    accuracy                           0.78      4355
   macro avg       0.58      0.61      0.59      4355
weighted avg       0.83      0.78      0.80      4355

Confusion Matrix:
[[3203  622]
 [ 325  205]]
