In [1]:
# Basic Python package
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import string 
import re


In [2]:
# File paths
train_file = r"C:\Users\BHARGAVI\Downloads\project_data\ghc_train.csv"
test_file = r"C:\Users\BHARGAVI\Downloads\project_data\ghc_test.csv"

# Load CSV files into pandas DataFrames
df = pd.read_csv(train_file)


In [3]:
df['label'].value_counts()


label
0    19126
1     2650
Name: count, dtype: int64

In [6]:
#class count
count_class_0,count_class_1=df.label.value_counts()
df_class_0=df[df['label']==0]
df_class_1=df[df['label']==1]
df_class_0.shape


(19126, 5)

In [7]:
df_class_1.shape


(2650, 5)

In [8]:
df_class_0_under = df_class_0.sample(count_class_1)
df_test_under = pd.concat([df_class_0_under,df_class_1], axis=0)
df_test_under.shape
print('Random under sampling:')
print(df_test_under.label.value_counts())

Random under sampling:
label
0    2650
1    2650
Name: count, dtype: int64


In [9]:
X=df_test_under.drop('label',axis='columns')
y=df_test_under['label']

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=15,stratify=y)


In [10]:
y_train.value_counts()

label
1    2120
0    2120
Name: count, dtype: int64

In [11]:
count_class_0,count_class_1


(19126, 2650)

In [12]:
df_class_1.sample(2000,replace=True).shape


(2000, 5)

In [13]:
count_class_0,count_class_1


(19126, 2650)

In [14]:
df_class_1_over=df_class_1.sample(count_class_0,replace=True)
df_test_over = pd.concat([df_class_0,df_class_1_over],axis=0)
df_test_over.shape


(38252, 5)

In [15]:
print('Random over Sampling:')
print(df_test_over.label.value_counts())

Random over Sampling:
label
0    19126
1    19126
Name: count, dtype: int64


In [17]:
#smote
columns_to_drop = [ 'text','label']
X=df.drop(columns=columns_to_drop)
y=df['label']

In [18]:
y.value_counts()

label
0    19126
1     2650
Name: count, dtype: int64

In [19]:
pip install -U imbalanced-learn

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [20]:
from imblearn.over_sampling import SMOTE
# This means SMOTE will oversample the minority class until the number of instancesis equal to the number of instances in the majority class.
smote = SMOTE(sampling_strategy='minority')
# Apply SMOTE to the dataset (X, y).
# X_sm and y_sm will be the resampled feature matrix and target vector, respectively.
X_sm, y_sm = smote.fit_resample(X, y)

# Checking the value counts of the resampled y
y_sm_value_counts = y_sm.value_counts()
print(y_sm_value_counts)

label
0    19126
1    19126
Name: count, dtype: int64


In [21]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import RandomOverSampler

# Assuming df is your DataFrame containing 'text' and 'label' columns

# Step 1: Drop rows with NaN in 'text' column
df = df.dropna(subset=['text'])

# Step 2: Split the data into X (features) and y (labels)
X = df['text']
y = df['label']

In [22]:
# Step 3: Split the data into training and testing sets
from sklearn.model_selection import train_test_split
# Training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [23]:
# Step 4: Initialize TfidfVectorizer with adjusted parameters
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))

# Step 5: Fit and transform X_train (text data) using TfidfVectorizer
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Step 6: Transform X_test using the fitted TfidfVectorizer
X_test_tfidf = tfidf_vectorizer.transform(X_test)


In [24]:
# Step 7: Balance the training data
from imblearn.over_sampling import SMOTE

# Applying SMOTE to handle imbalanced data
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_tfidf, y_train)
print(y_resampled.value_counts())

label
0    15297
1    15297
Name: count, dtype: int64


In [25]:
# Step 8: Initialize and train the Naive Bayes classifier (MultinomialNB) with tuned hyperparameters
model = MultinomialNB(alpha=0.1)
model.fit(X_resampled, y_resampled)

# Step 9: Make predictions on the test set
y_pred = model.predict(X_test_tfidf)

# Step 10: Evaluate the model
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Classification Report:\n', classification_report(y_test, y_pred))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))


Accuracy: 0.7618828932261769
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.78      0.85      3825
           1       0.28      0.61      0.38       530

    accuracy                           0.76      4355
   macro avg       0.61      0.70      0.62      4355
weighted avg       0.86      0.76      0.80      4355

Confusion Matrix:
 [[2994  831]
 [ 206  324]]


In [27]:
from sklearn.pipeline import make_pipeline
# Create a pipeline with TfidfVectorizer and MultinomialNB
pipeline = make_pipeline(TfidfVectorizer(), MultinomialNB())

# Define the grid of parameters to search
param_grid = {
    'tfidfvectorizer__ngram_range': [(1, 1), (1, 2)],  # unigrams or bigrams
    'multinomialnb__alpha': [0.1, 0.5, 1.0, 2.0]        # smoothing parameter
}


In [28]:
# Instantiate the grid search model
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

# Evaluate the model on the test data
best_model = grid_search.best_estimator_
test_accuracy = best_model.score(X_test, y_test)
print("Test set accuracy with best model: {:.2f}".format(test_accuracy))


Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best parameters found:  {'multinomialnb__alpha': 0.1, 'tfidfvectorizer__ngram_range': (1, 1)}
Best cross-validation score: 0.88
Test set accuracy with best model: 0.88
