In [1]:
# Basic Python package
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import string 
import re

In [2]:
# File paths
train_file = r"C:\Users\prits\Downloads\Data\cleaned_ghc_train.csv"
test_file = r'C:\Users\prits\Downloads\Data\cleaned_ghc_test.csv'

# Load CSV files into pandas DataFrames
train_df = pd.read_csv(train_file)

In [3]:
import numpy as np


train_df_cleaned = train_df.dropna()
X = train_df_cleaned ['text']
y = train_df_cleaned ['label']

# Using shape attribute
print("Shape of X:", X.shape)
print("Shape of X:", y.shape)

Shape of X: (21772,)
Shape of X: (21772,)


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
# Training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
# Vectorizing using TF-IDF
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [6]:
from imblearn.over_sampling import SMOTE

# Applying SMOTE to handle imbalanced data
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_tfidf, y_train)
print(y_resampled.value_counts())

0    15297
1    15297
Name: label, dtype: int64


In [7]:
from sklearn.linear_model import LogisticRegression
model=LogisticRegression()
model.fit(X_resampled, y_resampled)


LogisticRegression()

In [8]:
print('Accuracy:',model.score(X_test_tfidf ,y_test))

Accuracy: 0.7745120551090701


In [9]:
from sklearn.metrics import confusion_matrix, classification_report
y_pred=model.predict(X_test_tfidf )
print(confusion_matrix(y_test,y_pred))

[[3104  721]
 [ 261  269]]


In [10]:
class_report = classification_report(y_test, y_pred)
print("Classification Report:")
print(class_report)

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.81      0.86      3825
           1       0.27      0.51      0.35       530

    accuracy                           0.77      4355
   macro avg       0.60      0.66      0.61      4355
weighted avg       0.84      0.77      0.80      4355



# SVM

In [11]:
# Import the SVC (Support Vector Classifier) class from sklearn.svm
from sklearn.svm import SVC
svc=SVC(random_state=42,probability=True)
svc.fit(X_resampled, y_resampled)

SVC(probability=True, random_state=42)

In [12]:
svc.score(X_test_tfidf,y_test)

0.878300803673938

In [13]:
from sklearn.metrics import confusion_matrix, classification_report
y_pred=svc.predict(X_test_tfidf)
print(confusion_matrix(y_test,y_pred))

[[3755   70]
 [ 460   70]]


In [14]:
print(svc)


SVC(probability=True, random_state=42)


# SVM hypertunning

In [19]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report
# Define the parameter grid
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf', 'linear']
}

# Initialize the SVM model
svm = SVC()

# Initialize GridSearchCV with the SVM model and parameter grid
grid = GridSearchCV(svm, param_grid, refit=True, verbose=2, cv=5)

# Fit the model
grid.fit(X_resampled, y_resampled)



Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time= 1.5min
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time= 2.2min
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time= 1.9min
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time= 2.2min
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time= 1.8min
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time= 1.7min
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time= 1.8min
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time= 1.8min
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time= 1.8min
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time= 1.8min
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time= 1.3min
[CV] END .......................C=0.1, gamma=0.

[CV] END .......................C=10, gamma=0.01, kernel=rbf; total time= 3.5min
[CV] END .......................C=10, gamma=0.01, kernel=rbf; total time= 3.4min
[CV] END .......................C=10, gamma=0.01, kernel=rbf; total time= 3.5min
[CV] END .......................C=10, gamma=0.01, kernel=rbf; total time= 3.4min
[CV] END ....................C=10, gamma=0.01, kernel=linear; total time=10.2min
[CV] END ....................C=10, gamma=0.01, kernel=linear; total time=10.9min
[CV] END ....................C=10, gamma=0.01, kernel=linear; total time= 6.4min
[CV] END ....................C=10, gamma=0.01, kernel=linear; total time= 5.1min
[CV] END ....................C=10, gamma=0.01, kernel=linear; total time= 5.4min
[CV] END ......................C=10, gamma=0.001, kernel=rbf; total time= 2.6min
[CV] END ......................C=10, gamma=0.001, kernel=rbf; total time= 2.5min
[CV] END ......................C=10, gamma=0.001, kernel=rbf; total time= 2.6min
[CV] END ...................

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001],
                         'kernel': ['rbf', 'linear']},
             verbose=2)

In [20]:
# Make predictions with the best model
y_pred = grid.predict(X_test_tfidf)

In [21]:
# Print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.97      0.94      3825
           1       0.55      0.24      0.33       530

    accuracy                           0.88      4355
   macro avg       0.73      0.61      0.63      4355
weighted avg       0.86      0.88      0.86      4355



In [23]:
svc.score(X_test_tfidf,y_test)

0.878300803673938

In [22]:
from sklearn.metrics import confusion_matrix, classification_report

print(confusion_matrix(y_test,y_pred))

[[3722  103]
 [ 404  126]]
