# Import Libraries

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Load Data

In [2]:
# Read the corpus from disk; reset_index(drop=False) materialises the original
# row position as an explicit 'index' column, then only the three columns used
# downstream are kept.
X_data = (
    pd.read_csv('Final.csv')
    .reset_index(drop=False)
    .loc[:, ['index', 'cleaned_text', 'label']]
)

# Split Data

In [3]:
# 80/20 hold-out split of the cleaned texts, stratified on the label column so
# both partitions keep the same class proportions; the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_data['cleaned_text'].values,
    X_data['label'].values,
    test_size=0.2,
    shuffle=True,
    stratify=X_data['label'].values,
    random_state=142,
)

# Vectorize Data

In [4]:
def tfidf_extractor(corpus, min_df, encoding, ngram_range=(1,2), max_df=1.0):
    """Fit a TF-IDF vectorizer on ``corpus`` and return it with the fitted matrix.

    Parameters
    ----------
    corpus : iterable of str
        Documents the vocabulary and IDF weights are learned from.
    min_df, max_df : int or float
        Document-frequency bounds, forwarded unchanged to ``TfidfVectorizer``.
    encoding : str
        Forwarded unchanged to ``TfidfVectorizer``.
    ngram_range : tuple of (int, int)
        Smallest and largest n-gram size to extract.

    Returns
    -------
    tuple
        ``(fitted_vectorizer, features)`` where ``features`` is whatever
        ``fit_transform`` returns for ``corpus``.
    """
    tfidf = TfidfVectorizer(
        # how raw documents are read and tokenised (no lowercasing, no custom preprocessor)
        encoding=encoding,
        preprocessor=None,
        lowercase=False,
        ngram_range=ngram_range,
        # vocabulary pruning by document frequency
        min_df=min_df,
        max_df=max_df,
        # term weighting: sublinear tf, smoothed idf, L2-normalised rows
        sublinear_tf=True,
        use_idf=True,
        smooth_idf=True,
        norm='l2',
    )
    doc_term_matrix = tfidf.fit_transform(corpus)
    return tfidf, doc_term_matrix

def display_features(features, feature_names):
    """Wrap a 2-D feature matrix in a DataFrame labelled with ``feature_names``.

    ``features`` supplies the cell values (one row per document) and
    ``feature_names`` supplies the column labels.
    """
    return pd.DataFrame(features, columns=feature_names)

In [5]:
# Learn the vocabulary and IDF weights from the training texts only.
# min_df is set to 10; unigrams and bigrams are extracted.
tfidf_vectorizer, tfidf_X_features = tfidf_extractor(
    X_train,
    min_df=10,
    encoding='utf-8',
    ngram_range=(1, 2),
)

In [6]:
# With min_df set to 10 the matrix has 4971 feature columns (see the repr below).
# Lowering min_df increases the number of features, and with it RAM consumption.
tfidf_X_features

<25222x4971 sparse matrix of type '<class 'numpy.float64'>'
	with 340290 stored elements in Compressed Sparse Row format>

In [7]:
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer `get_feature_names_out()` and fall back only on older releases.
# The result is normalised to a plain list so it is the same type either way.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    feature_names = list(tfidf_vectorizer.get_feature_names_out())
else:
    feature_names = list(tfidf_vectorizer.get_feature_names())

# Densify the sparse training matrix. `toarray()` returns a regular ndarray,
# whereas `todense()` returns the discouraged `np.matrix` type; the values are
# identical.
X_train_df = display_features(tfidf_X_features.toarray(), feature_names)

# NOTE: this rebinds X_train from the raw training texts to the dense feature
# matrix. Re-running the cell that fits the vectorizer on X_train after this
# point requires re-running the train/test split cell first.
X_train = X_train_df.values

In [8]:
# Project the held-out texts onto the already-fitted vectorizer: `transform`
# only, so nothing is re-learned from the test split.
tfidf_test_features = tfidf_vectorizer.transform(X_test)

# `toarray()` returns a regular ndarray, whereas `todense()` returns the
# discouraged `np.matrix` type; the values are identical.
X_test_df = display_features(tfidf_test_features.toarray(), feature_names)

# NOTE: this rebinds X_test from the raw test texts to the dense feature
# matrix, so this cell cannot be re-run without re-running the split cell first.
X_test = X_test_df.values

# Model Building

In [9]:
# Ten stratified, shuffled folds; the fixed seed keeps the fold assignment
# identical every time this splitter is used.
cv = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=26,
)

In [10]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB

# Candidate models as (display name, unfitted estimator) pairs.
# Every estimator that accepts a seed uses random_state=45.
classifiers = [
    (
        'Logistic Regression (liblinear)',
        LogisticRegression(solver='liblinear', random_state=45, max_iter=1000),
    ),
    (
        'Logistic Regression (lbfgs)',
        LogisticRegression(solver='lbfgs', random_state=45, max_iter=1000),
    ),
    (
        'Random Forest',
        RandomForestClassifier(random_state=45),
    ),
    (
        'GaussianNB',
        GaussianNB(),
    ),
    (
        'MultinomialNB',
        MultinomialNB(),
    ),
    (
        'SGDClassifier',
        SGDClassifier(random_state=45),
    ),
]

# Evaluate every candidate model with the shared CV splitter.
#
# Two kinds of numbers are reported per model:
#   * fold-averaged scores: the mean of each metric over the CV test folds;
#   * per-class precision / recall / F1 computed once on the pooled
#     out-of-fold predictions.
#
# The fold-averaged metrics are listed here in the order they are printed.
cv_scoring = ["roc_auc", "f1", "precision", "recall", "accuracy"]

for name, model in classifiers:
    print('*****', name, '*****')

    # Out-of-fold prediction for every training row (each row is predicted by
    # a model that did not see it during fitting).
    y_pred = cross_val_predict(model, X=X_train, y=y_train, cv=cv)

    # Per-class metrics: treat each label in turn as the positive class.
    precision_0 = precision_score(y_train, y_pred, pos_label=0)
    recall_0 = recall_score(y_train, y_pred, pos_label=0)
    f1_0 = f1_score(y_train, y_pred, pos_label=0)

    precision_1 = precision_score(y_train, y_pred, pos_label=1)
    recall_1 = recall_score(y_train, y_pred, pos_label=1)
    f1_1 = f1_score(y_train, y_pred, pos_label=1)

    # A single multi-metric cross_validate call fits each fold once and scores
    # it with every metric, instead of refitting all folds once per metric via
    # repeated cross_val_score calls. The splitter and the estimators are
    # seeded, so the fold means are the same as before.
    cv_results = cross_validate(model, X=X_train, y=y_train, scoring=cv_scoring, cv=cv)
    for score in cv_scoring:
        cvs = cv_results['test_' + score].mean()
        print(score + " : "+ str(cvs))

    # Print metrics
    print(f'Class 0 - Precision: {precision_0:.3f}, Recall: {recall_0:.3f}, F1: {f1_0:.3f}')
    print(f'Class 1 - Precision: {precision_1:.3f}, Recall: {recall_1:.3f}, F1: {f1_1:.3f}')

    print('\n')


***** Logistic Regression (liblinear) *****
Class 0 - Precision: 0.871, Recall: 0.840, F1: 0.855
Class 1 - Precision: 0.846, Recall: 0.876, F1: 0.860


***** Logistic Regression (lbfgs) *****
Class 0 - Precision: 0.871, Recall: 0.840, F1: 0.855
Class 1 - Precision: 0.846, Recall: 0.876, F1: 0.860


***** Random Forest *****
Class 0 - Precision: 0.884, Recall: 0.850, F1: 0.867
Class 1 - Precision: 0.856, Recall: 0.889, F1: 0.872


***** GaussianNB *****
Class 0 - Precision: 0.892, Recall: 0.676, F1: 0.769
Class 1 - Precision: 0.739, Recall: 0.919, F1: 0.819


***** MultinomialNB *****
Class 0 - Precision: 0.882, Recall: 0.821, F1: 0.851
Class 1 - Precision: 0.833, Recall: 0.891, F1: 0.861


***** SGDClassifier *****
Class 0 - Precision: 0.886, Recall: 0.827, F1: 0.855
Class 1 - Precision: 0.838, Recall: 0.893, F1: 0.865


