# ML Models

In [6]:
#import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
import pickle

In [7]:
# Load the cleaned dataset
df = pd.read_csv('C:\\group-1-main\\Data-Preprocessing\\cleaned_data.csv')
df['tweet'] = df['tweet'].astype(str)

## TF - IDF ENCODING

In [8]:
def tfidf_encoding(df, text_column):
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(df[text_column])
    return tfidf_matrix

# Apply TF-IDF encoding
text_column = 'tweet'
tfidf_matrix = tfidf_encoding(df, text_column)

## Prepare data for modeling

In [9]:
X = tfidf_matrix
y = df['class']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Address class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

## Function to train and evaluvate models

In [10]:
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred, target_names=['Normal', 'Hate', 'Offensive'])
    accuracy = accuracy_score(y_test, y_pred)
    return report, accuracy

# Initialize models
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Support Vector Machine': SVC(random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42)
}

# Evaluate models
results = {}
for model_name, model in models.items():
    report, accuracy = evaluate_model(model, X_train_res, X_test, y_train_res, y_test)
    results[model_name] = {'report': report, 'accuracy': accuracy}
    print(f"Results for {model_name}:")
    print(report)
    print(f"Accuracy: {accuracy}\n")




Results for Logistic Regression:
              precision    recall  f1-score   support

      Normal       0.33      0.49      0.39       282
        Hate       0.96      0.88      0.92      3798
   Offensive       0.76      0.92      0.84       874

    accuracy                           0.86      4954
   macro avg       0.68      0.76      0.71      4954
weighted avg       0.89      0.86      0.87      4954

Accuracy: 0.8625353249899071

Results for Support Vector Machine:
              precision    recall  f1-score   support

      Normal       0.49      0.13      0.21       282
        Hate       0.91      0.96      0.94      3798
   Offensive       0.84      0.83      0.83       874

    accuracy                           0.89      4954
   macro avg       0.75      0.64      0.66      4954
weighted avg       0.87      0.89      0.88      4954

Accuracy: 0.8911990310859911

Results for Random Forest:
              precision    recall  f1-score   support

      Normal       0.48    

## Save the best performing model

In [11]:
best_model_name = max(results, key=lambda k: results[k]['accuracy'])
best_model = models[best_model_name]
with open('best_ml_model.pkl', 'wb') as file:
    pickle.dump(best_model, file)

print(f"Best model ({best_model_name}) saved to best_ml_model.pkl")

Best model (Random Forest) saved to best_ml_model.pkl
