In [None]:
pip install imbalanced-learn




## Load the Data

In [None]:
import pandas as pd

# Load the dataset
df = pd.read_csv('/content/preprocessed_dataset.csv')

# Define the text and target columns
text_column = 'cleaned_comment'
target_column = 'labels'


## Apply Text Preprocessing

In [None]:
import nltk

# Download the stopwords resource
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Download the stopwords resource
nltk.download('stopwords')

# Define a function to clean and preprocess the text data
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove punctuation and digits
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens if word not in stop_words]
    # Join tokens back into a single string
    text = ' '.join(filtered_tokens)
    return text

# Apply text preprocessing
df[text_column] = df[text_column].apply(preprocess_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Vectorize the Text Data Using TF-IDF

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split



# Vectorize the text data using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X = tfidf_vectorizer.fit_transform(df[text_column])
y = df[target_column]



## Apply SMOTE to the TF-IDF Vectorized Data

In [None]:
# Apply SMOTE to the TF-IDF vectorized data
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Now X_train, X_test, y_train, and y_test are ready for model training and evaluation


##  Build and Evaluate Machine Learning Models

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

def build_and_evaluate_ml_model(X_train, X_test, y_train, y_test, model_type='random_forest'):
    if model_type == 'random_forest':
        model = RandomForestClassifier(random_state=42)
    elif model_type == 'logistic_regression':
        model = LogisticRegression(max_iter=1000, random_state=42)
    elif model_type == 'svm':
        model = SVC(random_state=42)
    else:
        raise ValueError("Unsupported model type. Choose from 'random_forest', 'logistic_regression', or 'svm'.")

    # Train the model
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Evaluate the model
    print(f"Model: {model_type}")
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    print(classification_report(y_test, y_pred))

    return model



In [None]:
# function calling
rf_model = build_and_evaluate_ml_model(X_train, X_test, y_train, y_test, model_type='random_forest')
lr_model = build_and_evaluate_ml_model(X_train, X_test, y_train, y_test, model_type='logistic_regression')
svm_model = build_and_evaluate_ml_model(X_train, X_test, y_train, y_test, model_type='svm')


Model: random_forest
Accuracy: 0.7660924750679964
              precision    recall  f1-score   support

           0       0.75      0.78      0.77      2189
           1       0.78      0.75      0.76      2223

    accuracy                           0.77      4412
   macro avg       0.77      0.77      0.77      4412
weighted avg       0.77      0.77      0.77      4412

Model: logistic_regression
Accuracy: 0.772438803263826
              precision    recall  f1-score   support

           0       0.77      0.77      0.77      2189
           1       0.78      0.77      0.77      2223

    accuracy                           0.77      4412
   macro avg       0.77      0.77      0.77      4412
weighted avg       0.77      0.77      0.77      4412

Model: svm
Accuracy: 0.8125566636446057
              precision    recall  f1-score   support

           0       0.79      0.85      0.82      2189
           1       0.84      0.78      0.81      2223

    accuracy                         

## Hyperparameter Tuning and Model Evaluation

In [None]:
import joblib
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from scipy.stats import uniform, loguniform

# Function to perform hyperparameter tuning and evaluate models
def tune_and_evaluate_model(X_train, X_test, y_train, y_test, model, param_dist):
    with joblib.parallel_backend('threading'):
        random_search = RandomizedSearchCV(estimator=model, param_distributions=param_dist, n_iter=20, cv=3, n_jobs=-1, verbose=2, scoring='accuracy', random_state=42)
        random_search.fit(X_train, y_train)

    best_model = random_search.best_estimator_
    y_pred = best_model.predict(X_test)

    print(f"Best Model: {best_model}")
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    print(classification_report(y_test, y_pred))

    return best_model

# SVM Hyperparameter Tuning with RandomizedSearchCV
svm_param_dist = {
    'C': loguniform(0.1, 100),
    'gamma': loguniform(0.001, 1),
    'kernel': ['rbf', 'linear']
}

svm_best_model = tune_and_evaluate_model(X_train, X_test, y_train, y_test, SVC(random_state=42), svm_param_dist)


Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] END C=1.3292918943162166, gamma=0.711447600934342, kernel=rbf; total time=  29.4s
[CV] END C=1.3292918943162166, gamma=0.711447600934342, kernel=rbf; total time=  29.8s
[CV] END C=1.3292918943162166, gamma=0.711447600934342, kernel=rbf; total time=  31.6s
[CV] END C=21.830968390524596, gamma=0.061737703947045704, kernel=linear; total time=  50.2s
[CV] END C=21.830968390524596, gamma=0.061737703947045704, kernel=linear; total time=  43.8s
[CV] END C=0.2937538457632829, gamma=0.0014936568554617625, kernel=linear; total time=  20.7s
[CV] END C=21.830968390524596, gamma=0.061737703947045704, kernel=linear; total time=  50.0s
[CV] END C=0.2937538457632829, gamma=0.0014936568554617625, kernel=linear; total time=  17.6s
[CV] END C=0.2937538457632829, gamma=0.0014936568554617625, kernel=linear; total time=  19.8s
[CV] END C=1.0025956902289566, gamma=0.0026828750938254387, kernel=rbf; total time=  28.9s
[CV] END C=1.0025956902289

### Conclusion

- The SVM model outperforms both the Random Forest and Logistic Regression models in terms of accuracy and F1-score.
- SVM shows a balanced performance with good precision and recall for both classes (0 and 1), indicating robust classification capabilities.
- Random Forest and Logistic Regression models also perform reasonably well, but SVM provides slightly better performance metrics across accuracy, precision, recall, and F1-score.
  
In summary, based on this evaluation, the SVM model is recommended for this classification task due to its superior performance metrics compared to Random Forest and Logistic Regression models.





