### Load the Data

In [35]:
pip install imbalanced-learn




In [36]:
import pandas as pd

# Load the dataset
df = pd.read_csv('/content/preprocessed_dataset.csv')

# Define the text and target columns
text_column = 'cleaned_comment'
target_column = 'labels'


### Preprocess the Text Data

In [9]:
import nltk

# Download the stopwords resource
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [10]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Download the stopwords resource
nltk.download('stopwords')

# Define a function to clean and preprocess the text data
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove punctuation and digits
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens if word not in stop_words]
    # Join tokens back into a single string
    text = ' '.join(filtered_tokens)
    return text

# Apply text preprocessing
df[text_column] = df[text_column].apply(preprocess_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Vectorize the Text Data Using TF-IDF

In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorize the text data using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X = tfidf_vectorizer.fit_transform(df[text_column])
y = df[target_column]


### Handling Imbalanced Data for Machine Learning Models

In [40]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Apply SMOTE to the TF-IDF vectorized data
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_tfidf, y)

# Split the data into training and testing sets
X_train_ml, X_test_ml, y_train_ml, y_test_ml = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)


###  Function to Build and Evaluate Machine Learning Models

In [41]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

def build_and_evaluate_ml_model(X_train, X_test, y_train, y_test, model_type='random_forest'):
    if model_type == 'random_forest':
        model = RandomForestClassifier(random_state=42)
    elif model_type == 'logistic_regression':
        model = LogisticRegression(max_iter=1000, random_state=42)
    elif model_type == 'svm':
        model = SVC(random_state=42)
    else:
        raise ValueError("Unsupported model type. Choose from 'random_forest', 'logistic_regression', or 'svm'.")

    # Train the model
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Evaluate the model
    print(f"Model: {model_type}")
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    print(classification_report(y_test, y_pred))

    return model

# # Example usage
# rf_model = build_and_evaluate_ml_model(X_train_ml, X_test_ml, y_train_ml, y_test_ml, model_type='random_forest')
# lr_model = build_and_evaluate_ml_model(X_train_ml, X_test_ml, y_train_ml, y_test_ml, model_type='logistic_regression')
# svm_model = build_and_evaluate_ml_model(X_train_ml, X_test_ml, y_train_ml, y_test_ml, model_type='svm')


### Handling Imbalanced Data for Deep Learning Models
 Tokenize and Pad the Text Dat

In [42]:
# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences
# from imblearn.over_sampling import SMOTE

# # Tokenize the text data
# tokenizer = Tokenizer(num_words=5000)
# tokenizer.fit_on_texts(df[text_column])
# X_tokenized = tokenizer.texts_to_sequences(df[text_column])

# # Pad the sequences
# maxlen = 100
# X_padded = pad_sequences(X_tokenized, padding='post', maxlen=maxlen)

# # Apply SMOTE to the padded sequences
# smote = SMOTE(random_state=42)
# X_resampled_dl, y_resampled_dl = smote.fit_resample(X_padded, df[target_column])

# # Split the data into training and testing sets
# X_train_dl, X_test_dl, y_train_dl, y_test_dl = train_test_split(X_resampled_dl, y_resampled_dl, test_size=0.2, random_state=42)


In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Vectorize the text data using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf_vectorizer.fit_transform(df[text_column])
y = df[target_column]

# Apply SMOTE to the data
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_tfidf, y)

# Split the data into training and testing sets
X_train_ml, X_test_ml, y_train_ml, y_test_ml = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)


### Function to Build and Evaluate Deep Learning Models

In [44]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from sklearn.preprocessing import LabelEncoder

def build_and_evaluate_dl_model(X_train, X_test, y_train, y_test):
    # Convert target to categorical if necessary
    label_encoder = LabelEncoder()
    y_train = label_encoder.fit_transform(y_train)
    y_test = label_encoder.transform(y_test)

    # Define the model
    model = Sequential([
        Embedding(input_dim=5000, output_dim=16, input_length=maxlen),
        GlobalAveragePooling1D(),
        Dense(24, activation='relu'),
        Dense(1, activation='sigmoid')
    ])

    # Compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Train the model
    model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2, verbose=1)

    # Evaluate the model
    loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
    print(f"Deep Learning Model Accuracy: {accuracy}")

    return model

# Example usage
#dl_model = build_and_evaluate_dl_model(X_train_dl, X_test_dl, y_train_dl, y_test_dl)


 ### Function Calls to Generate Models

In [45]:
# For Machine Learning models
rf_model = build_and_evaluate_ml_model(X_train, X_test, y_train, y_test, model_type='random_forest')
lr_model = build_and_evaluate_ml_model(X_train, X_test, y_train, y_test, model_type='logistic_regression')
svm_model = build_and_evaluate_ml_model(X_train, X_test, y_train, y_test, model_type='svm')



Model: random_forest
Accuracy: 0.7660924750679964
              precision    recall  f1-score   support

           0       0.75      0.78      0.77      2189
           1       0.78      0.75      0.76      2223

    accuracy                           0.77      4412
   macro avg       0.77      0.77      0.77      4412
weighted avg       0.77      0.77      0.77      4412

Model: logistic_regression
Accuracy: 0.772438803263826
              precision    recall  f1-score   support

           0       0.77      0.77      0.77      2189
           1       0.78      0.77      0.77      2223

    accuracy                           0.77      4412
   macro avg       0.77      0.77      0.77      4412
weighted avg       0.77      0.77      0.77      4412

Model: svm
Accuracy: 0.8125566636446057
              precision    recall  f1-score   support

           0       0.79      0.85      0.82      2189
           1       0.84      0.78      0.81      2223

    accuracy                         

In [47]:
# For Deep Learning model
dl_model = build_and_evaluate_dl_model(X_train_dl, X_test_dl, y_train_dl, y_test_dl)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Deep Learning Model Accuracy: 0.7436536550521851


Performing Varoius practices

In [25]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

def build_and_evaluate_ml_model(X_train, X_test, y_train, y_test, model_type='random_forest'):
    if model_type == 'random_forest':
        model = RandomForestClassifier(random_state=42)
    elif model_type == 'logistic_regression':
        model = LogisticRegression(max_iter=1000, random_state=42)
    elif model_type == 'svm':
        model = SVC(random_state=42)
    elif model_type == 'gradient_boosting':
        model = GradientBoostingClassifier(random_state=42)
    else:
        raise ValueError("Unsupported model type. Choose from 'random_forest', 'logistic_regression', 'svm', or 'gradient_boosting'.")

    # Train the model
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Evaluate the model
    print(f"Model: {model_type}")
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    print(classification_report(y_test, y_pred))

    return model

# Example usage
rf_model = build_and_evaluate_ml_model(X_train_ml, X_test_ml, y_train_ml, y_test_ml, model_type='random_forest')
lr_model = build_and_evaluate_ml_model(X_train_ml, X_test_ml, y_train_ml, y_test_ml, model_type='logistic_regression')
svm_model = build_and_evaluate_ml_model(X_train_ml, X_test_ml, y_train_ml, y_test_ml, model_type='svm')
gb_model = build_and_evaluate_ml_model(X_train_ml, X_test_ml, y_train_ml, y_test_ml, model_type='gradient_boosting')


Model: random_forest
Accuracy: 0.7660924750679964
              precision    recall  f1-score   support

           0       0.75      0.78      0.77      2189
           1       0.78      0.75      0.76      2223

    accuracy                           0.77      4412
   macro avg       0.77      0.77      0.77      4412
weighted avg       0.77      0.77      0.77      4412

Model: logistic_regression
Accuracy: 0.772438803263826
              precision    recall  f1-score   support

           0       0.77      0.77      0.77      2189
           1       0.78      0.77      0.77      2223

    accuracy                           0.77      4412
   macro avg       0.77      0.77      0.77      4412
weighted avg       0.77      0.77      0.77      4412

Model: svm
Accuracy: 0.8125566636446057
              precision    recall  f1-score   support

           0       0.79      0.85      0.82      2189
           1       0.84      0.78      0.81      2223

    accuracy                         

In [27]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import class_weight
import numpy as np

def build_and_evaluate_dl_model(X_train, X_test, y_train, y_test):
    # Convert target to categorical if necessary
    label_encoder = LabelEncoder()
    y_train = label_encoder.fit_transform(y_train)
    y_test = label_encoder.transform(y_test)

    # Calculate class weights
    class_weights = class_weight.compute_class_weight(
        class_weight='balanced',
        classes=np.unique(y_train),
        y=y_train
    )
    class_weights = {i: weight for i, weight in enumerate(class_weights)}

    # Define the model
    model = Sequential([
        Embedding(input_dim=5000, output_dim=16, input_length=maxlen),
        GlobalAveragePooling1D(),
        Dense(24, activation='relu'),
        Dense(1, activation='sigmoid')
    ])

    # Compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Train the model with class weights
    model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2, class_weight=class_weights, verbose=1)

    # Evaluate the model
    loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
    print(f"Deep Learning Model Accuracy: {accuracy}")

    return model

# Example usage
dl_model = build_and_evaluate_dl_model(X_train_dl, X_test_dl, y_train_dl, y_test_dl)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Deep Learning Model Accuracy: 0.7441070079803467


In [34]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import class_weight
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import BertTokenizer

# Function for WordPiece tokenization
def wordpiece_tokenize(texts, tokenizer, max_length):
    tokenized_texts = tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='np'
    )
    return tokenized_texts['input_ids']

def build_and_evaluate_dl_model(X_train, X_test, y_train, y_test, maxlen=100):
    # Convert target to categorical if necessary
    label_encoder = LabelEncoder()
    y_train = label_encoder.fit_transform(y_train)
    y_test = label_encoder.transform(y_test)

    # Ensure input data is in list of strings format
    X_train = [str(doc) for doc in X_train]
    X_test = [str(doc) for doc in X_test]

    # TF-IDF Vectorization
    tfidf_vectorizer = TfidfVectorizer(max_features=5000)
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train).toarray()
    X_test_tfidf = tfidf_vectorizer.transform(X_test).toarray()

    # WordPiece Tokenization
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    X_train_tokenized = wordpiece_tokenize(X_train, tokenizer, max_length=maxlen)
    X_test_tokenized = wordpiece_tokenize(X_test, tokenizer, max_length=maxlen)

    # Combine TF-IDF and Tokenized features (you may choose to use one or both)
    X_train_combined = np.hstack((X_train_tfidf, X_train_tokenized))
    X_test_combined = np.hstack((X_test_tfidf, X_test_tokenized))

    # Calculate class weights
    class_weights = class_weight.compute_class_weight(
        class_weight='balanced',
        classes=np.unique(y_train),
        y=y_train
    )
    class_weights = {i: weight for i, weight in enumerate(class_weights)}

    # Define the model
    model = Sequential([
        Dense(512, activation='relu', input_shape=(X_train_combined.shape[1],)),
        Dense(256, activation='relu'),
        Dense(128, activation='relu'),
        Dense(1, activation='sigmoid')
    ])

    # Compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Train the model with class weights
    model.fit(X_train_combined, y_train, epochs=10, batch_size=32, validation_split=0.2, class_weight=class_weights, verbose=1)

    # Evaluate the model
    y_pred = (model.predict(X_test_combined) > 0.5).astype("int32")
    accuracy = tf.keras.metrics.BinaryAccuracy()
    precision = tf.keras.metrics.Precision()
    recall = tf.keras.metrics.Recall()
    f1_score = tf.keras.metrics.Mean()

    accuracy.update_state(y_test, y_pred)
    precision.update_state(y_test, y_pred)
    recall.update_state(y_test, y_pred)
    f1 = 2 * (precision.result().numpy() * recall.result().numpy()) / (precision.result().numpy() + recall.result().numpy())
    f1_score.update_state(f1)

    print(f"Deep Learning Model Accuracy: {accuracy.result().numpy()}")
    print(f"Deep Learning Model Precision: {precision.result().numpy()}")
    print(f"Deep Learning Model Recall: {recall.result().numpy()}")
    print(f"Deep Learning Model F1 Score: {f1_score.result().numpy()}")

    return model

# Example usage
# Ensure X_train_dl and X_test_dl are lists of strings, and y_train_dl and y_test_dl are labels
dl_model = build_and_evaluate_dl_model(X_train_dl, X_test_dl, y_train_dl, y_test_dl)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Deep Learning Model Accuracy: 0.5
Deep Learning Model Precision: 0.5
Deep Learning Model Recall: 1.0
Deep Learning Model F1 Score: 0.6666666865348816
