<a href="https://colab.research.google.com/github/prasadanvekar/elvtrdocs/blob/main/MLTextModeration_youtubecomments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Objective: implement text moderation in a Jupyter notebook.

# First step: this implementation uses the YouTube toxic comments dataset from Kaggle:
# https://www.kaggle.com/datasets/reihanenamdari/youtube-toxicity-data?resource=download

# Second step: upload the dataset to the Google Colab environment as a prerequisite.


In [16]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load Data
DATA_PATH = "youtoxic_english_1000.csv"

# Fail fast with an actionable message. Silently skipping the load would leave
# `df` undefined (or, worse, bound to a stale frame from an earlier run of the
# kernel) and the error would only surface further down.
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(
        f"Dataset not found at '{DATA_PATH}'. Download it from Kaggle "
        "(reihanenamdari/youtube-toxicity-data) and upload it to the "
        "working directory before running this cell."
    )

# read_csv opens and closes the file itself, so no separate file handle is needed.
df = pd.read_csv(DATA_PATH)

# Display first few rows to validate the data
print(df.head(5))

# Check for missing values
print(df.isnull().sum())

# Drop rows with missing values.
# Reassigning (instead of inplace=True) keeps the step a plain transformation.
df = df.dropna()

# Check for duplicate rows
print(df.duplicated().sum())

# Drop duplicate rows
df = df.drop_duplicates()

# Check for class imbalance: number of True and False values in the IsToxic column
print(df['IsToxic'].value_counts())

# Print the column names and verify that the 'Text' column (note the capital T),
# which the preprocessing step reads, is actually present.
print(df.columns)
if 'Text' not in df.columns:
    raise KeyError("Expected a 'Text' column in the dataset; found: " + ", ".join(map(str, df.columns)))


# Download NLTK resources
import nltk

# 'stopwords' feeds the stop-word filter and 'punkt' is the tokenizer model used
# by older NLTK releases. Recent NLTK releases load the tokenizer data from
# 'punkt_tab' instead, so fetch it as well to keep word_tokenize working on a
# fresh environment.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Preprocess text
# Cache for the stop-word set so it is read from the NLTK corpus once,
# not once per processed comment.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the English stop-word set, loading it from NLTK on first use only."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def preprocess_text(Text):
    """Normalize one comment for vectorization.

    The text is tokenized with ``word_tokenize``; tokens that are not purely
    alphabetic (punctuation, numbers, mixed tokens) are dropped, the remaining
    tokens are lowercased, English stop words are removed, and the survivors
    are joined back together with single spaces.

    Args:
        Text: The raw comment string.

    Returns:
        The cleaned, space-separated token string (may be empty).
    """
    stop_words = _english_stop_words()
    # Filter on the original token, then lowercase, so the stop-word check
    # below always compares lowercase against lowercase.
    lowered = (word.lower() for word in word_tokenize(Text) if word.isalpha())
    return ' '.join(word for word in lowered if word not in stop_words)

# Tunable settings for the split and the vectorizer
TEST_FRACTION = 0.2
SPLIT_SEED = 42
VOCABULARY_SIZE = 1000  # adjust as needed

# Clean every comment and store the result next to the raw text
df['processed_text'] = df['Text'].apply(preprocess_text)

# Hold out part of the data for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    df['processed_text'],
    df['IsToxic'],
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

# TF-IDF features: vocabulary and IDF weights are learned from the training
# split only, then reused unchanged on the test split
tfidf_vectorizer = TfidfVectorizer(max_features=VOCABULARY_SIZE)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Fit a logistic regression classifier (fit returns the estimator itself)
model = LogisticRegression().fit(X_train_tfidf, y_train)

# Predict labels for the held-out comments
y_pred = model.predict(X_test_tfidf)

# Overall accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Per-class precision / recall / F1
print(classification_report(y_test, y_pred))



              CommentId      VideoId  \
0  Ugg2KwwX0V8-aXgCoAEC  04kJtp6pVXI   
1  Ugg2s5AzSPioEXgCoAEC  04kJtp6pVXI   
2  Ugg3dWTOxryFfHgCoAEC  04kJtp6pVXI   
3  Ugg7Gd006w1MPngCoAEC  04kJtp6pVXI   
4  Ugg8FfTbbNF8IngCoAEC  04kJtp6pVXI   

                                                Text  IsToxic  IsAbusive  \
0  If only people would just take a step back and...    False      False   
1  Law enforcement is not trained to shoot to app...     True       True   
2  \nDont you reckon them 'black lives matter' ba...     True       True   
3  There are a very large number of people who do...    False      False   
4  The Arab dude is absolutely right, he should h...    False      False   

   IsThreat  IsProvocative  IsObscene  IsHatespeech  IsRacist  IsNationalist  \
0     False          False      False         False     False          False   
1     False          False      False         False     False          False   
2     False          False       True         False     False 

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Accuracy: 0.66
              precision    recall  f1-score   support

       False       0.60      0.81      0.69        93
        True       0.76      0.53      0.63       107

    accuracy                           0.66       200
   macro avg       0.68      0.67      0.66       200
weighted avg       0.69      0.66      0.66       200

