<a href="https://colab.research.google.com/github/saisiridasari/Portfolio-new/blob/main/Hate_Speech_Identifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import pandas as pd
import re
import joblib
import nltk

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Make sure every NLTK resource used below is available locally, downloading
# only the ones that cannot be found. Each entry pairs the lookup path given
# to nltk.data.find with the package name given to nltk.download.
for resource_path, package_name in (
    ('corpora/stopwords', 'stopwords'),
    ('tokenizers/punkt', 'punkt'),
    ('tokenizers/punkt_tab', 'punkt_tab'),
    ('corpora/wordnet', 'wordnet'),
):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_name)


# Text Preprocessing Class
class TextCleaner:
    """Turn raw text into a space-separated string of lemmatized tokens."""

    def __init__(self):
        # English stop words, minus "not" and "no" so those two tokens are
        # never filtered out (presumably to keep negation cues for the model).
        english_stop_words = set(stopwords.words('english'))
        self.stop_words = english_stop_words - {'not', 'no'}
        self.lemmatizer = WordNetLemmatizer()

    def clean_text(self, text):
        """Return the cleaned form of ``text``.

        Steps, in order: strip URLs ("http...", "www...") and @mentions,
        delete every character that is not an ASCII letter or whitespace,
        lowercase, tokenize, drop stop words and one-character tokens, then
        lemmatize what remains and join it with single spaces.
        """
        without_links = re.sub(r"http\S+|www\S+|@\w+", "", text)
        letters_only = re.sub(r"[^a-zA-Z\s]", "", without_links)
        lowered = letters_only.lower()

        lemmas = []
        for token in nltk.word_tokenize(lowered):
            # Skip stop words and single-character leftovers.
            if token in self.stop_words or len(token) <= 1:
                continue
            lemmas.append(self.lemmatizer.lemmatize(token))
        return " ".join(lemmas)


# Load and Prepare Data

def load_and_prepare_data(filepath):
    """Load a labelled tweet CSV and turn it into TF-IDF features.

    Args:
        filepath: Path (or anything ``pd.read_csv`` accepts) of a CSV with a
            ``tweet`` text column and a ``class`` label column.

    Returns:
        A tuple ``(X, y, vectorizer, cleaner)``: the TF-IDF feature matrix
        produced by ``fit_transform``, the ``class`` column as labels, the
        fitted ``TfidfVectorizer``, and the ``TextCleaner`` used for
        preprocessing (so callers can clean new text the same way).

    Raises:
        ValueError: If the CSV lacks a ``tweet`` or ``class`` column.
    """
    df = pd.read_csv(filepath)

    # Ensure required columns exist
    if 'tweet' not in df.columns or 'class' not in df.columns:
        raise ValueError("CSV must contain 'tweet' and 'class' columns")

    # Drop rows missing either the text or the label: an unlabeled row would
    # otherwise reach the stratified split / model fit as a NaN target.
    df = df.dropna(subset=['tweet', 'class'])

    cleaner = TextCleaner()
    # Coerce to str so a cell parsed as a number does not crash the regex
    # based cleaning; real strings are unaffected.
    df['clean_text'] = df['tweet'].astype(str).apply(cleaner.clean_text)

    # NOTE(review): the vectorizer is fitted on the whole corpus here, before
    # train_and_evaluate splits off a test set, so vocabulary/IDF statistics
    # include the held-out rows. Reported metrics may be slightly optimistic.
    vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
    X = vectorizer.fit_transform(df['clean_text'])
    y = df['class']

    return X, y, vectorizer, cleaner


# Train Model

def train_and_evaluate(X, y):
    """Fit a class-balanced logistic regression and report held-out metrics.

    The data is split 80/20 with stratification on ``y`` and a fixed random
    seed. The classification report and the accuracy on the 20% test part
    are printed to stdout.

    Args:
        X: Feature matrix.
        y: Labels aligned with the rows of ``X``.

    Returns:
        The fitted ``LogisticRegression`` model.
    """
    split = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
    features_train, features_test, labels_train, labels_test = split

    # class_weight='balanced' reweights classes inversely to their frequency.
    classifier = LogisticRegression(max_iter=1000, class_weight='balanced')
    classifier.fit(features_train, labels_train)

    predicted = classifier.predict(features_test)
    report = classification_report(labels_test, predicted)
    print(report)
    accuracy = accuracy_score(labels_test, predicted)
    print(f"Accuracy: {accuracy:.4f}")

    return classifier

# Prediction

def predict(text, model, vectorizer, cleaner):
    """Classify one piece of text and report the model's confidence.

    Args:
        text: Raw input text.
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        vectorizer: Fitted vectorizer exposing ``transform``.
        cleaner: Object exposing ``clean_text`` (the same preprocessing used
            at training time).

    Returns:
        A dict with ``"label"`` (human-readable class name) and
        ``"confidence"`` (the highest predicted class probability, as a
        Python float).
    """
    # Numeric class id -> display name.
    class_names = {0: "Hate Speech", 1: "Offensive", 2: "Neither"}

    features = vectorizer.transform([cleaner.clean_text(text)])
    class_id = model.predict(features)[0]
    top_probability = model.predict_proba(features).max()

    return dict(label=class_names[class_id], confidence=float(top_probability))


# Main Execution

if __name__ == "__main__":
    # Read the labelled CSV and build TF-IDF features; the fitted vectorizer
    # and the cleaner are kept so new text can be processed the same way.
    X, y, vectorizer, cleaner = load_and_prepare_data("labeled_data.csv")

    # Fit the classifier; this also prints the evaluation metrics.
    model = train_and_evaluate(X, y)

    # A few hand-written inputs for a quick smoke test of the pipeline.
    sample_texts = [
        "I hope you have a great day!",
        "Let’s meet up for lunch tomorrow.",
        "You're such an idiot, get lost!",
        "Shut up already, no one cares!",
        "All [ethnic group] are criminals!",
        "I hate all [religion] people, they're disgusting.",
        "People like you ruin this country!",
        "Congratulations on your achievement!",
    ]

    print("\nSample Predictions:")
    for sample in sample_texts:
        outcome = predict(sample, model, vectorizer, cleaner)
        print(f"Text: {sample} -> Prediction: {outcome}")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


              precision    recall  f1-score   support

           0       0.32      0.62      0.43       286
           1       0.97      0.85      0.91      3838
           2       0.76      0.95      0.84       833

    accuracy                           0.85      4957
   macro avg       0.68      0.81      0.72      4957
weighted avg       0.90      0.85      0.87      4957

Accuracy: 0.8525

Sample Predictions:
Text: I hope you have a great day! -> Prediction: {'label': 'Neither', 'confidence': 0.8104424493789826}
Text: Let’s meet up for lunch tomorrow. -> Prediction: {'label': 'Neither', 'confidence': 0.6595426366726486}
Text: You're such an idiot, get lost! -> Prediction: {'label': 'Offensive', 'confidence': 0.37771679702426164}
Text: Shut up already, no one cares! -> Prediction: {'label': 'Offensive', 'confidence': 0.4978445162482476}
Text: All [ethnic group] are criminals! -> Prediction: {'label': 'Neither', 'confidence': 0.7562528684033519}
Text: I hate all [religion] people, 

In [12]:
import pickle

# Pickle the fitted classifier and the fitted TF-IDF vectorizer to disk.
# Both are needed later to score new text.
artifacts = {
    "hate_speech_model.pkl": model,
    "tfidf_vectorizer.pkl": vectorizer,
}
for artifact_name, artifact in artifacts.items():
    with open(artifact_name, "wb") as handle:
        pickle.dump(artifact, handle)

from google.colab import files

# Hand both files to the browser for download from the Colab runtime.
for artifact_name in artifacts:
    files.download(artifact_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>