In [7]:
import re
import string
import pickle
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from abbreivations import replacement_dict  # Ensure this import points to your actual replacement_dict

# Define the preprocessing function
def preprocess_text(text, replacement_dict):
    """Clean a raw tweet into the normalized string handed to the tokenizer.

    The steps run in this fixed order: strip ``@mentions``, drop standalone
    digit runs, strip URLs / mentions / hashtags, lowercase, delete ASCII
    punctuation, expand abbreviations from ``replacement_dict``, then drop
    English stopwords and lemmatize what is left.

    Args:
        text: Raw tweet text (``str``).
        replacement_dict: Mapping of abbreviation -> replacement text. Keys
            are matched as whole words against the already lowercased,
            punctuation-free text. An empty mapping (or empty-string keys)
            is accepted and simply results in no replacement for those keys.

    Returns:
        The cleaned text: the surviving lemmatized words joined by single
        spaces (an empty string when nothing survives).
    """
    def remove_usernames(tweet):
        # Drop @handle mentions.
        return re.sub(r'@\w+', '', tweet)

    def remove_sequence(tweet):
        # Drop tokens that consist solely of digits.
        return re.sub(r'\b\d+\b', '', tweet)

    def remove_urls(tweet):
        # Drop links, and any mentions/hashtags still present.
        return re.sub(r'http\S+|www\S+|https\S+|@\w+|#\w+', '', tweet, flags=re.MULTILINE)

    def remove_punctuations(text):
        # Delete every character listed in string.punctuation.
        return text.translate(str.maketrans('', '', string.punctuation))

    def replace_words(tweet, abbreviations_dict):
        # Empty keys are skipped: an empty alternation would match the empty
        # string at every word boundary, which raised KeyError('') for an
        # empty dict and spliced text in at every boundary for a '' key.
        keys = [key for key in abbreviations_dict if key]
        if not keys:
            return tweet
        pattern = re.compile(r'\b(' + '|'.join(re.escape(key) for key in keys) + r')\b')
        return pattern.sub(lambda match: abbreviations_dict[match.group()], tweet)

    def remove_stopwords(text):
        # A set gives constant-time membership tests instead of scanning the
        # stopword list once per word.
        stop_words = set(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        return " ".join(
            lemmatizer.lemmatize(word)
            for word in str(text).split()
            if word not in stop_words
        )

    text = remove_usernames(text)
    text = remove_sequence(text)
    text = remove_urls(text)
    text = text.lower()
    text = remove_punctuations(text)
    text = replace_words(text, replacement_dict)
    text = remove_stopwords(text)
    return text



In [9]:

# Load the trained model and tokenizer
# Both paths are relative, so they resolve against the current working
# directory of the kernel.
model = load_model('trained_model.h5')
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load a tokenizer.pkl that you produced yourself or otherwise trust.
# Presumably this is the tokenizer fitted during training — confirm.
with open('tokenizer.pkl', 'rb') as handle:
    tokenizer = pickle.load(handle)

# Define the prediction function
def predict_text(text, maxlen=100):
    """Classify a single piece of text as hate speech or not.

    The text is cleaned with ``preprocess_text`` (using the module-level
    ``replacement_dict``), converted to an integer sequence by the
    module-level ``tokenizer``, passed through ``pad_sequences`` and scored
    by the module-level ``model``. The index of the highest-scoring output
    is mapped to a label.

    Args:
        text: Raw input text.
        maxlen: Value forwarded to ``pad_sequences`` as ``maxlen``. It must
            be the same sequence length that was used during training;
            defaults to 100, the value that was previously hard-coded.

    Returns:
        ``'Hate Speech'`` or ``'Non Hate Speech'``.

    Raises:
        KeyError: If the highest-scoring output index is neither 0 nor 1.
    """
    preprocessed_text = preprocess_text(text, replacement_dict)
    text_seq = tokenizer.texts_to_sequences([preprocessed_text])
    text_pad = pad_sequences(text_seq, maxlen=maxlen)
    prediction = model.predict(text_pad)
    # Only one text is submitted, so take the first (only) row's class index
    # and convert it to a plain int for the label lookup.
    predicted_class = int(np.argmax(prediction, axis=1)[0])
    label_mapping = {0: 'Non Hate Speech', 1: 'Hate Speech'}
    return label_mapping[predicted_class]

# Example usage
# Classify one sample sentence and print the resulting label.
# NOTE(review): the output recorded with this notebook labels this benign
# sentence "Hate Speech". Presumably the label order in predict_text or the
# preprocessing differs from what was used at training time — verify.
text = "i love u"
predicted_label = predict_text(text)
print(f"Predicted Label: {predicted_label}")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 782ms/step
Predicted Label: Hate Speech
