In [26]:
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import re

# Load the dataset
data = pd.read_csv('/content/Hate Speech Detection in Arabic Urdu -  labeled_data.csv.csv')

# Preprocess the text
def preprocess(text):
    # Apply any necessary preprocessing steps such as removing special characters, normalization, etc.
    urdu_words_only = re.sub(r'[^\u0600-\u06FF\s]', '', text)
    return urdu_words_only.strip()

data['Tweet'] = data['Tweet'].apply(preprocess)

data["Labels"] = data["Class"].map({0:"Hate Speech", 1:"Offensive Language", 2:"No hate and Offensive Speech"})

# Split the data into training and test sets
X = data['Tweet']
y = data['Labels']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenize the text
tokenizer = nltk.tokenize.WordPunctTokenizer()
X_train_tokens = [tokenizer.tokenize(text) for text in X_train]
X_test_tokens = [tokenizer.tokenize(text) for text in X_test]

# Convert labels to categorical
label_encoder = LabelEncoder()
y_train = y_train.astype(str)
y_test = y_test.astype(str)
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)
num_classes = len(label_encoder.classes_)

# Convert tokens to sequences
max_sequence_length = 100  # Define the maximum sequence length
# Convert tokens to sequences and pad them
tokenizer = nltk.tokenize.WordPunctTokenizer()
from tensorflow.keras.preprocessing.text import Tokenizer

# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

# Convert tokens to sequences and pad them
X_train_sequences = pad_sequences(X_train_sequences, maxlen=max_sequence_length, padding='post')
X_test_sequences = pad_sequences(X_test_sequences, maxlen=max_sequence_length, padding='post')

# Update the vocabulary size
vocabulary_size = len(tokenizer.word_index) + 1

# Define the model architecture (LSTM in this example)
model = Sequential()
model.add(Embedding(input_dim=vocabulary_size, output_dim=100, input_length=max_sequence_length))
model.add(LSTM(units=128))
model.add(Dense(units=num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train_sequences, to_categorical(y_train_encoded, num_classes=num_classes), epochs=10, batch_size=32)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fa467271780>

In [27]:
# Make predictions
y_pred_probs = model.predict(X_test_sequences)
y_pred = y_pred_probs.argmax(axis=1)

# Convert predictions back to original labels
y_pred_labels = label_encoder.inverse_transform(y_pred)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred_labels)
precision = precision_score(y_test, y_pred_labels, average='weighted')
recall = recall_score(y_test, y_pred_labels, average='weighted')
f1 = f1_score(y_test, y_pred_labels, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)


Accuracy: 0.7730482146459552
Precision: 0.5976035421672988
Recall: 0.7730482146459552
F1-score: 0.6740973395206054


  _warn_prf(average, modifier, msg_start, len(result))


In [36]:
text = "وہ اچھا لڑکا ہے"  # The text you want to classify

# Preprocess the text
preprocessed_text = preprocess(text)

# Tokenize and pad the preprocessed text
text_tokens = tokenizer.texts_to_sequences([preprocessed_text])
text_tokens_padded = pad_sequences(text_tokens, maxlen=max_sequence_length, padding='post')

# Make prediction
prediction_probs = model.predict(text_tokens_padded)
predicted_class_index = prediction_probs.argmax(axis=1)[0]

# Map the predicted class index to the corresponding label
predicted_class = label_encoder.inverse_transform([predicted_class_index])[0]

# Print the predicted class
print("Predicted Class:", predicted_class)


Predicted Class: Offensive Language


In [51]:
text = preprocess( " ایک عورت کے طور پر آپ کو اپنے گھر کی صفائی کے بارے میں شکایت نہیں کرنی چاہیے۔ ایک آدمی ہونے کے ناطے آپ کو ہمیشہ کچرا اٹھانا چاہیے...")


In [52]:
sequences = tokenizer.texts_to_sequences([text])

In [53]:
sequences = tokenizer.texts_to_sequences([text])

In [54]:
# Make predictions
prediction_probs = model.predict(X_test_sequences)

# Convert probabilities to predicted labels
y_pred = prediction_probs.argmax(axis=1)

predicted_class = label_encoder.inverse_transform(prediction_probs.argmax(axis=1))




In [55]:
# if "Hate Speech" in predicted_class:
#     print("The text is considered hateful.")
# else:
#     print("The text is not considered hateful.")
if "Hate Speech" in predicted_class:
    print("The text is considered hateful.")
elif "Offensive Language" in predicted_class:
    print("The text is considered offensive.")
elif "No hate and Offensive Speech" in predicted_class:
    print("The text is not considered hateful or offensive.")
else:
    print("Unknown class.")


The text is considered offensive.
