In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score


In [3]:
# Load Abusive Language Detection Dataset
data1 = pd.read_csv('abusive tweets/abusive language detection/Tweets.csv',header=None)

In [4]:
data1.columns = ['label', 'tweet']
print(data1.columns)

Index(['label', 'tweet'], dtype='object')


In [5]:
data1.head()

Unnamed: 0,label,tweet
0,0,Only among Muslims can one find someone proudl...
1,0,@_sabanaqvi Only among Muslims can one find so...
2,0,"@megha_writes Muslim rapist ?"", ""truncated"": f..."
3,0,@peoplepower @ACLU A Muslim holding a placard ...
4,0,RT @Pad_Ban: Bohemians PC. Home to AFA thugs a...


In [6]:
data1.dtypes

label     int64
tweet    object
dtype: object

In [7]:
# Convert label to categorical type
data1['label'] = data1['label'].astype('category')

# Display the unique categories
print("Unique categories in the label column:", data1['label'].unique())

Unique categories in the label column: [0, 1, 2, 3, 4]
Categories (5, int64): [0, 1, 2, 3, 4]


In [8]:
# Split data into features (X) and target (y)
X = data1['tweet']
y = data1['label']

# Vectorize text data
vectorizer = TfidfVectorizer(max_features=5000)
X_vectorized = vectorizer.fit_transform(X)
print("\n=== Logistic Regression ===")

# Initialize Logistic Regression model
model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)

# Perform cross-validation
cv_scores = cross_val_score(model, X_vectorized, y, cv=5)

# Display cross-validation scores
print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", cv_scores.mean())

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Model evaluation
accuracy = accuracy_score(y_test, y_pred)
print("\nAccuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))



=== Logistic Regression ===
Cross-validation scores: [0.41550191 0.64972469 0.70478611 0.57983905 0.44260906]
Mean cross-validation score: 0.5584921643371452

Accuracy: 0.6937738246505718

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.78      0.79       478
           1       0.55      0.54      0.54       474
           2       0.88      0.83      0.86       480
           3       0.63      0.69      0.66       461
           4       0.63      0.62      0.62       468

    accuracy                           0.69      2361
   macro avg       0.70      0.69      0.69      2361
weighted avg       0.70      0.69      0.70      2361



In [9]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    
    # Remove special characters, URLs, and mentions
    text = re.sub(r'http\S+|www\S+|@[^\s]+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatize words
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # Join tokens back into text
    text = ' '.join(tokens)
    
    return text


In the background, this function takes raw text as input and applies a series of text preprocessing steps to clean and normalize the text data. These steps include converting text to lowercase, removing special characters, URLs, and mentions, tokenizing the text into words, removing stop words, lemmatizing words, and finally joining the processed tokens back into text format. This clean and preprocessed text can then be used for further analysis or modeling tasks.


In [10]:
# Preprocess the tweet text
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sivac\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Initialize model
nb_model = MultinomialNB()

# Perform cross-validation
cv_scores = cross_val_score(nb_model, X_vectorized, y, cv=5)
print("\n=== Naive Bayes (MultinomialNB) ===")
print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", cv_scores.mean())

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)

# Train the model
nb_model.fit(X_train, y_train)

# Predictions
y_pred = nb_model.predict(X_test)

# Model evaluation
accuracy = accuracy_score(y_test, y_pred)
print("\nAccuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))



=== Naive Bayes (MultinomialNB) ===
Cross-validation scores: [0.33883947 0.6154172  0.69589157 0.54807285 0.40152478]
Mean cross-validation score: 0.51994917407878

Accuracy: 0.6552308343922066

Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.80      0.72       478
           1       0.51      0.54      0.53       474
           2       0.88      0.76      0.82       480
           3       0.59      0.68      0.63       461
           4       0.71      0.48      0.57       468

    accuracy                           0.66      2361
   macro avg       0.67      0.65      0.65      2361
weighted avg       0.67      0.66      0.65      2361



In [12]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Initialize model
svm_model = SVC(probability=True)

# Perform cross-validation
cv_scores = cross_val_score(svm_model, X_vectorized, y, cv=5)
print("\n=== Support Vector Machine (SVM) ===")
print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", cv_scores.mean())

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)

# Train the model
svm_model.fit(X_train, y_train)

# Predictions
y_pred = svm_model.predict(X_test)

# Model evaluation
accuracy = accuracy_score(y_test, y_pred)
print("\nAccuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))



=== Support Vector Machine (SVM) ===
Cross-validation scores: [0.4180432  0.66073698 0.71452774 0.59042778 0.44176197]
Mean cross-validation score: 0.5650995340957221

Accuracy: 0.7132570944515036

Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.79      0.80       478
           1       0.56      0.61      0.58       474
           2       0.93      0.83      0.88       480
           3       0.67      0.70      0.68       461
           4       0.64      0.63      0.63       468

    accuracy                           0.71      2361
   macro avg       0.72      0.71      0.72      2361
weighted avg       0.72      0.71      0.72      2361



In [13]:
#tweet_text = 'Only among Muslims can one find someone proudly cooing that a rapist, murderer, and misogynist'
#tweet_text = "I love spending time with my family. #happy"
#tweet_text = "I can't believe he said that to me. So rude!"
tweet_text = "fuck you slut "

preprocessed_tweet = preprocess_text(tweet_text)

# Vectorize the preprocessed tweet using the same vectorizer used during training
vectorized_tweet = vectorizer.transform([preprocessed_tweet])

# Make prediction probabilities on the vectorized tweet
predicted_probabilities = svm_model.predict_proba(vectorized_tweet)[0]

# Get the category with the highest probability
predicted_category_index = np.argmax(predicted_probabilities)

# Map predicted category index to its corresponding label reflecting the level of abuse
category_mapping = {
    4: 'No Abuse',
    3: 'Low Level of Abuse',
    2: 'Moderate Level of Abuse',
    1: 'High Level of Abuse',
    0: 'Very High Level of Abuse'
}
predicted_label = category_mapping[predicted_category_index]

# Print the predicted label
print("Predicted level of abuse:", predicted_label)


Predicted level of abuse: Moderate Level of Abuse


In [14]:
from joblib import dump

# Save the trained SVM model
dump(svm_model, 'svm_model.joblib')


['svm_model.joblib']