In [1]:
from nltk.util import pr
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import re
import nltk
stemmer = nltk.SnowballStemmer("english")
from nltk.corpus import stopwords
import string

In [2]:
# The stop-word list ships as a separate NLTK corpus. Check for it first and
# download it when missing, so the notebook also runs from a fresh environment
# instead of failing with LookupError.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Stored as a set because clean() tests membership once per token.
stopword=set(stopwords.words('english'))

In [3]:
# Load the tweet dataset from the working directory; later cells read its
# "class" and "tweet" columns.
data = pd.read_csv("twitter.csv")

In [4]:
# Readable name for each numeric code found in the "class" column.
class_names = {
    0: "Hate Speech",
    1: "Offensive Language",
    2: "Nor Hate neither offensive",
}
data["labels"] = data["class"].map(class_names)

In [5]:
# Keep only the tweet text and its readable label; every other column is dropped.
data = data[["tweet", "labels"]]

In [6]:
#Cleaning text in tweets
def clean(text, stop_words=None, stem=None):
    """Normalise one tweet into a space-separated string of stemmed tokens.

    Steps, in order: lowercase; drop ``[...]`` spans, URLs and ``<...>`` tags;
    strip ASCII punctuation; drop every token that contains a digit; split on
    whitespace; remove stop words; stem what is left.

    Parameters
    ----------
    text : object
        Raw tweet. It is passed through ``str()`` first, so non-string values
        are cleaned as their string form.
    stop_words : collection of str, optional
        Tokens to discard. Defaults to the notebook-level ``stopword`` set.
    stem : callable, optional
        Function mapping a token to its stem. Defaults to ``stemmer.stem``
        from the notebook-level ``stemmer``.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ("" when nothing is left).
    """
    # Resolve the defaults at call time so the function keeps working with the
    # notebook globals while staying usable (and testable) without them.
    if stop_words is None:
        stop_words = stopword
    if stem is None:
        stem = stemmer.stem

    text = str(text).lower()
    # Raw strings keep the regex escapes (\[, \S, \w, \d) from being read as
    # invalid string escapes, which newer Python versions warn about.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # split() with no argument breaks on any run of whitespace. Previously "\n"
    # was deleted outright, which glued the words on either side of a line
    # break into one token, and split(' ') left empty tokens behind.
    tokens = [stem(word) for word in text.split() if word not in stop_words]
    return " ".join(tokens)
# Overwrite the raw tweets with their cleaned form. The raw text is not kept,
# so re-running this cell cleans already-cleaned text.
data["tweet"] = data["tweet"].apply(clean)

In [7]:
# Pull the cleaned text and its label out of the frame as NumPy arrays.
x, y = (np.array(data[column]) for column in ("tweet", "labels"))

# Learn the vocabulary and build the document-term count matrix in one pass.
cv = CountVectorizer()
X = cv.fit_transform(x)


In [8]:
# Hold out 33% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Seed the tree as well. Without random_state the fitted tree, and therefore
# the reported metrics, can differ from run to run even with an identical split.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

In [9]:
# Predict labels for the held-out rows.
# NOTE(review): y_pred is not read by any later cell shown here; the evaluation
# cell recomputes the same predictions as y_pred_test.
y_pred = clf.predict(X_test)

In [10]:
# Model evaluation: score the tree on the rows it was fitted on and on the
# held-out rows, so the gap between the two reports is visible side by side.
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

for heading, y_true, y_hat in (
    ("Training Classification Report:", y_train, y_pred_train),
    ("Testing Classification Report:", y_test, y_pred_test),
):
    print(heading)
    print(classification_report(y_true, y_hat))

Training Classification Report:
                            precision    recall  f1-score   support

               Hate Speech       0.99      1.00      1.00      1012
Nor Hate neither offensive       1.00      1.00      1.00      2779
        Offensive Language       1.00      1.00      1.00     12834

                  accuracy                           1.00     16625
                 macro avg       1.00      1.00      1.00     16625
              weighted avg       1.00      1.00      1.00     16625

Testing Classification Report:
                            precision    recall  f1-score   support

               Hate Speech       0.33      0.34      0.34       432
Nor Hate neither offensive       0.81      0.79      0.80      1386
        Offensive Language       0.92      0.92      0.92      6371

                  accuracy                           0.87      8189
                 macro avg       0.69      0.69      0.69      8189
              weighted avg       0.87      0.87 

In [11]:
import pickle
# from sklearn.externals import joblib
import joblib

# The classifier only understands the count features produced by `cv`, so the
# fitted vectorizer is saved next to it. Without it the saved model cannot be
# applied to new text outside this notebook.
vectorizer_filename = 'SD_vectorizer.pkl'
joblib.dump(cv, vectorizer_filename)

# Dumped last so the cell still displays the classifier's file list.
filename = 'SD.pkl'
joblib.dump(clf, filename)

['SD.pkl']

In [12]:
# Sample prediction: push one hand-written sentence through the same
# clean -> vectorize -> classify path used for the dataset.
sample = "Let's unite and kill all the people who are protesting against the government"
sample_cleaned = clean(sample)
sample_counts = cv.transform([sample_cleaned])  # sparse matrix with a single row
sample_vectorized = sample_counts.toarray()
prediction = clf.predict(sample_vectorized)
print("Sample Prediction:", prediction)


Sample Prediction: ['Hate Speech']
