In [1]:
from nltk.util import pr
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report
import joblib
import re
import nltk
from nltk.corpus import stopwords
import string

# Make sure the NLTK stop-word corpus is available, then build the two
# notebook-level helpers that clean() reads: `stemmer` and `stopword`.
nltk.download('stopwords')
stemmer = nltk.SnowballStemmer("english")
# Stored as a set so clean() can test token membership directly.
stopword = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Read the labelled tweet dataset from its hosted CSV and preview the first rows.
dataset_url = "https://raw.githubusercontent.com/fenago/datasets/main/twitter.csv"
data = pd.read_csv(dataset_url)
print(data.head())


   Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0           0      3            0                   0        3      2   
1           1      3            0                   3        0      1   
2           2      3            0                   3        0      1   
3           3      3            0                   2        1      1   
4           4      6            0                   6        0      1   

                                               tweet  
0  !!! RT @mayasolovely: As a woman you shouldn't...  
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...  
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...  
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...  
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...  


In [3]:
# Translate the numeric `class` codes into readable label strings.
class_names = {
    0: "Hate Speech",
    1: "Offensive Language",
    2: "No Hate and Offensive",
}
data["labels"] = data["class"].map(class_names)
print(data.head())

   Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0           0      3            0                   0        3      2   
1           1      3            0                   3        0      1   
2           2      3            0                   3        0      1   
3           3      3            0                   2        1      1   
4           4      6            0                   6        0      1   

                                               tweet                 labels  
0  !!! RT @mayasolovely: As a woman you shouldn't...  No Hate and Offensive  
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...     Offensive Language  
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...     Offensive Language  
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...     Offensive Language  
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...     Offensive Language  


In [4]:
# Keep only the model input ("tweet") and the target ("labels").
# The explicit .copy() makes `data` an independent frame, so the later
# assignment data["tweet"] = ... does not raise SettingWithCopyWarning.
data = data[["tweet", "labels"]].copy()
print(data.head())

                                               tweet                 labels
0  !!! RT @mayasolovely: As a woman you shouldn't...  No Hate and Offensive
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...     Offensive Language
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...     Offensive Language
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...     Offensive Language
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...     Offensive Language


In [5]:
# Number of rows labelled "Hate Speech"; .get(..., 0) falls back to 0 when the
# label does not occur at all.
hate_count = data["labels"].value_counts().get("Hate Speech", 0)

# The original message carried a stray apostrophe after the label name.
print(f"Number of data with Hate Speech: {hate_count}")

Number of data with Hate Speech': 1430


In [6]:
# Number of rows labelled "Offensive Language"; .get(..., 0) falls back to 0
# when the label does not occur at all.
offensive_count = data["labels"].value_counts().get("Offensive Language", 0)

# The original message carried a stray apostrophe after the label name.
print(f"Number of data with Offensive Language: {offensive_count}")

Number of data with Offensive Language': 19190


In [7]:
def clean(text):
    """Normalise one tweet into a space-separated string of stemmed tokens.

    Steps, in order: lowercase; remove ``[...]`` spans, URLs, ``<...>`` tags,
    ASCII punctuation, newlines and every token that contains a digit; drop
    English stop words; stem what is left.

    Reads the notebook-level ``stopword`` set and ``stemmer`` object.

    Args:
        text: Raw tweet. Any value is accepted; it is converted with ``str()``.

    Returns:
        The cleaned text. Tokens are split and re-joined on single spaces, so
        runs of spaces left behind by the removals survive as empty tokens.
    """
    text = str(text).lower()
    # Raw strings hand the backslashes to the regex engine untouched and avoid
    # Python's "invalid escape sequence" warning for \[, \S, \w and \d.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # NOTE(review): newlines are deleted rather than replaced by a space, so the
    # last word of one line fuses with the first word of the next. Kept as-is
    # so the features stay identical to the already-trained model.
    text = re.sub('\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # Stop-word filtering and stemming in a single pass; the tokens never
    # contain a space, so this matches the former join/split round trip.
    return " ".join(
        stemmer.stem(word) for word in text.split(' ') if word not in stopword
    )
# Replace every raw tweet with its cleaned form.  Re-running this cell cleans
# the already-cleaned text again, because the column is overwritten in place.
cleaned_tweets = data["tweet"].apply(clean)
data["tweet"] = cleaned_tweets

In [8]:
x = np.array(data["tweet"])
y = np.array(data["labels"])

# Bag-of-words features.
# NOTE(review): the vocabulary is fitted on the full corpus before the split,
# so tokens that only occur in the test rows still shape the feature space.
cv = CountVectorizer()
X = cv.fit_transform(x) # Fit the Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Seed the tree so a re-run reproduces the same model and metrics.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train,y_train)

# Persist the fitted vectorizer next to the model: the classifier alone cannot
# score raw text.  The model dump stays last so the cell output is unchanged.
joblib.dump(cv, 'count_vectorizer.joblib')
joblib.dump(clf, 'decision_tree_model.joblib')

['decision_tree_model.joblib']

In [9]:
# Predict a label for every row of the held-out split.
y_pred = clf.predict(X_test)

In [10]:
# Cross-tabulate the true labels against the predicted ones for the test split.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix:\n", conf_matrix)


Confusion Matrix:
 [[ 165   36  264]
 [  35 1122  222]
 [ 227  222 5886]]


In [11]:
# Per-class precision / recall / F1 summary for the test split.
class_report = classification_report(y_true=y_test, y_pred=y_pred)
print("\nClassification Report:\n", class_report)


Classification Report:
                        precision    recall  f1-score   support

          Hate Speech       0.39      0.35      0.37       465
No Hate and Offensive       0.81      0.81      0.81      1379
   Offensive Language       0.92      0.93      0.93      6335

             accuracy                           0.88      8179
            macro avg       0.71      0.70      0.70      8179
         weighted avg       0.87      0.88      0.88      8179



In [12]:
# NOTE(review): this cell repeats the earlier training cell.  It rebuilds the
# same features and split, then refits the classifier in memory.
x = np.array(data["tweet"])
y = np.array(data["labels"])

cv = CountVectorizer()
X = cv.fit_transform(x) # Fit the Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Seed the tree: an unseeded refit yields a different model on every run, so
# the sample predictions in the following cells would not be reproducible.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train,y_train)

In [13]:
sample = "We will kill you"
# Pass the sample through the same clean() used on the training tweets so its
# tokens line up with the fitted vocabulary.  The features get their own name
# instead of overwriting the `data` DataFrame, and stay sparse because
# predict() accepts the sparse matrix directly.
sample_features = cv.transform([clean(sample)])
print(clf.predict(sample_features))

['Hate Speech']


In [14]:
sample = "bitch you"
# Pass the sample through the same clean() used on the training tweets so its
# tokens line up with the fitted vocabulary.  The features get their own name
# instead of overwriting the `data` DataFrame, and stay sparse because
# predict() accepts the sparse matrix directly.
sample_features = cv.transform([clean(sample)])
print(clf.predict(sample_features))

['Offensive Language']


In [15]:
sample = "practice love and patience to live a good life!"
# Pass the sample through the same clean() used on the training tweets so its
# tokens line up with the fitted vocabulary (the training text was stemmed).
# The features get their own name instead of overwriting the `data` DataFrame,
# and stay sparse because predict() accepts the sparse matrix directly.
sample_features = cv.transform([clean(sample)])
print(clf.predict(sample_features))

['No Hate and Offensive']


In [16]:
# Reload the classifier that the training cell wrote to disk.
# NOTE: joblib.load unpickles the file, so only load artifacts you trust.
loaded_model = joblib.load('decision_tree_model.joblib')

In [17]:
# Score the held-out features with the reloaded model.
# NOTE(review): `predictions` is not read again in the cells shown here.
predictions = loaded_model.predict(X_test)

In [18]:
!pip install flask




In [19]:
from flask import Flask, request, jsonify
import joblib


In [20]:
# Rebind `clf` to the model stored on disk, so the /predict endpoint defined
# below uses the persisted classifier rather than the one refit in memory.
# NOTE: joblib.load unpickles the file, so only load artifacts you trust.
clf = joblib.load('decision_tree_model.joblib')


In [21]:
# Flask application object on which the /predict route is registered.
app = Flask(__name__)


In [22]:
@app.route('/predict', methods=['POST'])
def predict():
    """Classify the text carried in a JSON POST body.

    Expects a body such as ``{"text": "..."}`` and answers with
    ``{"prediction": "<label>"}``.  A body that is not a JSON object with a
    string ``text`` field gets a 400 response with an ``error`` message.

    Reads the notebook-level ``clean``, ``cv`` and ``clf`` objects.
    """
    # silent=True yields None for an unparsable body instead of raising, so
    # every malformed request is answered by the JSON error below.
    payload = request.get_json(force=True, silent=True)
    if not isinstance(payload, dict) or not isinstance(payload.get('text'), str):
        return jsonify({'error': "Request body must be a JSON object with a string 'text' field."}), 400

    # Apply the training-time preprocessing first; without it, unstemmed
    # words would not match the vocabulary the vectorizer was fitted on.
    input_text = cv.transform([clean(payload['text'])])

    # Make prediction using the loaded model
    prediction = clf.predict(input_text)[0]

    return jsonify({'prediction': prediction})


In [None]:
# Start Flask's built-in server on local port 5000.  app.run() blocks this
# cell until the server is interrupted.
if __name__ == '__main__':
    app.run(port=5000)


 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [04/Feb/2024 19:16:11] "GET / HTTP/1.1" 404 -
127.0.0.1 - - [04/Feb/2024 19:16:11] "GET /favicon.ico HTTP/1.1" 404 -
