In [34]:
import re
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Read only the two columns used downstream: the raw text and its label.
df = pd.read_csv(
    'data/HateSpeechDataset.csv',
    usecols=['Content', 'Label'],
)

In [3]:
# Preview the first five rows (the default) to sanity-check the load.
df.head(n=5)

Unnamed: 0,Content,Label
0,denial of normal the con be asked to comment o...,1
1,just by being able to tweet this insufferable ...,1
2,that is retarded you too cute to be single tha...,1
3,thought of a real badass mongol style declarat...,1
4,afro american basho,1


### Lowercasing the column names

In [4]:
# Normalise the headers to lower case so later cells can refer to
# 'content' / 'label' regardless of the capitalisation in the CSV.
lowered_collumns_name = list(map(str.lower, df.columns))
df.columns = lowered_collumns_name

In [5]:
# Flag every repeat of an already-seen text (the first occurrence is not
# flagged), then drop those rows and report how many were removed.
content_is_repeat = df['content'].duplicated(keep='first')
duplicated_rows_count = content_is_repeat.sum()
if duplicated_rows_count:
    df = df.drop_duplicates(subset=['content'])
    print(f'{duplicated_rows_count} row of duplicated value droped')

23345 row of duplicated value droped


In [6]:
# Punctuation that is replaced by a space before word extraction.
special_characters = [
    '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
    ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~',
]
# A "word" is a run of ASCII letters delimited by word boundaries, so tokens
# that touch digits or non-ASCII letters are not matched at all.
pattern = r'\b[a-zA-Z]+\b'


def remove_special_chars_from_text(text):
    """Return the purely alphabetic words of ``text`` joined by single spaces.

    ``text`` is converted with ``str()`` first, so non-string values are
    accepted. Every character listed in ``special_characters`` becomes a
    space, then only the matches of ``pattern`` are kept.
    """
    # Built per call so that the current contents of special_characters are
    # always honoured, exactly as a direct membership test would.
    to_space = str.maketrans({symbol: ' ' for symbol in special_characters})
    blanked = str(text).translate(to_space)
    words = re.findall(pattern, blanked)
    return ' '.join(words)

def remove_multiple_spaces(s):
    """Collapse every whitespace run in ``s`` to one space and trim both ends."""
    # split() with no argument discards leading/trailing whitespace and
    # treats any run of whitespace as a single separator.
    tokens = s.split()
    return ' '.join(tokens)

class Text_Util:
    """Stateless cleaning steps that operate on NumPy arrays of documents.

    Each public method validates that its argument is an ``np.ndarray`` and
    returns a new array, so the steps can be chained one after another.
    """

    @staticmethod
    def _require_ndarray(text, message):
        """Raise ``TypeError(message)`` unless ``text`` is an ``np.ndarray``."""
        if not isinstance(text, np.ndarray):
            # TypeError is a subclass of Exception, so callers that catch the
            # generic Exception raised previously keep working.
            raise TypeError(message)

    @staticmethod
    def _map_strings(func, text):
        """Apply the str -> str callable ``func`` to every element of ``text``."""
        # otypes is stated explicitly: without it np.vectorize has to call
        # func on the first element to infer the output dtype and raises
        # ValueError when the input array is empty.
        return np.vectorize(func, otypes=[str])(text)

    @staticmethod
    def lower_text_case(text):
        """Return ``text`` with every element lower-cased.

        Arrays whose dtype is not a NumPy string type are converted with
        ``astype(str)`` before lower-casing.

        Raises:
            TypeError: if ``text`` is not an ``np.ndarray``.
        """
        Text_Util._require_ndarray(
            text, 'entered an invalid dtype for the lower_text_case')

        if text.dtype.type is not np.str_:
            text = text.astype(str)
        return np.char.lower(text)

    @staticmethod
    def special_chars_remover(text):
        """Apply ``remove_special_chars_from_text`` to every element of ``text``.

        Raises:
            TypeError: if ``text`` is not an ``np.ndarray``.
        """
        Text_Util._require_ndarray(
            text, 'entered an invalid dtype for the remove_special_chars_from_text')
        return Text_Util._map_strings(remove_special_chars_from_text, text)

    @staticmethod
    def extra_white_space_remover(text):
        """Apply ``remove_multiple_spaces`` to every element of ``text``.

        Raises:
            TypeError: if ``text`` is not an ``np.ndarray``.
        """
        Text_Util._require_ndarray(
            text, 'entered an invalid dtype for the extra_white_space_remover')
        return Text_Util._map_strings(remove_multiple_spaces, text)

    

class Text_Preprossesor:
    """Clean raw documents and convert them to bag-of-words count features.

    Call ``fit`` on the corpus, ``get_features`` for the corpus matrix and
    ``vectorize_text`` for documents that were not part of the fit.
    """

    def __init__(self):
        # Cleaned documents from the most recent fit().
        self.content = []
        # Vocabulary terms of the fitted vectorizer, one per feature column.
        self.label_names = []
        # Ordered cleaning callables applied by process_text().
        self.pipe_line = []
        # Created by count_vectorizer_initialize().
        self.count_vectorizer = None

    @staticmethod
    def _as_document_array(text):
        """Coerce a str, ndarray or Series into an array of documents.

        Raises:
            TypeError: for any other input type.
        """
        if isinstance(text, str):
            return np.array([text])
        if isinstance(text, np.ndarray):
            return text
        if isinstance(text, pd.Series):
            return text.values
        # TypeError subclasses Exception, so callers that caught the generic
        # Exception raised previously are unaffected.
        raise TypeError('entered an invalid dtype for the text processing')

    def collect_utils(self):
        """Make sure the default cleaning steps are in ``pipe_line``, once each."""
        default_steps = (
            Text_Util.lower_text_case,
            Text_Util.special_chars_remover,
            Text_Util.extra_white_space_remover,
        )
        for step in default_steps:
            # Skipping steps that are already registered keeps repeated fit()
            # calls from running every cleaning pass multiple times.
            if step not in self.pipe_line:
                self.pipe_line.append(step)

    def process_text(self, features):
        """Run ``features`` through every step of ``pipe_line`` in order."""
        for func in self.pipe_line:
            features = func(features)
        return features

    def count_vectorizer_initialize(self, word_count = 10000):
        """Create a fresh ``CountVectorizer`` limited to ``word_count`` features."""
        self.count_vectorizer = CountVectorizer(max_features=word_count)

    def vectorize_contet_fit(self):
        """Fit the vectorizer on ``self.content`` and record its feature names."""
        self.count_vectorizer.fit(self.content)
        self.label_names = self.count_vectorizer.get_feature_names_out()

    def fit(self, text, word_count = 10000):
        """Clean ``text`` and learn a vocabulary of at most ``word_count`` terms.

        Args:
            text: a single string, an ``np.ndarray`` or a ``pd.Series``.
            word_count: passed to ``CountVectorizer`` as ``max_features``.

        Raises:
            TypeError: if ``text`` is none of the accepted types.
        """
        self.content = self._as_document_array(text)

        self.collect_utils()
        self.content = self.process_text(self.content)
        self.count_vectorizer_initialize(word_count)
        self.vectorize_contet_fit()

    def get_features(self):
        """Return the count matrix of the documents passed to ``fit``."""
        return self.count_vectorizer.transform(self.content)

    def get_label_names(self):
        """Return the feature names recorded by the last fit."""
        return self.label_names

    def vectorize_text(self, text):
        """Return count features for ``text`` using the fitted vocabulary.

        The documents go through the same cleaning pipeline as in ``fit``;
        otherwise unseen text would be tokenised differently from the corpus
        the vocabulary was learned on.

        Raises:
            TypeError: if ``text`` is not a str, ``np.ndarray`` or ``pd.Series``.
        """
        processed_text = self.process_text(self._as_document_array(text))
        return self.count_vectorizer.transform(processed_text)

    

In [7]:
# Learn the vocabulary from the first column of df (the text) and build the
# feature matrix; the second column is the target.
# NOTE(review): the vocabulary is fitted on every row before the train/test
# split in the next cell, so held-out documents influence which words become
# features. Consider fitting on the training rows only.
text_processor = Text_Preprossesor()
raw_text = df.iloc[:, 0]
text_processor.fit(raw_text)
X, y = text_processor.get_features(), df.iloc[:, 1]

In [8]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [35]:
# Logistic regression on the word counts, with the iteration limit set to 1500.
clf = LogisticRegression(max_iter=1500)
# Kept as the cell's last expression so the fitted estimator is displayed.
clf.fit(X=X_train, y=y_train)

In [36]:
# Score the model on the held-out split.
predictions = clf.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
cls_report = classification_report(y_test, predictions)
# Scale to percent first and round afterwards: round(accuracy, 2) * 100 can
# print binary floating-point artefacts such as 56.99999999999999%.
print(f"Accuracy: {round(accuracy * 100):.1f}%")
print(cls_report)

Accuracy: 86.0%
              precision    recall  f1-score   support

           0       0.89      0.95      0.92     68191
           1       0.69      0.46      0.55     15322

    accuracy                           0.86     83513
   macro avg       0.79      0.71      0.74     83513
weighted avg       0.85      0.86      0.85     83513

