In [1]:
from AbusiveLanguageDetection.CheckForAbusiveWords import *
from SwearingDetection.CheckForSwearWords import *
from TabooWordDetection.CheckForTabooWords import *

# Global paths and parameters.
# Lexicon files handed to the read_* helpers by DetectSexualHarassment.
swear_words_file = "SwearingDetection/common/swear_words.txt"
swear_words_pos_file = "SwearingDetection/common/swear_words_pos.txt"
taboo_words_file = "TabooWordDetection/common/taboo_words.json"

# Decision thresholds read by DetectSexualHarassment.check_sh_label:
# a text is flagged only when its summed lexical score reaches
# LEXICAL_THRESHOLD and its summed type score reaches TYPE_SCORE.
LEXICAL_THRESHOLD = 30
TYPE_SCORE = 3


class DetectSexualHarassment():
    """Lexicon-based detector combining taboo, swearing and abusive-word scores.

    Each ``get_*_score`` method returns an indexable pair whose element 0 is a
    lexical score and element 1 is a type score. The pairs are summed and the
    totals are compared against ``LEXICAL_THRESHOLD`` and ``TYPE_SCORE``.
    """

    def __init__(self):
        # Retained only for backward compatibility with code that reads
        # ``instance.model``. The methods below dispatch through ``self`` so
        # that subclass overrides are honoured.
        self.model = DetectSexualHarassment

    def get_swearing_score(self, text):
        """Return the swearing score pair for ``text``.

        Loads the swear-word lists named by ``swear_words_pos_file`` and
        ``swear_words_file``, runs ``check_swearing`` and converts its two
        results with ``get_swear_scores``.
        """
        # NOTE(review): the lists are loaded again on every call; presumably
        # this hits the disk each time -- consider caching if that is slow.
        swear_words_pos = read_txt_list(swear_words_pos_file)
        swear_words = read_txt_list(swear_words_file)
        is_swearing_text, is_swearing_pos = check_swearing(text, swear_words_pos, swear_words)
        return get_swear_scores(is_swearing_text, is_swearing_pos)

    def get_taboo_score(self, text):
        """Return the taboo-word score pair for ``text``.

        Loads ``taboo_words_file``, derives the word lists through
        ``create_df`` and ``get_possible_nlp``, runs ``check_taboowords`` and
        converts its two results with ``get_taboo_scores``.
        """
        taboo_words = read_json_file(taboo_words_file)
        df = create_df(taboo_words)
        taboo_words_pos, taboo_words = get_possible_nlp(df)
        is_taboo_text, is_taboo_pos = check_taboowords(text, taboo_words_pos, taboo_words)
        return get_taboo_scores(is_taboo_text, is_taboo_pos)

    def get_abusive_score(self, text):
        """Return the abusive-language score pair for ``text``."""
        # NOTE(review): the abuse lexicon is read from ``swear_words_file``,
        # the same list used for swearing -- confirm this is intended.
        abuse_words = read_txt_list(swear_words_file)
        abusive_text = fetch_abusive_text(text, abuse_words)
        is_abusive_text = is_abusive(text, abusive_text)
        return get_abusive_scores(is_abusive_text)

    def collect_all_scores(self, text):
        """Sum the taboo, swearing and abusive score pairs for one string.

        Returns:
            tuple: ``(lexical_score, type_score)`` totals over the three scorers.
        """
        scorers = (self.get_taboo_score, self.get_swearing_score, self.get_abusive_score)
        all_scores = [scorer(text) for scorer in scorers]
        lexical_score, type_score = 0, 0
        for scores in all_scores:
            lexical_score = lexical_score + scores[0]
            type_score = type_score + scores[1]
        return lexical_score, type_score

    def check_sh_label(self, TL, TT, lexical_threshold=None, type_threshold=None):
        """Decide the label from the total lexical (``TL``) and type (``TT``) scores.

        Args:
            TL: total lexical score.
            TT: total type score.
            lexical_threshold: minimum ``TL`` for a positive label; defaults to
                the module-level ``LEXICAL_THRESHOLD``.
            type_threshold: minimum ``TT`` for a positive label; defaults to
                the module-level ``TYPE_SCORE``.

        Returns:
            bool: True only when both totals reach their thresholds.
        """
        if lexical_threshold is None:
            lexical_threshold = LEXICAL_THRESHOLD
        if type_threshold is None:
            type_threshold = TYPE_SCORE
        # Every other combination is negative, so no further branches are needed.
        return bool(TL >= lexical_threshold and TT >= type_threshold)

    def detect_sexual_harassment(self, text):
        """Classify a string, or a list/tuple of strings scored together.

        For a sequence, the scores of all items are summed before the
        thresholds are applied. Any other input type yields False.

        Returns:
            bool: True when the text is flagged.
        """
        if isinstance(text, str):
            TL, TT = self.collect_all_scores(text)
        elif isinstance(text, (list, tuple)):
            TL, TT = 0, 0
            for t in text:
                tl, tt = self.collect_all_scores(t)
                TL = TL + tl
                TT = TT + tt
        else:
            # Unsupported input: keep the historical "not flagged" result.
            return False
        return self.check_sh_label(TL, TT)

In [2]:
# Smoke test: one plain string, then one list of sentences scored together.
model = DetectSexualHarassment()
examples = (
    "You are such a pig",
    ["I don't give a fucking shit.", " Go to hell you motherfucker"],
)
for example in examples:
    print(model.detect_sexual_harassment(example))

False
True


In [3]:
import pandas as pd
import numpy as np

In [4]:
# Load the labelled tweets from a CSV located next to the notebook.
df = pd.read_csv('./classified_tweets.csv')


    text = tweet text
    suspicious = 1 if suspicious, 0 otherwise
    cyberbullying = 1 for racism, 2 for sexism, 0 for neither
    hate = 1 for hate text, 0 otherwise
    suicidal = 1 for text with suicidal intent, 0 otherwise


In [5]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,text,suspicious,cyberbullying,hate,suicidal
0,Uhmm like 6th grade on a corner of a street....,0,0,0,0
1,a) JTP is a douchebag b) Stewart kicks ass!,1,0,0,0
2,ditto bitch!,1,0,0,0
3,damn I have to drive my dad to the airport tha...,0,0,0,0
4,:],0,0,0,0


In [7]:
# Rows whose cyberbullying code is 0 or 2. The result is only displayed,
# not assigned, so `df` itself stays unfiltered.
df[df['cyberbullying'].isin([0, 2])]

Unnamed: 0,text,suspicious,cyberbullying,hate,suicidal
0,Uhmm like 6th grade on a corner of a street....,0,0,0,0
1,a) JTP is a douchebag b) Stewart kicks ass!,1,0,0,0
2,ditto bitch!,1,0,0,0
3,damn I have to drive my dad to the airport tha...,0,0,0,0
4,:],0,0,0,0
...,...,...,...,...,...
19929,Happy birthday @paul_lander!,0,0,0,0
19930,RT @MilradJalen: @raesanni I agree it’s awful ...,1,0,0,0
19931,I was called in yesterday at the Guidance Coun...,1,0,0,0
19932,@Jeicky_the_cat @skwi69 Thanks,0,0,0,0


In [9]:
# Keep only the tweet text and the cyberbullying label. The explicit copy
# detaches the result from the original frame, so later column assignments
# do not write into a slice (avoids SettingWithCopyWarning).
df = df[['text', 'cyberbullying']].copy()

In [10]:
# Preview the frame after column selection.
df.head()

Unnamed: 0,text,cyberbullying
0,Uhmm like 6th grade on a corner of a street....,0
1,a) JTP is a douchebag b) Stewart kicks ass!,0
2,ditto bitch!,0
3,damn I have to drive my dad to the airport tha...,0
4,:],0


In [None]:
# Binarise the label: any cyberbullying code above 0 becomes True, and the
# column is exposed as `target`. The comparison is vectorised, and a new frame
# is built instead of assigning into a slice and renaming columns by position.
# The guard makes the cell a no-op when re-run after the conversion.
# NOTE(review): every non-zero code maps to True, not only code 2 -- confirm
# that this is the intended target for this detector.
if 'cyberbullying' in df.columns:
    df = df.assign(target=df['cyberbullying'] > 0)[['text', 'target']]

In [13]:
# Class balance of the binary target.
df['target'].value_counts()

False    17256
True      2678
Name: target, dtype: int64

In [14]:
import nltk

In [15]:
# Punkt sentence tokenizer constructed without any training text.
ps = nltk.tokenize.PunktSentenceTokenizer()

In [18]:
# Predict per tweet after sentence splitting: the detector receives the list
# of sentences and sums their scores before applying the thresholds.
pred = [
    model.detect_sexual_harassment(ps.tokenize(tweet))
    for tweet in df['text'].values
]

In [19]:
from sklearn.metrics import classification_report,confusion_matrix

In [20]:
# Per-class precision, recall and F1 of `pred` against the binary target.
print(classification_report(df['target'].values,pred))

              precision    recall  f1-score   support

       False       0.86      0.71      0.78     17256
        True       0.12      0.26      0.17      2678

    accuracy                           0.65     19934
   macro avg       0.49      0.48      0.47     19934
weighted avg       0.76      0.65      0.69     19934



In [21]:
# Confusion matrix of the binary target versus `pred`.
print(confusion_matrix(df['target'].values,pred))

[[12166  5090]
 [ 1974   704]]


## Without sentence tokenization (each tweet scored as a single string)

In [25]:
# Predict per tweet on the raw string, with no sentence splitting.
pred = [
    model.detect_sexual_harassment(tweet)
    for tweet in df['text'].values
]

In [26]:
# Per-class precision, recall and F1 of `pred` against the binary target.
print(classification_report(df['target'].values,pred))

              precision    recall  f1-score   support

       False       0.87      1.00      0.93     17256
        True       0.00      0.00      0.00      2678

    accuracy                           0.87     19934
   macro avg       0.43      0.50      0.46     19934
weighted avg       0.75      0.87      0.80     19934



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [27]:
# Confusion matrix of the binary target versus `pred`.
print(confusion_matrix(df['target'].values,pred))

[[17256     0]
 [ 2678     0]]


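## Model performance is poor: with sentence tokenization the positive class reaches only 0.12 precision and 0.26 recall, and without it the model predicts no positives at all.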