<a href="https://colab.research.google.com/github/ruitenbeek/thesis/blob/main/analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive inside the Colab runtime at /content/gdrive.
# NOTE(review): this cell only works on Google Colab (google.colab is not
# available elsewhere).
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
import os
# Switch the working directory to the project folder on the mounted drive;
# the data-loading cell further down opens its files by bare file name, so
# they are resolved relative to this directory.
os.chdir('/content/gdrive/My Drive/thesis/code')
# Echo the current directory as a visual check that the chdir succeeded.
!pwd

/content/gdrive/My Drive/thesis/code


In [None]:
import nltk
# Fetch the NLTK resources used later in the notebook: 'punkt' (presumably
# backing word_tokenize) and 'wordnet' (presumably backing WordNetLemmatizer).
nltk.download('punkt')
# Last expression of the cell: its return value is what the cell displays.
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
from csv import DictReader
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [None]:
def read_file(file):
    """Read a tab-separated annotation file and assign each row a 3-way label.

    The file must have a header row containing at least the columns
    ``text``, ``abusive`` and ``explicitness``. Rows are labelled as follows:

    * ``ABU`` - ``abusive`` is ``IMPLICIT`` or ``EXPLICIT``.
    * ``OFF`` - ``abusive`` is ``NOT``/``UNKNOWN`` and ``explicitness`` is
      ``IMPLICIT`` or ``EXPLICIT``.
    * ``NOT`` - ``abusive`` is ``NOT``/``UNKNOWN`` and ``explicitness`` is
      ``NOT``.

    Rows that match none of these rules are skipped without being counted.
    The per-label counts are printed as a side effect.

    Args:
        file: Path of the TSV file to read.

    Returns:
        pandas.DataFrame with the columns ``text`` and ``label``, one row per
        labelled input row, in file order. The frame is empty (but still has
        both columns) when no row could be labelled.
    """
    data = []
    counts = {'ABU': 0, 'OFF': 0, 'NOT': 0}
    # newline='' is what the csv module expects from its input file, and an
    # explicit encoding keeps the result independent of the platform locale.
    with open(file, 'r', encoding='utf-8', newline='') as f:
        for row in DictReader(f, delimiter='\t'):
            abusive = row['abusive']
            if abusive in ('IMPLICIT', 'EXPLICIT'):
                label = 'ABU'
            elif abusive in ('NOT', 'UNKNOWN'):
                # 'explicitness' is only consulted for non-abusive rows.
                explicitness = row['explicitness']
                if explicitness in ('IMPLICIT', 'EXPLICIT'):
                    label = 'OFF'
                elif explicitness == 'NOT':
                    label = 'NOT'
                else:
                    continue  # unrecognised explicitness value: skip the row
            else:
                continue  # unrecognised abusive value: skip the row
            data.append([row['text'], label])
            counts[label] += 1
    print('ABU: %i\nOFF: %i\nNOT: %i' % (counts['ABU'], counts['OFF'], counts['NOT']))
    # Passing the column names to the constructor (instead of assigning
    # .columns afterwards) also works when no row was labelled.
    return pd.DataFrame(data, columns=['text', 'label'])

In [None]:
def read_lex(file):
    """Collect the distinct entries of the ``lemma`` column of a TSV lexicon.

    Args:
        file: Path of a tab-separated file whose header row contains a
            ``lemma`` column.

    Returns:
        A set with every value found in the ``lemma`` column.
    """
    with open(file, 'r') as handle:
        return {entry['lemma'] for entry in DictReader(handle, delimiter='\t')}

In [None]:
def _count_lex_tweets(texts, lex, lemmatizer):
    """Count the texts containing at least one token whose lemma is in ``lex``."""
    return sum(
        # any() stops at the first hit, so each text is counted at most once.
        any(lemmatizer.lemmatize(token) in lex for token in word_tokenize(text))
        for text in texts
    )


def find_lexwords(df, lex):
    """Print, per label, how many texts contain at least one lexicon word.

    Every text is tokenized with ``word_tokenize``; each token is lemmatized
    with ``WordNetLemmatizer`` exactly as it appears (no case folding is done
    here) and looked up in ``lex``. A text counts once, however many of its
    tokens match.

    Rows are selected per label explicitly rather than by unpacking
    ``df.groupby('label')``, so a label that is absent from ``df`` simply
    yields a count of 0 instead of breaking the unpacking, and rows carrying
    any other label are ignored.

    Args:
        df: DataFrame with a ``text`` column and a ``label`` column holding
            ``'ABU'``, ``'OFF'`` or ``'NOT'``.
        lex: Container of lemmas supporting ``in`` (a set gives fast lookups).

    Returns:
        None. The three counts are printed.
    """
    lemmatizer = WordNetLemmatizer()
    counts = {
        label: _count_lex_tweets(df.loc[df['label'] == label, 'text'], lex, lemmatizer)
        for label in ('ABU', 'OFF', 'NOT')
    }
    print('Lex word Tweets ABU: %i\nLex word Tweets OFF: %i\nLex word Tweets NOT: %i\n' % (counts['ABU'], counts['OFF'], counts['NOT']))

In [None]:
# Load the three data splits (each call prints its label distribution) and
# the lexicon, then report the lexicon coverage for every split.
train_data, dev_data, test_data = [
    read_file(split_path)
    for split_path in ('train_final_pp.csv', 'dev_final_pp.csv', 'test_final_pp.csv')
]
lex = read_lex('groflex.tsv')

for heading, split_df in (('\nTrain', train_data), ('\nDev', dev_data), ('\nTest', test_data)):
    print(heading)
    find_lexwords(split_df, lex)

ABU: 1143
OFF: 1445
NOT: 5176
ABU: 110
OFF: 76
NOT: 361
ABU: 637
OFF: 399
NOT: 2072

Train
Lex word Tweets ABU: 341
Lex word Tweets OFF: 176
Lex word Tweets NOT: 68


Dev
Lex word Tweets ABU: 37
Lex word Tweets OFF: 9
Lex word Tweets NOT: 5


Test
Lex word Tweets ABU: 222
Lex word Tweets OFF: 63
Lex word Tweets NOT: 28

