<a href="https://colab.research.google.com/github/ruitenbeek/thesis/blob/main/1step_lr.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import os
os.chdir('/content/gdrive/My Drive/thesis/code')
!pwd

/content/gdrive/My Drive/thesis/code


#Import

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from csv import DictReader
import pandas as pd
from sklearn.metrics import classification_report

#Read Files

In [None]:
def read_file(file):
    data = list()
    abu_count = 0
    off_count = 0
    not_count = 0
    with open(file, 'r') as f:
        reader = DictReader(f, delimiter='\t')
        for row in reader:
            if (row['abusive'] == 'NOT' or row['abusive'] == 'UNKNOWN') and (row['explicitness'] == 'IMPLICIT' or row['explicitness'] == 'EXPLICIT'):
                data.append([row['text'], 'OFF'])
                off_count += 1
            elif row['abusive'] == 'IMPLICIT' or row['abusive'] == 'EXPLICIT':
                data.append([row['text'], 'ABU'])
                abu_count += 1
            elif (row['abusive'] == 'NOT' or row['abusive'] == 'UNKNOWN') and row['explicitness'] == 'NOT':
                data.append([row['text'], 'NOT'])
                not_count += 1
    print(f'ABU: %i\nOFF: %i\nNOT: %i' % (abu_count, off_count, not_count))
    data_df = pd.DataFrame(data)
    data_df.columns = ['text', 'label']
    return data_df


#Split Labels

In [None]:
def split_labels(data_df):
    data_X = data_df.text.tolist()
    data_y = data_df.label.tolist()
    return data_X, data_y

#Model

In [None]:
def lr_model(train_X, train_y):
    model = LogisticRegression(random_state=0, max_iter=200)
    model.fit(train_X, train_y)
    return model

#Evaluation

In [None]:
def evaluation(model, test_X, test_y):
    pred_y = model.predict(test_X)
    target_names = ['ABU', 'NOT', 'OFF']
    print(classification_report(test_y, pred_y, target_names=target_names, digits=3))

# Main

In [None]:
print('###TRAIN Split###')
train_data = read_file('train_final_pp.csv')
print('\n###DEV Split###')
dev_data = read_file('dev_final_pp.csv')
print('\n###TEST Split###')
test_data = read_file('test_final_pp.csv')
train_X, train_y = split_labels(train_data)
dev_X, dev_y = split_labels(dev_data)
test_X, test_y = split_labels(test_data)

all_tweets = train_X + dev_X + test_X 
vectorizer = TfidfVectorizer()
vectorizer.fit(all_tweets)

train_vectors = vectorizer.transform(train_X)
dev_vectors = vectorizer.transform(dev_X)
test_vectors = vectorizer.transform(test_X)

model = lr_model(train_vectors, train_y)

print('\n####DEV RESULTS####')
evaluation(model, dev_vectors, dev_y)
print('\n####TEST RESULTS####')
evaluation(model, test_vectors, test_y)

###TRAIN Split###
ABU: 1143
OFF: 1445
NOT: 5176

###DEV Split###
ABU: 110
OFF: 76
NOT: 361

###TEST Split###
ABU: 637
OFF: 399
NOT: 2072

####DEV RESULTS####
              precision    recall  f1-score   support

         ABU      0.815     0.400     0.537       110
         NOT      0.772     0.975     0.862       361
         OFF      0.595     0.289     0.389        76

    accuracy                          0.764       547
   macro avg      0.727     0.555     0.596       547
weighted avg      0.756     0.764     0.731       547


####TEST RESULTS####
              precision    recall  f1-score   support

         ABU      0.811     0.155     0.261       637
         NOT      0.741     0.984     0.845      2072
         OFF      0.270     0.158     0.199       399

    accuracy                          0.708      3108
   macro avg      0.608     0.432     0.435      3108
weighted avg      0.695     0.708     0.643      3108

