<a href="https://colab.research.google.com/github/ruitenbeek/thesis/blob/main/2step_lr.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab helper module used to attach Google Drive to this runtime.
from google.colab import drive
# Mount the Drive at /content/gdrive; the directory change in the next cell
# points below this mount point.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import os
# Work from the thesis code folder on the mounted Drive so that the relative
# paths used later (e.g. 'train_final_pp.csv' in the main cell) resolve there.
os.chdir('/content/gdrive/My Drive/thesis/code')
# IPython shell escape: print the working directory to confirm the change.
!pwd

/content/gdrive/My Drive/thesis/code


# Import

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from csv import DictReader
import pandas as pd
from sklearn.metrics import classification_report

# Read Files

In [None]:
def read_train_file(file):
    """Read a tab-separated training file and build both training sets.

    Every row must provide the columns ``text``, ``abusive`` and
    ``explicitness``. A row is assigned to one of three classes:

    * ABU - ``abusive`` is IMPLICIT or EXPLICIT
    * OFF - ``abusive`` is NOT or UNKNOWN and ``explicitness`` is IMPLICIT
      or EXPLICIT
    * NOT - ``abusive`` is NOT or UNKNOWN and ``explicitness`` is NOT

    Rows that match none of these combinations are skipped. The number of
    rows per class is printed.

    Args:
        file: Path of the TSV file; the first line is the header.

    Returns:
        A tuple ``(data1_df, data2_df)`` of DataFrames with the columns
        ``text`` and ``label``:

        * ``data1_df`` holds every kept row with ABU collapsed into OFF, so
          its labels are NOT / OFF.
        * ``data2_df`` holds only the ABU and OFF rows with their own label;
          NOT rows are left out of it.

        Both frames keep their two columns even when no row was kept.

    Raises:
        KeyError: If a required column is missing from the file.
    """
    not_abusive = ('NOT', 'UNKNOWN')
    marked = ('IMPLICIT', 'EXPLICIT')
    columns = ['text', 'label']

    data1 = []
    data2 = []
    abu_count = 0
    off_count = 0
    not_count = 0

    # newline='' is the documented way to open a file for the csv module.
    with open(file, 'r', newline='') as f:
        for row in DictReader(f, delimiter='\t'):
            text = row['text']
            abusive = row['abusive']
            explicitness = row['explicitness']

            if abusive in not_abusive and explicitness in marked:
                data1.append([text, 'OFF'])
                data2.append([text, 'OFF'])
                off_count += 1
            elif abusive in marked:
                # Abusive tweets count as OFF for the first set and keep
                # their own ABU label in the second set.
                data1.append([text, 'OFF'])
                data2.append([text, 'ABU'])
                abu_count += 1
            elif abusive in not_abusive and explicitness == 'NOT':
                # NOT rows only go into the first set.
                data1.append([text, 'NOT'])
                not_count += 1

    print(f'ABU: {abu_count}\nOFF: {off_count}\nNOT: {not_count}')

    # Passing the column names to the constructor (instead of assigning
    # .columns afterwards) also works when a list is empty.
    data1_df = pd.DataFrame(data1, columns=columns)
    data2_df = pd.DataFrame(data2, columns=columns)
    return data1_df, data2_df


In [None]:
def read_test_file(file):
    """Read a tab-separated dev/test file and build both evaluation sets.

    Every row must provide the columns ``text``, ``abusive`` and
    ``explicitness``. A row is assigned to one of three classes:

    * ABU - ``abusive`` is IMPLICIT or EXPLICIT
    * OFF - ``abusive`` is NOT or UNKNOWN and ``explicitness`` is IMPLICIT
      or EXPLICIT
    * NOT - ``abusive`` is NOT or UNKNOWN and ``explicitness`` is NOT

    Rows that match none of these combinations are skipped. The number of
    rows per class is printed.

    Args:
        file: Path of the TSV file; the first line is the header.

    Returns:
        A tuple ``(data1_df, data2_df)`` of DataFrames with the columns
        ``text`` and ``label``. Both contain the same rows in the same
        order:

        * ``data1_df`` has ABU collapsed into OFF (labels NOT / OFF).
        * ``data2_df`` keeps all three labels (ABU / NOT / OFF).

        Both frames keep their two columns even when no row was kept.

    Raises:
        KeyError: If a required column is missing from the file.
    """
    not_abusive = ('NOT', 'UNKNOWN')
    marked = ('IMPLICIT', 'EXPLICIT')
    columns = ['text', 'label']

    data1 = []
    data2 = []
    abu_count = 0
    off_count = 0
    not_count = 0

    # newline='' is the documented way to open a file for the csv module.
    with open(file, 'r', newline='') as f:
        for row in DictReader(f, delimiter='\t'):
            text = row['text']
            abusive = row['abusive']
            explicitness = row['explicitness']

            if abusive in not_abusive and explicitness in marked:
                data1.append([text, 'OFF'])
                data2.append([text, 'OFF'])
                off_count += 1
            elif abusive in marked:
                # Abusive tweets count as OFF for the first set and keep
                # their own ABU label in the second set.
                data1.append([text, 'OFF'])
                data2.append([text, 'ABU'])
                abu_count += 1
            elif abusive in not_abusive and explicitness == 'NOT':
                data1.append([text, 'NOT'])
                data2.append([text, 'NOT'])
                not_count += 1

    print(f'ABU: {abu_count}\nOFF: {off_count}\nNOT: {not_count}')

    # Passing the column names to the constructor (instead of assigning
    # .columns afterwards) also works when a list is empty.
    data1_df = pd.DataFrame(data1, columns=columns)
    data2_df = pd.DataFrame(data2, columns=columns)
    return data1_df, data2_df

# Split Labels

In [None]:
def split_labels(data_df):
    """Separate a labelled frame into parallel lists of texts and labels.

    Args:
        data_df: DataFrame with the columns ``text`` and ``label``.

    Returns:
        A tuple ``(texts, labels)`` of plain Python lists, in row order.
    """
    texts = data_df['text'].tolist()
    labels = data_df['label'].tolist()
    return texts, labels

# Model

In [None]:
def lr_model1(train_X, train_y):
    """Fit the logistic-regression classifier used as ``model1``.

    The estimator is built with ``random_state=0``, ``C=20`` and no
    intercept term, then trained on the given data.

    Args:
        train_X: Feature matrix of the training samples.
        train_y: Labels aligned with the rows of ``train_X``.

    Returns:
        The fitted ``LogisticRegression`` instance.
    """
    settings = dict(random_state=0, C=20, fit_intercept=False)
    classifier = LogisticRegression(**settings)
    # fit() returns the estimator itself, so it can be returned directly.
    return classifier.fit(train_X, train_y)

In [None]:
def lr_model2(train_X, train_y):
    """Fit the logistic-regression classifier used as ``model2``.

    The estimator is built with ``random_state=1``, ``C=30``, at most 200
    solver iterations and no intercept term, then trained on the given data.

    Args:
        train_X: Feature matrix of the training samples.
        train_y: Labels aligned with the rows of ``train_X``.

    Returns:
        The fitted ``LogisticRegression`` instance.
    """
    settings = dict(random_state=1, C=30, max_iter=200, fit_intercept=False)
    classifier = LogisticRegression(**settings)
    # fit() returns the estimator itself, so it can be returned directly.
    return classifier.fit(train_X, train_y)

# Evaluation

In [None]:
def evaluation(model1, model2, test_X1, test_y1, test_y2):
    """Print classification reports for the two-step pipeline.

    Step 1: ``model1`` labels every sample as NOT or OFF and a report against
    ``test_y1`` is printed.

    Step 2: every sample that ``model1`` labelled OFF is re-classified by
    ``model2``; the combined predictions are then reported against
    ``test_y2`` using the labels ABU / NOT / OFF.

    Args:
        model1: Fitted classifier whose ``predict`` returns NOT / OFF.
        model2: Fitted classifier applied to the samples predicted as OFF.
        test_X1: Feature matrix of the evaluation samples; it must support
            row selection with a list of indices (e.g. a SciPy CSR matrix
            or a NumPy array).
        test_y1: Gold labels for step 1 (NOT / OFF).
        test_y2: Gold labels for the combined prediction (ABU / NOT / OFF).

    Returns:
        None. Both reports are written to standard output.
    """
    step1_labels = ['NOT', 'OFF']
    step2_labels = ['ABU', 'NOT', 'OFF']

    pred_y1 = model1.predict(test_X1)
    # Passing `labels` ties each target name to its class explicitly, so the
    # report does not fail when a class happens to be absent from the data.
    print(classification_report(test_y1, pred_y1, labels=step1_labels,
                                target_names=step1_labels, digits=2))

    # Work on a copy so the step-1 predictions are not overwritten.
    pred_y2 = pred_y1.copy()
    off_rows = [i for i, label in enumerate(pred_y1) if label == 'OFF']
    # Re-classify all OFF rows with a single predict call instead of one call
    # per row; skip the call entirely when nothing was predicted as OFF.
    if off_rows:
        pred_y2[off_rows] = model2.predict(test_X1[off_rows])

    print(classification_report(test_y2, pred_y2, labels=step2_labels,
                                target_names=step2_labels, digits=2))

# Main

In [None]:
# Load the three splits; each reader prints the class counts of its file.
print('###TRAIN Split###')
train_data1, train_data2 = read_train_file('train_final_pp.csv')
print('\n###DEV Split###')
dev_data1, dev_data2 = read_test_file('dev_final_pp.csv')
print('\n###TEST Split###')
test_data1, test_data2 = read_test_file('test_final_pp.csv')

# Split every frame into parallel text / label lists.
train_X1, train_y1 = split_labels(train_data1)
train_X2, train_y2 = split_labels(train_data2)
dev_X1, dev_y1 = split_labels(dev_data1)
dev_X2, dev_y2 = split_labels(dev_data2)
test_X1, test_y1 = split_labels(test_data1)
test_X2, test_y2 = split_labels(test_data2)

# Fit the TF-IDF vocabulary and IDF weights on the training tweets only.
# Fitting on train + dev + test would leak information from the evaluation
# splits into the features and make the reported scores optimistic.
vectorizer = TfidfVectorizer()
vectorizer.fit(train_X1)
train_vectors1 = vectorizer.transform(train_X1)
dev_vectors1 = vectorizer.transform(dev_X1)
test_vectors1 = vectorizer.transform(test_X1)
train_vectors2 = vectorizer.transform(train_X2)

# Train step 1 on the first training set and step 2 on the second one.
model1 = lr_model1(train_vectors1, train_y1)
model2 = lr_model2(train_vectors2, train_y2)

print('\n####DEV RESULTS####')
evaluation(model1, model2, dev_vectors1, dev_y1, dev_y2)
print('\n####TEST RESULTS####')
evaluation(model1, model2, test_vectors1, test_y1, test_y2)

###TRAIN Split###
ABU: 1143
OFF: 1445
NOT: 5176

###DEV Split###
ABU: 110
OFF: 76
NOT: 361

###TEST Split###
ABU: 637
OFF: 399
NOT: 2072

####DEV RESULTS####
              precision    recall  f1-score   support

         NOT      0.857     0.848     0.852       361
         OFF      0.711     0.726     0.718       186

    accuracy                          0.806       547
   macro avg      0.784     0.787     0.785       547
weighted avg      0.807     0.806     0.807       547

              precision    recall  f1-score   support

         ABU      0.621     0.536     0.576       110
         NOT      0.857     0.848     0.852       361
         OFF      0.400     0.500     0.444        76

    accuracy                          0.737       547
   macro avg      0.626     0.628     0.624       547
weighted avg      0.746     0.737     0.740       547


####TEST RESULTS####
              precision    recall  f1-score   support

         NOT      0.821     0.888     0.853      2072
   