In [99]:
import numpy as np
import pandas as pd
from ast import literal_eval
import re
import nltk
import matplotlib.pyplot as plt
import random

path = 'data/'

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline

In [2]:
trial = pd.read_csv(path + 'tsd_trial.csv')
train = pd.read_csv(path + 'tsd_train.csv')

train['spans'] = train.spans.apply(literal_eval)
trial['spans'] = trial.spans.apply(literal_eval)

In [13]:
def getword(arr, text):
    ans = ''
    for i in range(len(arr)):
        elem = arr[i]
        if  i != 0 and i != len(arr) - 1 and elem != arr[i-1] + 1:
            ans += ' '
            ans += text[elem]
        else:
            ans += text[elem]
    return ans



In [33]:
def getneutral(arr, text):
    ans = ''
    arr = set(arr)
    for i in range(len(text)):
        if i in arr:
            continue
        else:
            elem = text[i]
            ans += elem
    return ans

In [110]:
def clean(string):
    string = string.lower()
    new_string = ''
    for elem in string:
        if elem.isalpha() or elem == ' ':
            new_string += elem
    return new_string

In [121]:
def generate_ind(clean_str):
    words = clean_str.split()
    vocab = {}
    for i in range(len(words)):
        if i == 0:
            vocab[words[i]] = 0
        else:
            vocab[words[i]] = vocab[words[i - 1]] + len(words[i]) + 1
    return vocab

In [144]:
def generate_span(original, word2coef):
    clean_str = clean(original)
    word2ind = generate_ind(clean_str)
    span = []
    for word in clean_str.split():
        coef = word2coef.get(word, '')
        if coef == '':
            continue
        elif coef > 1:
            start = original.lower().find(word)
            end = start + len(word) 
            curr_span = list(range(start, end))
            span += curr_span
    return span
        
    

In [14]:
train.head()

Unnamed: 0,spans,text
0,"[8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,...",Another violent and aggressive immigrant killi...
1,"[33, 34, 35, 36, 37, 38, 39]","I am 56 years old, I am not your fucking junio..."
2,"[0, 1, 2, 3]","Damn, a whole family. Sad indeed."
3,"[7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]",What a knucklehead. How can anyone not know th...
4,"[32, 33, 34, 35, 36, 37, 38]","""who do you think should do the killing?""\n\nA..."


In [25]:
train_toxic_words = []
for span, text in zip(train['spans'], train['text']):
    train_toxic_words.append(getword(span, text))
toxic_sentences = list([sentence.lower() for sentence in train_toxic_words if len(sentence) > 0])

In [34]:
train_neutral_words = []
for span, text in zip(train['spans'], train['text']):
    train_neutral_words.append(getneutral(span, text))
neutral_sentences = list([sentence.lower() for sentence in train_neutral_words])

In [36]:
print(neutral_sentences)



In [97]:
trial_sents = list(trial['text'])

In [83]:
with open(path + 'nontoxic_train.txt','w') as f:
    f.write('$'.join(neutral_sentences))

In [84]:
with open(path + 'toxic_train.txt', 'w') as f:
    f.write('$'.join(toxic_sentences))

In [85]:
toxic_train = open(path + 'toxic.txt').read().split('$')
neutral_train = open(path + 'nontoxic.txt').read().split('$')

In [89]:
pipe = make_pipeline(CountVectorizer(), LogisticRegression())
X_train = toxic_train + neutral_train
y_train = [1] * len(toxic_train) + [0] * len(neutral_train)
pipe.fit(X_train, y_train);



In [107]:
print(toxic_train)



In [90]:
coefs = pipe[1].coef_[0]

In [94]:
pipe[1].coef_[0]
coefs.shape

(19160,)

In [95]:
word2coef = {w : coefs[idx] for w, idx in pipe[0].vocabulary_.items()}

In [96]:
import pickle
with open('word2coef.pkl', 'wb') as f:
    pickle.dump(word2coef, f)

In [134]:
text = random.choice(trial_sents)
clean_text = clean(text)
print(text)

I don't believe you can justify your two statements. The first is plainly untrue, even idiotic. The second is also untrue to the point of idiocy.


In [135]:
for w in clean_text.split():
    print(w, '\t', word2coef.get(w, ''))

i 	 
dont 	 -0.21271989791403623
believe 	 -0.15154447514853855
you 	 -0.7516426671199784
can 	 -0.7055349061720811
justify 	 0.34428039269204797
your 	 -0.3563100402461694
two 	 -0.3127389663604293
statements 	 -0.2079485875511563
the 	 -0.3614103771307838
first 	 -0.5532274865016996
is 	 -1.0201200746438854
plainly 	 -0.0009406133511407777
untrue 	 -0.12154456368862239
even 	 -0.21160448451235087
idiotic 	 3.084645301122781
the 	 -0.3614103771307838
second 	 0.2734278448811442
is 	 -1.0201200746438854
also 	 -0.15704896488102546
untrue 	 -0.12154456368862239
to 	 -0.49969265148888153
the 	 -0.3614103771307838
point 	 0.6157340613579962
of 	 -0.3890168195813529
idiocy 	 1.7576365177663211


In [146]:
trial_spans = []
for sentence in trial.text:
    span = generate_span(sentence, word2coef)
    trial_spans.append(span)

In [147]:
print(trial_spans)

[[15, 16, 17, 18, 19], [29, 30, 31, 32, 33, 34], [50, 51, 52, 53, 54, 155, 156, 157, 158, 159, 160, 166, 167, 168, 169, 170, 171], [19, 20, 21, 22, 23, 24, 19, 20, 21, 22, 23, 24, 87, 88, 89, 90, 91, 92], [188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199], [8, 9, 10, 11, 12], [171, 172, 173, 174, 175, 176, 178, 179, 180, 181, 182, 183, 184, 186, 187, 188, 189, 190, 191, 192, 193, 171, 172, 173, 174, 175, 176, 178, 179, 180, 181, 182, 183, 184, 186, 187, 188, 189, 190, 191, 192, 193, 471, 472, 473, 474, 479, 480, 481, 482, 479, 480, 481, 482], [73, 74, 75, 76, 77], [38, 39, 40, 41, 42, 43], [288, 289, 290, 291, 292, 293, 294, 295], [26, 27, 28, 29, 30], [], [], [84, 85, 86, 87, 199, 200, 201, 202, 203, 258, 259, 260, 261, 262, 263, 521, 522, 523, 524, 525], [23, 24, 25, 26], [10, 11, 12, 13, 14, 15, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 250, 251, 252, 253, 254, 255], [51, 52, 53, 54, 55, 56, 57, 58, 59], [78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90], [31, 32

In [165]:
for sentence in trial.text.iloc[:5]:
    print(sentence)

Because he's a moron and a bigot. It's not any more complicated than that.
How about we stop protecting idiots and let nature add some bleach to the gene pool. We can always submit their names for the Darwin awards.
If people  were  smart, they would  Boycott this  inept  airline,  but   they  are  not  smart,  so   rogue  businesses  like  this  one,   still thrive   taking the idiots  for  a ride...
Trump Claimed that Russia will never invade the Ukraine, when Russia already has - how stupid can people be?
As long as your willing to pay a lot more for products you buy, then fine.
But you better not be going to Costco and Walmart to buy stuff because it's cheaper.
If so, we get to call you a hypocritical wanker.


In [166]:
final_spans = pd.Series(trial_spans)

In [167]:
import csv

Object `pd.to_csv()` not found.


In [177]:
fsp = [str(span) for span in final_spans]

In [180]:
pdfsp = pd.Series(fsp)

In [190]:
with open('results/baseline_spans.txt', 'w') as f:
    for elem in fsp:
        f.write(elem + '\n')