In [36]:
import numpy as np
import pandas as pd
import spacy
import nltk
import json
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,LSTM,Embedding
from sklearn import metrics


nlp = spacy.load("en_core_web_sm")

In [21]:
train_data_file = '/content/drive/MyDrive/train-data-prepared.json'
val_data_file = '/content/drive/MyDrive/val-data-prepared.json'

with open(train_data_file, 'r') as f:
  json_data_train = json.load(f)

with open(val_data_file, 'r') as f:
  json_data_val = json.load(f)


init_data_train = pd.json_normalize(json_data_train, record_path='preceding_posts',meta= ['id','label'],max_level=1, record_prefix='_')
init_data_val = pd.json_normalize(json_data_val, record_path='preceding_posts',meta= ['id','label'],max_level=1, record_prefix='_')

In [4]:
# from google.colab import drive
# drive.mount('/content/drive')

In [22]:
data_train = init_data_train[['id','_body','label']]
data_val = init_data_val[['id','_body','label']]
display(data_train)
display(data_val)

Unnamed: 0,id,_body,label
0,t1_dggp3q9,"...because it's illegal in our reality, vs. th...",1
1,t1_dggp3q9,i live in a nation were it is completely legal...,1
2,t1_dk3zd9h,Because making prostitution legal makes it ver...,0
3,t1_dk3zd9h,"I'd be interested in reading up on this, do yo...",0
4,t1_d86bsqs,Why are you linking Wikipedia and not direct t...,1
...,...,...,...
3867,t1_ch7503g,"Wow thanks for the help.\n\nOne question, can ...",0
3868,t1_denmvjy,Okay buddy.,1
3869,t1_denmvjy,"Shrug it off all you want, it's a simple fact....",1
3870,t1_crtmi2e,"It is a thing, I'm not sure if it's universal ...",0


Unnamed: 0,id,_body,label
0,t1_dipwvtv,>At this point it seems clear that we have ver...,1
1,t1_dipwvtv,>This is a disgusting attitude that glorifies ...,1
2,t1_dctegi4,The black community and LGBTQ rights is a grea...,0
3,t1_dctegi4,Thank you for taking the time to share your pe...,0
4,t1_d4vri90,So what?,1
...,...,...,...
511,t1_cpcigu7,But sexual harassment is also not a bathroom p...,0
512,t1_cnu1fi5,We think the government is this thing imposing...,1
513,t1_cnu1fi5,>We think the government is this thing imposin...,1
514,t1_dhdub9z,"Can you elaborate, what are ""gun free zones"" i...",0


In [23]:
def PreprocessData(df):
  df['clean_text'] = df['_body'].str.lower()
  df['clean_text'] = df['clean_text'].str.replace('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ')
  df['clean_text'] = df['clean_text'].str.strip()
  df['clean_text'] = df['clean_text'].str.replace('[^\w\s]','')
  return df

In [24]:
df_train = PreprocessData(data_train)
df_val = PreprocessData(data_val)
print(df_train.head(6))

           id  ...                                         clean_text
0  t1_dggp3q9  ...  because its illegal in our reality vs the prop...
1  t1_dggp3q9  ...  i live in a nation were it is completely legal...
2  t1_dk3zd9h  ...  because making prostitution legal makes it ver...
3  t1_dk3zd9h  ...  id be interested in reading up on this do you ...
4  t1_d86bsqs  ...  why are you linking wikipedia and not direct t...
5  t1_d86bsqs  ...  i put the wiki because i figured you may want ...

[6 rows x 4 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_

In [19]:
def ConcatenatePost(df):
  df['clean_text_copy'] = df.groupby(['id'])['clean_text'].transform(lambda x : ' '.join(x))
  df['clean_text'] = df.groupby(['id'])['clean_text'].transform(lambda x : '||'.join(x))
  df['_body'] = df.groupby(['id'])['_body'].transform(lambda x : '|'.join(x))
  df['label'] = df.groupby(['id'])['label'].transform('max')
  df = df.drop_duplicates(inplace=False)
  return df


In [25]:
df_train = ConcatenatePost(df_train)
df_val = ConcatenatePost(df_val)
# delete later
display(df_val)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_

Unnamed: 0,id,_body,label,clean_text,clean_text_copy
0,t1_dipwvtv,>At this point it seems clear that we have ver...,1,at this point it seems clear that we have very...,at this point it seems clear that we have very...
2,t1_dctegi4,The black community and LGBTQ rights is a grea...,0,the black community and lgbtq rights is a grea...,the black community and lgbtq rights is a grea...
4,t1_d4vri90,So what? |What do you imagine being a 'protect...,1,so what||what do you imagine being a protected...,so what what do you imagine being a protected ...
6,t1_d2v90lz,The US does not want free migration. In fact w...,0,the us does not want free migration in fact we...,the us does not want free migration in fact we...
8,t1_dd1k4g6,You didn't talk about the crowd size because y...,1,you didnt talk about the crowd size because yo...,you didnt talk about the crowd size because yo...
...,...,...,...,...,...
506,t1_cgmqm3y,"I hear the ""equality of opportunity"" rhetoric ...",0,i hear the equality of opportunity rhetoric qu...,i hear the equality of opportunity rhetoric qu...
508,t1_cvoj5re,">So obviously, the realistic solution to solve...",1,so obviously the realistic solution to solve t...,so obviously the realistic solution to solve t...
510,t1_cpcigu7,"Again, sexual harassment is not a ""US"" problem...",0,again sexual harassment is not a us problem th...,again sexual harassment is not a us problem th...
512,t1_cnu1fi5,We think the government is this thing imposing...,1,we think the government is this thing imposing...,we think the government is this thing imposing...


In [26]:
print(df_val['clean_text_copy'][0])

at this point it seems clear that we have very different definitions of bigotry but i wont get into that because semantics are rarely a fruitful exercise

you mean you wont get into it because then you would have to admit that you are defending the right of bigots to engage in prejudicial treatment of others and that is morally indefensible

its not that i think life should be fair to the bigot its that i will resist any attempt to introduce government coercion in the free market people should be allowed to run businesses as they please and if those businesses are run by shitty people you can take your money elsewhere if a business with horrible practices is able to thrive that is more indicative of a problem with society not just the business owner

this is a disgusting attitude that glorifies those with power over those who have none  this is the heart of fascism power to the strong fuck the weak

i sincerely hope that you are made the victim of prejudice and bigotry so that you can 

In [31]:
max_vocab = 2500
tokenizer = Tokenizer(num_words = max_vocab,split = ' ', oov_token=1 )

In [33]:
tokenizer.fit_on_texts(df_train['clean_text_copy'].values)
x_train_neural =  tokenizer.texts_to_sequences(df_train['clean_text_copy'].values)
x_val_neural = tokenizer.texts_to_sequences(df_val['clean_text_copy'])

max_length = max(len(s.split()) for s in df_train['clean_text_copy'].values)
x_train_neural = pad_sequences(x_train_neural,maxlen=max_length)
x_val_neural = pad_sequences(x_val_neural,maxlen=max_length)

y_train = df_train['label'].values
y_val = df_val['label'].values

In [34]:
print(x_train_neural.shape)
print(x_train_neural)

print(x_val_neural.shape)
print(x_val_neural)

(1936, 3033)
[[   0    0    0 ...   72    2  119]
 [   0    0    0 ...    1   18    1]
 [   0    0    0 ...    2 2075   50]
 ...
 [   0    0    0 ...   12   43    6]
 [   0    0    0 ...    1    9   13]
 [   0    0    0 ...   27  153    1]]
(258, 3033)
[[   0    0    0 ...    8   14  175]
 [   0    0    0 ... 1572   11  710]
 [   0    0    0 ...    5  274  624]
 ...
 [   0    0    0 ...    2 1461  155]
 [   0    0    0 ...    5    1   12]
 [   0    0    0 ... 1415    1  188]]


In [13]:
embed_dim = 32
lstm_out = 50

model = Sequential()
model.add(Embedding(max_vocab,embed_dim,input_length = x_train_neural.shape[1]))
# model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out,dropout = 0.2,recurrent_dropout = 0.2))
model.add(Dense(1,activation = 'sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])



In [14]:
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 3033, 32)          80000     
_________________________________________________________________
lstm (LSTM)                  (None, 50)                16600     
_________________________________________________________________
dense (Dense)                (None, 1)                 51        
Total params: 96,651
Trainable params: 96,651
Non-trainable params: 0
_________________________________________________________________
None


In [15]:
print(x_train_neural.shape)
print(y_train.shape)

(1936, 3033)
(1936,)


In [16]:
model.fit(x_train_neural, y_train, epochs = 7, batch_size = 64, verbose = 2)

Epoch 1/7
31/31 - 340s - loss: 0.6926 - accuracy: 0.5165
Epoch 2/7
31/31 - 333s - loss: 0.6554 - accuracy: 0.6544
Epoch 3/7
31/31 - 333s - loss: 0.5228 - accuracy: 0.7624
Epoch 4/7
31/31 - 328s - loss: 0.4126 - accuracy: 0.8337
Epoch 5/7
31/31 - 326s - loss: 0.3126 - accuracy: 0.8786
Epoch 6/7
31/31 - 329s - loss: 0.2724 - accuracy: 0.8951
Epoch 7/7
31/31 - 330s - loss: 0.2242 - accuracy: 0.9220


<tensorflow.python.keras.callbacks.History at 0x7f186e88d5d0>

In [30]:
max_length = max(len(s.split()) for s in df_train['clean_text_copy'].values)
print(max_length)

3033


In [42]:
predictions = model.predict(x_val_neural)

In [46]:
print(metrics.f1_score(y_val,predictions_final))

0.7499999999999999


In [47]:
# print(predictions)

In [44]:
predictions_final = np.where(predictions < 0.5, 0, 1)

In [48]:
# print(predictions_final)

In [49]:
display(df_train)

Unnamed: 0,id,_body,label,clean_text,clean_text_copy
0,t1_dggp3q9,"...because it's illegal in our reality, vs. th...",1,because its illegal in our reality vs the prop...,because its illegal in our reality vs the prop...
2,t1_dk3zd9h,Because making prostitution legal makes it ver...,0,because making prostitution legal makes it ver...,because making prostitution legal makes it ver...
4,t1_d86bsqs,Why are you linking Wikipedia and not direct t...,1,why are you linking wikipedia and not direct t...,why are you linking wikipedia and not direct t...
6,t1_cpzy2ya,">Real property has a ""logical absolute necessi...",0,real property has a logical absolute necessity...,real property has a logical absolute necessity...
8,t1_d92nfmh,>Are you really defending multinationals not b...,1,are you really defending multinationals not be...,are you really defending multinationals not be...
...,...,...,...,...,...
3862,t1_cpet2nu,"You are totally right, I completely flubbed it...",0,you are totally right i completely flubbed it ...,you are totally right i completely flubbed it ...
3864,t1_ck91k4x,"LOL ""ethical butcher"" is like ""clean coal"" and...",1,lol ethical butcher is like clean coal and a j...,lol ethical butcher is like clean coal and a j...
3866,t1_ch7503g,>I'm been stuck with this debate with myself f...,0,im been stuck with this debate with myself for...,im been stuck with this debate with myself for...
3868,t1_denmvjy,"Okay buddy. |Shrug it off all you want, it's a...",1,okay buddy||shrug it off all you want its a si...,okay buddy shrug it off all you want its a sim...


In [54]:
text = "Screw you off all you want, it's a simple fact. Assault rifles are select fire and the theoretical operation of a filed down sear is not"
doc = nlp(text)

for token in doc:
  print(token.text, token.pos_, token.tag_, token.dep)

Screw VERB VB 399
you PRON PRP 416
off ADP RP 444
all DET DT 416
you PRON PRP 429
want VERB VBP 447
, PUNCT , 445
it PRON PRP 429
's AUX VBZ 8206900633647566924
a DET DT 415
simple ADJ JJ 402
fact NOUN NN 404
. PUNCT . 445
Assault PROPN NNP 7037928807040764755
rifles NOUN NNS 429
are AUX VBP 8206900633647566924
select ADJ JJ 402
fire NOUN NN 404
and CCONJ CC 407
the DET DT 415
theoretical ADJ JJ 402
operation NOUN NN 410
of ADP IN 443
a DET DT 415
filed VERB VBN 402
down ADP RP 444
sear NOUN NN 439
is AUX VBZ 410
not PART RB 425
