In [1]:
import numpy as np
import pandas as pd
import spacy
import nltk
import json
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,LSTM,Embedding,InputLayer,Concatenate
from tensorflow.keras import Model
from sklearn import metrics
import keras


nlp = spacy.load("en_core_web_sm")

In [2]:
def PreprocessData(df):
  df['clean_text'] = df['_body'].str.lower()
  df['clean_text'] = df['clean_text'].str.replace('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ')
  df['clean_text'] = df['clean_text'].str.strip()
  df['clean_text'] = df['clean_text'].str.replace('[^\w\s]','')
  return df

def ConcatenatePost(df):
  df['clean_text_copy'] = df.groupby(['id'])['clean_text'].transform(lambda x : ' '.join(x))
  df['clean_text'] = df.groupby(['id'])['clean_text'].transform(lambda x : '||'.join(x))
  df['_body'] = df.groupby(['id'])['_body'].transform(lambda x : '|'.join(x))
  df['label'] = df.groupby(['id'])['label'].transform('max')
  df = df.drop_duplicates(inplace=False)
  return df

def FeatureExtract(df):

  offensive_words = ["ass","idiot","moron","stupid","bitch","shit","fuck","dumb","fool","pussy"]
  advmod_exist_list = [] #intensifiers eg: absolutely, very, extremely, seriously etc
  prp_exist_list = [] #personal pronouns eg: I, me, you etc
  acomp_exist_list = [] #adjectivial complement eg: unreal, unsatisfactory, unwilling
  relcl_exist_list = [] #eg: hurts, torutres, celebrates
  abuse_exist_list = [] #bad and offensive words
  for text in df['clean_text_copy']:
    doc = nlp(text)
    advmod_exist = 0
    prp_exist = 0
    acomp_exist = 0
    relcl_exist = 0
    abuse_exist = 0
    for token in doc:
      if token.dep_ == 'advmod':
        advmod_exist = advmod_exist + 1
      
      if token.tag_ == 'PRP':
        prp_exist = prp_exist + 1
      
      if token.pos_ == 'ADJ' and token.dep_ == 'acomp':
        acomp_exist = acomp_exist + 1
      
      if token.text in offensive_words:
        abuse_exist = abuse_exist + 1
      
      if token.dep_ == 'relcl':
        relcl_exist = relcl_exist + 1
    
    advmod_exist_list.append(advmod_exist)
    prp_exist_list.append(prp_exist)
    acomp_exist_list.append(acomp_exist)
    abuse_exist_list.append(abuse_exist)
    relcl_exist_list.append(relcl_exist)
  
  df['intensifier'] = advmod_exist_list
  df['prp'] = prp_exist_list
  df['acomp'] = acomp_exist_list
  df['abuse'] = abuse_exist_list
  df['relcl'] = relcl_exist_list
  return df

In [3]:
train_data_file = '/content/drive/MyDrive/train-data-prepared.json'
val_data_file = '/content/drive/MyDrive/val-data-prepared.json'

with open(train_data_file, 'r') as f:
  json_data_train = json.load(f)

with open(val_data_file, 'r') as f:
  json_data_val = json.load(f)


init_data_train = pd.json_normalize(json_data_train, record_path='preceding_posts',meta= ['id','label'],max_level=1, record_prefix='_')
init_data_val = pd.json_normalize(json_data_val, record_path='preceding_posts',meta= ['id','label'],max_level=1, record_prefix='_')

In [4]:
# from google.colab import drive
# drive.mount('/content/drive')

In [5]:
data_train = init_data_train[['id','_body','label']]
data_val = init_data_val[['id','_body','label']]
display(data_train)
display(data_val)

Unnamed: 0,id,_body,label
0,t1_dggp3q9,"...because it's illegal in our reality, vs. th...",1
1,t1_dggp3q9,i live in a nation were it is completely legal...,1
2,t1_dk3zd9h,Because making prostitution legal makes it ver...,0
3,t1_dk3zd9h,"I'd be interested in reading up on this, do yo...",0
4,t1_d86bsqs,Why are you linking Wikipedia and not direct t...,1
...,...,...,...
3867,t1_ch7503g,"Wow thanks for the help.\n\nOne question, can ...",0
3868,t1_denmvjy,Okay buddy.,1
3869,t1_denmvjy,"Shrug it off all you want, it's a simple fact....",1
3870,t1_crtmi2e,"It is a thing, I'm not sure if it's universal ...",0


Unnamed: 0,id,_body,label
0,t1_dipwvtv,>At this point it seems clear that we have ver...,1
1,t1_dipwvtv,>This is a disgusting attitude that glorifies ...,1
2,t1_dctegi4,The black community and LGBTQ rights is a grea...,0
3,t1_dctegi4,Thank you for taking the time to share your pe...,0
4,t1_d4vri90,So what?,1
...,...,...,...
511,t1_cpcigu7,But sexual harassment is also not a bathroom p...,0
512,t1_cnu1fi5,We think the government is this thing imposing...,1
513,t1_cnu1fi5,>We think the government is this thing imposin...,1
514,t1_dhdub9z,"Can you elaborate, what are ""gun free zones"" i...",0


In [6]:
df_train = PreprocessData(data_train)
df_val = PreprocessData(data_val)
df_train = ConcatenatePost(df_train)
df_val = ConcatenatePost(df_val)
# delete later
# display(df_val)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_

In [7]:
max_vocab = 2500
tokenizer = Tokenizer(num_words = max_vocab,split = ' ', oov_token=1 )

In [8]:
tokenizer.fit_on_texts(df_train['clean_text_copy'].values)
x_train_neural =  tokenizer.texts_to_sequences(df_train['clean_text_copy'].values)
x_val_neural = tokenizer.texts_to_sequences(df_val['clean_text_copy'])

max_length = max(len(s.split()) for s in df_train['clean_text_copy'].values)
x_train_neural = pad_sequences(x_train_neural,maxlen=max_length)
x_val_neural = pad_sequences(x_val_neural,maxlen=max_length)

y_train = df_train['label'].values
y_val = df_val['label'].values

print(x_train_neural.shape)
# print(x_train_neural)

# print(x_val_neural.shape)
# print(x_val_neural)

(1936, 3033)


In [9]:
# embed_dim = 32
# lstm_out = 50

# model = Sequential()
# model.add(Embedding(max_vocab,embed_dim,input_length = x_train_neural.shape[1]))
# # model.add(SpatialDropout1D(0.4))
# model.add(LSTM(lstm_out,dropout = 0.2,recurrent_dropout = 0.2))
# model.add(Dense(1,activation = 'sigmoid'))
# model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])

# # print(x_train_neural.shape)
# # print(y_train.shape)

In [10]:
# print(model.summary())

In [11]:
# model.fit(x_train_neural, y_train, epochs = 7, batch_size = 64, verbose = 2)

In [12]:
# likelihoods = model.predict(x_val_neural)
# predictions_final = np.where(likelihoods < 0.5, 0, 1)

In [13]:
# print(metrics.f1_score(y_val,predictions_final))

In [14]:
# text = "this is so stupid you so are off all you want, it's a simple fact. Assault rifles are select fire and the theoretical operation of a filed down sear is not"
# text = 'Seriously, youve fucking named the virtuous movement to end oppression after women, and the evil system that celebrates people after men, and you honestly expect me to believe that you arent inherently biased when thinking about this?'
# text = 'unlike grabbing a woman that hurts by her pussy'
# doc = nlp(text)

# for token in doc:
#   print(token.text, token.pos_, token.tag_, token.dep_)


In [15]:
df_train = FeatureExtract(df_train)
df_val = FeatureExtract(df_val)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

In [16]:
x_train_text_features = df_train[["intensifier","prp","acomp","abuse","relcl"]].values
x_val_text_features = df_val[["intensifier","prp","acomp","abuse","relcl"]].values

In [17]:
print(x_train_text_features.shape)

(1936, 5)


In [18]:
embed_dim = 32
lstm_out = 50

model1 = Sequential()
model1.add(Embedding(max_vocab,embed_dim,input_length = x_train_neural.shape[1]))
# model.add(SpatialDropout1D(0.4))
model1.add(LSTM(lstm_out,dropout = 0.2,recurrent_dropout = 0.2))
# model1.add(Dense(1,activation = 'sigmoid'))

model2 = Sequential()
model2.add(Dense(3,activation = 'relu',input_shape = (5,)))
# model2.add(Dropout(0.2))

merged_model = Concatenate()([model1.output,model2.output])
z = Dense(10,activation = 'relu')(merged_model)
z = Dense(1,activation = 'sigmoid')(z)

model = Model(inputs=[model1.input, model2.input], outputs=z)
# optimizer = keras.optimizers.Adam(learning_rate=0.005)
model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])

# print(x_train_neural.shape)
# print(y_train.shape)

In [19]:
print(model.summary())

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
embedding_input (InputLayer)    [(None, 3033)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 3033, 32)     80000       embedding_input[0][0]            
__________________________________________________________________________________________________
dense_input (InputLayer)        [(None, 5)]          0                                            
__________________________________________________________________________________________________
lstm (LSTM)                     (None, 50)           16600       embedding[0][0]                  
______________________________________________________________________________________________

In [20]:
model.fit([x_train_neural,x_train_text_features], y_train, epochs = 5, batch_size = 32, verbose = 2, shuffle=True)

Epoch 1/5
61/61 - 259s - loss: 0.7827 - accuracy: 0.5165
Epoch 2/5
61/61 - 254s - loss: 0.6966 - accuracy: 0.5046
Epoch 3/5
61/61 - 255s - loss: 0.6893 - accuracy: 0.5305
Epoch 4/5
61/61 - 254s - loss: 0.6606 - accuracy: 0.6167
Epoch 5/5
61/61 - 258s - loss: 0.5318 - accuracy: 0.7495


<tensorflow.python.keras.callbacks.History at 0x7f9484f8a510>

In [24]:
likelihoods = model.predict([x_val_neural,x_val_text_features])
predictions_final = np.where(likelihoods < 0.5, 0, 1)

In [25]:
print(metrics.f1_score(y_val,predictions_final))
print(predictions_final.shape)

0.7207792207792206
(258, 1)


In [23]:
val_data_id = df_val['id'].values
predictions_final = predictions_final.flatten().tolist()
pred_val = dict(zip(val_data_id, predictions_final))

with open('/content/drive/MyDrive/prediction_out.json', 'w') as fp:
    json.dump(pred_val,fp)