In [None]:
# Paths to the input data files and the prediction output file on the mounted drive.
# All three are plain strings consumed by the `open(...)` calls further down.
train_data_file = '/content/drive/MyDrive/train-data-prepared.json'
val_data_file = '/content/drive/MyDrive/val-data-prepared.json' # point this at the test file when predicting on test data
pred_out_file = '/content/drive/MyDrive/prediction_out.json'

In [None]:
# Import all dependencies
import numpy as np
import pandas as pd
import spacy
import nltk
import json
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,LSTM,Embedding,InputLayer,Concatenate
from tensorflow.keras import Model
from sklearn import metrics
import keras

# spaCy English pipeline, loaded once at module level and shared by FeatureExtract,
# which reads token.dep_, token.tag_ and token.pos_ from the parsed documents.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Function to Preprocess the text data: Lower Casing, Remove URLs, punctuations and stripping extra spaces
def PreprocessData(df):
  """Return a copy of ``df`` with a normalised ``clean_text`` column.

  The ``_body`` column is lower-cased, URLs are replaced by a single space,
  leading/trailing whitespace is stripped and every character that is neither
  a word character nor whitespace is removed.

  Args:
    df: DataFrame with a string column ``_body``.

  Returns:
    A new DataFrame (the input is left untouched) with the extra column
    ``clean_text``.
  """
  # Raw strings keep the backslashes intact without invalid-escape warnings.
  url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
  punctuation_pattern = r'[^\w\s]'

  # Work on a copy: the caller usually passes a column slice of a bigger frame,
  # and writing into that slice triggers SettingWithCopyWarning.
  df = df.copy()

  clean_text = df['_body'].str.lower()
  # regex=True is required: since pandas 2.0 str.replace treats the pattern as a
  # literal string by default, which would silently leave URLs/punctuation in place.
  clean_text = clean_text.str.replace(url_pattern, ' ', regex=True) #remove URL
  clean_text = clean_text.str.strip()
  clean_text = clean_text.str.replace(punctuation_pattern, '', regex=True)

  df['clean_text'] = clean_text
  return df

# Function to concatenate posts of same ID
def ConcatenatePost(df):
  """Collapse all posts that share an ``id`` into a single row.

  Args:
    df: DataFrame with columns ``id``, ``_body``, ``clean_text`` and ``label``
      (one row per post).

  Returns:
    A new DataFrame with one row per ``id`` (first occurrence order and the
    original index labels are kept). Per id:
      * ``clean_text_copy`` - cleaned posts joined with a single space,
      * ``clean_text``      - cleaned posts joined with ``'||'``,
      * ``_body``           - raw posts joined with ``'|'``,
      * ``label``           - maximum label of the group.
    The input frame is not modified.
  """
  posts_by_id = df.groupby(['id'])

  # All aggregates are computed from the untouched input before anything is
  # assigned, so `clean_text_copy` and `clean_text` both derive from the
  # original per-post text. `assign` returns a new frame, leaving `df` intact.
  combined = df.assign(
    clean_text_copy=posts_by_id['clean_text'].transform(lambda x : ' '.join(x)),
    clean_text=posts_by_id['clean_text'].transform(lambda x : '||'.join(x)),
    _body=posts_by_id['_body'].transform(lambda x : '|'.join(x)),
    label=posts_by_id['label'].transform('max'),
  )

  # De-duplicate on the key itself so unrelated per-row columns can never
  # leave more than one row per id.
  return combined.drop_duplicates(subset=['id'], inplace=False)

# Function to extract lexical features from text
def FeatureExtract(df, nlp_model=None):
  """Add five per-row lexical count features computed from ``clean_text_copy``.

  Args:
    df: DataFrame with a string column ``clean_text_copy``.
    nlp_model: Optional pipeline object exposing ``pipe(texts)`` that yields
      iterables of tokens with ``text``, ``dep_``, ``tag_`` and ``pos_``
      attributes. Defaults to the module-level ``nlp`` pipeline.

  Returns:
    A new DataFrame (the input is left untouched) with the extra integer
    columns:
      * ``intensifier`` - tokens whose dependency label is ``advmod``
        (adverbial modifiers, used as a proxy for intensifiers such as
        "very", "extremely"),
      * ``prp``         - tokens tagged ``PRP`` (personal pronouns),
      * ``acomp``       - adjectives with dependency label ``acomp``
        (adjectival complements),
      * ``abuse``       - tokens found in the offensive word list,
      * ``relcl``       - tokens with dependency label ``relcl``
        (relative clause modifiers).
  """
  if nlp_model is None:
    nlp_model = nlp

  # A frozenset gives constant-time membership tests inside the token loop.
  offensive_words = frozenset(["ass","idiot","moron","stupid","bitch","shit","fuck","dumb","fool","pussy"])

  # pipe() lets the pipeline batch the texts instead of parsing one call per row.
  counts = [
    _count_lexical_features(doc, offensive_words)
    for doc in nlp_model.pipe(df['clean_text_copy'])
  ]

  # Copy before adding columns so the caller's frame is not modified (this is
  # also what silences SettingWithCopyWarning on de-duplicated frames).
  df = df.copy()
  feature_columns = ('intensifier', 'prp', 'acomp', 'abuse', 'relcl')
  for position, column in enumerate(feature_columns):
    df[column] = [row[position] for row in counts]
  return df


def _count_lexical_features(doc, offensive_words):
  """Return (advmod, prp, acomp, abuse, relcl) token counts for one parsed document."""
  advmod_count = 0
  prp_count = 0
  acomp_count = 0
  abuse_count = 0
  relcl_count = 0
  for token in doc:
    # The checks are independent: one token may contribute to several counters.
    if token.dep_ == 'advmod':
      advmod_count += 1
    if token.tag_ == 'PRP':
      prp_count += 1
    if token.pos_ == 'ADJ' and token.dep_ == 'acomp':
      acomp_count += 1
    if token.text in offensive_words:
      abuse_count += 1
    if token.dep_ == 'relcl':
      relcl_count += 1
  return advmod_count, prp_count, acomp_count, abuse_count, relcl_count

In [None]:
# Read the train and validation dataset.
# The encoding is stated explicitly so the result does not depend on the
# platform's default locale.
with open(train_data_file, 'r', encoding='utf-8') as f:
  json_data_train = json.load(f)

with open(val_data_file, 'r', encoding='utf-8') as f:
  json_data_val = json.load(f)

# Normalizing the nested structure: one row per entry of `preceding_posts`
# (its fields prefixed with '_'), carrying the parent record's `id` and `label`.
init_data_train = pd.json_normalize(json_data_train, record_path='preceding_posts',meta= ['id','label'],max_level=1, record_prefix='_')
init_data_val = pd.json_normalize(json_data_val, record_path='preceding_posts',meta= ['id','label'],max_level=1, record_prefix='_')

# Selecting data of interest. `.copy()` makes these independent frames rather
# than slices of the normalised frames, so adding columns later does not raise
# SettingWithCopyWarning.
data_train = init_data_train[['id','_body','label']].copy()
data_val = init_data_val[['id','_body','label']].copy()

# Function call to preprocess data
df_train = PreprocessData(data_train)
df_val = PreprocessData(data_val)

# Function call to concatenate posts of same ID
df_train = ConcatenatePost(df_train)
df_val = ConcatenatePost(df_val)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[r

In [None]:
# Add the lexical count columns to both splits.
df_train = FeatureExtract(df_train)
df_val = FeatureExtract(df_val)

# Pull the five lexical columns out as plain NumPy matrices (rows = ids,
# columns = features); these feed the dense branch of the network.
x_train_text_features = df_train.loc[:, ["intensifier","prp","acomp","abuse","relcl"]].to_numpy()
x_val_text_features = df_val.loc[:, ["intensifier","prp","acomp","abuse","relcl"]].to_numpy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

In [None]:
# Tokenizer object to generate integer values for a vocabulary of words.
# This is a prerequisite for generating embeddings using keras.

max_vocab = 2500
# The OOV marker is a string (as the Tokenizer API expects); it still receives
# index 1 in `word_index`, exactly like before, but no longer risks a key clash
# with the word "1" when the tokenizer is serialised to JSON.
tokenizer = Tokenizer(num_words = max_vocab,split = ' ', oov_token='<OOV>' )

# Fit the tokenizer on the training text only and transform tokens to integer sequences
tokenizer.fit_on_texts(df_train['clean_text_copy'].values)
x_train_neural =  tokenizer.texts_to_sequences(df_train['clean_text_copy'].values)
x_val_neural = tokenizer.texts_to_sequences(df_val['clean_text_copy'].values)

# Pad the sequences to the length of the longest *tokenized* training sequence.
# The Tokenizer applies its own character filters (e.g. '_' becomes a separator),
# so a whitespace split of the raw text can under-count tokens and make
# pad_sequences silently truncate the longest posts.
max_length = max(len(sequence) for sequence in x_train_neural)
x_train_neural = pad_sequences(x_train_neural,maxlen=max_length)
x_val_neural = pad_sequences(x_val_neural,maxlen=max_length)

y_train = df_train['label'].values
y_val = df_val['label'].values

In [None]:
# Neural network for classification: two input branches merged into one head.

embed_dim = 32
lstm_out = 50

# Branch 1 (text): embedding lookup followed by an LSTM over the padded
# integer token sequence.
model1 = Sequential([
  Embedding(max_vocab, embed_dim, input_length=x_train_neural.shape[1]),
  LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2),
])

# Branch 2 (lexical features): one dense hidden layer over the 5 count features.
model2 = Sequential([
  Dense(10, activation='relu', input_shape=(5,)),
])

# Concatenate both branch outputs, then one hidden layer and a sigmoid output
# unit for the binary label.
merged_model = Concatenate()([model1.output, model2.output])
z = Dense(10, activation='relu')(merged_model)
z = Dense(1, activation='sigmoid')(z)

model = Model(inputs=[model1.input, model2.input], outputs=z)
model.compile(
  loss='binary_crossentropy',
  optimizer='adam',
  metrics=['accuracy'],
)

In [None]:
# summary() prints the table itself and returns None, so wrapping it in
# print() would append a stray "None" line to the output.
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
embedding_input (InputLayer)    [(None, 3033)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 3033, 32)     80000       embedding_input[0][0]            
__________________________________________________________________________________________________
dense_input (InputLayer)        [(None, 5)]          0                                            
__________________________________________________________________________________________________
lstm (LSTM)                     (None, 50)           16600       embedding[0][0]                  
______________________________________________________________________________________________

In [None]:
# Train the two-input network: the token matrix feeds the embedding/LSTM branch,
# the lexical feature matrix feeds the dense branch.
model.fit(
  x=[x_train_neural, x_train_text_features],
  y=y_train,
  batch_size=64,
  epochs=7,
  shuffle=True,
  verbose=2,
)

Epoch 1/7
31/31 - 136s - loss: 0.8847 - accuracy: 0.4747
Epoch 2/7
31/31 - 132s - loss: 0.7100 - accuracy: 0.5114
Epoch 3/7
31/31 - 131s - loss: 0.6727 - accuracy: 0.5992
Epoch 4/7
31/31 - 131s - loss: 0.5749 - accuracy: 0.7252
Epoch 5/7
31/31 - 131s - loss: 0.4360 - accuracy: 0.8073
Epoch 6/7
31/31 - 133s - loss: 0.3251 - accuracy: 0.8709
Epoch 7/7
31/31 - 134s - loss: 0.2588 - accuracy: 0.8972


<tensorflow.python.keras.callbacks.History at 0x7f111d277550>

In [None]:
# Predict the likelihoods on the validation inputs, then threshold them:
# anything strictly below 0.5 becomes class 0, everything else class 1.

likelihoods = model.predict([x_val_neural, x_val_text_features])
below_threshold = likelihoods < 0.5
predictions = (~below_threshold).astype(int)

In [None]:
# Map every validation ID to its predicted class and write the mapping as JSON.

val_data_id = df_val['id'].values
predictions_final = predictions.ravel().tolist()
pred_val = {
    post_id: predicted_label
    for post_id, predicted_label in zip(val_data_id, predictions_final)
}

with open(pred_out_file, 'w') as fp:
    json.dump(pred_val, fp)