<a href="https://colab.research.google.com/github/ramapriyan912001/Sarcasm-Detection/blob/main/cnn_model_(7).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Reddit Sarcasm Detection

### Import Libraries

In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from gensim import models
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Dense, Dropout, Reshape, Flatten, concatenate, Input, Conv1D, GlobalMaxPooling1D, GlobalAveragePooling1D, AveragePooling1D, MaxPooling1D, Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from gensim.test.utils import datapath
import gensim.downloader
from gensim.models import Word2Vec
import math
from nltk.tokenize import TreebankWordTokenizer
from tensorflow.keras.optimizers import Adam, RMSprop, Nadam, SGD
import tensorflow as tf
from tensorflow.keras import layers, initializers, regularizers

In [None]:
import nltk
from nltk.tokenize import TreebankWordTokenizer, WordPunctTokenizer
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
from nltk import word_tokenize

# Fetch the NLTK resources this notebook relies on: 'punkt' for word_tokenize,
# 'stopwords' for the stop-word list, and 'wordnet' for WordNetLemmatizer
# ('omw-1.4' is additionally required by some NLTK releases when loading WordNet).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
porter_stemmer = PorterStemmer()
word_tokenizer = TreebankWordTokenizer()
word_tokenizer2 = WordPunctTokenizer()
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load the comments (with precomputed sentiment scores) and force both text
# columns to str so that missing values do not break tokenization later.
data_cnn = pd.read_csv("./with_sentiment_scores.csv")
row_limit = math.floor(len(data_cnn))  # currently keeps every row
data_cnn = data_cnn.iloc[:row_limit, :]
for text_column in ("comment", "parent_comment"):
    data_cnn[text_column] = data_cnn[text_column].astype(str)

In [None]:
## Define a function that splits training set into just sarcasm and just non-sarcasm
def split_training_dataset_into_separate_labels(training_dataset):
    """Partition a labelled frame into its sarcastic and non-sarcastic rows.

    Returns a ``(sarcasm, non_sarcasm)`` pair holding the rows whose
    ``'label'`` equals 1 and 0 respectively; rows with any other label end up
    in neither frame.
    """
    label_column = training_dataset['label']
    is_sarcasm = label_column == 1
    is_non_sarcasm = label_column == 0
    return training_dataset[is_sarcasm], training_dataset[is_non_sarcasm]

## Define function to engineer features for model such as subreddit history and author history
def feature_history(training_dataset, col):
    """Summarise how often each value of ``col`` appears with a sarcastic label.

    Parameters:
        training_dataset: DataFrame with a ``'label'`` column (1 = sarcasm,
            0 = non-sarcasm) and the column named by ``col``.
        col: name of the column to group by (e.g. ``"subreddit"``).

    Returns:
        A pair of dicts keyed by the values of ``col``, in order of first
        appearance: ``(proportion_sarcasm, total_comments)``, where the
        proportion is ``sarcastic / total`` for that value.

    Rows whose label is neither 0 nor 1 are ignored.
    """
    sarcasm_counts = {}
    non_sarcasm_counts = {}

    # Walk the two relevant columns directly: DataFrame.iterrows() builds a
    # Series for every row, which is very slow on a large training set.
    for value, raw_label in zip(training_dataset[col], training_dataset['label']):
        label = int(raw_label)
        if label not in (0, 1):
            continue
        if value not in sarcasm_counts:
            # Both counters always share the same key set.
            sarcasm_counts[value] = 0
            non_sarcasm_counts[value] = 0
        if label == 1:
            sarcasm_counts[value] += 1
        else:
            non_sarcasm_counts[value] += 1

    proportion_sarcasm_by_feature_history = {}
    total_comments_by_feature_history = {}
    for value, num_sarcasm in sarcasm_counts.items():
        total_comments = num_sarcasm + non_sarcasm_counts[value]
        # total_comments >= 1 here: a key is only created when a row is counted.
        proportion_sarcasm_by_feature_history[value] = num_sarcasm / total_comments
        total_comments_by_feature_history[value] = total_comments

    return proportion_sarcasm_by_feature_history, total_comments_by_feature_history



## Define function to prepare training dataset

def add_feature_history_to_train(train_dataset, col):
    """Attach per-``col`` sarcasm statistics to the training frame.

    Computes the statistics with ``feature_history`` on ``train_dataset``
    itself and writes them into two new columns,
    ``"sarcasm_proportion_by_<col>"`` and ``"total_num_comments_by_<col>"``.
    The frame is modified in place and also returned.
    """
    proportion_history, total_comments_history = feature_history(train_dataset, col)

    def lookup_proportion(key):
        return proportion_history[key]

    def lookup_total(key):
        return total_comments_history[key]

    train_dataset["sarcasm_proportion_by_" + col] = train_dataset[col].apply(lookup_proportion)
    train_dataset["total_num_comments_by_" + col] = train_dataset[col].apply(lookup_total)
    return train_dataset

## Define function to prepare testing dataset

def calculate_mean(table):
    """Return the arithmetic mean of the values of the mapping ``table``.

    Raises ZeroDivisionError when ``table`` is empty.
    """
    running_total = 0
    count = 0
    for value in table.values():
        running_total += value
        count += 1
    return running_total / count

def add_feature_history_to_test(test_dataset, col, proportion_history, total_comments_history):
    """Attach previously computed per-``col`` statistics to the test frame.

    Parameters:
        test_dataset: DataFrame containing the column ``col``.
        col: name of the grouping column.
        proportion_history: dict mapping a ``col`` value to its sarcasm proportion.
        total_comments_history: dict mapping a ``col`` value to its comment count.

    Values of ``col`` that are missing from a history dict receive the mean of
    that dict's values instead. The frame is modified in place and returned.
    """
    fallback_proportion = calculate_mean(proportion_history)
    fallback_total = calculate_mean(total_comments_history)

    test_dataset["sarcasm_proportion_by_" + col] = test_dataset[col].apply(
        lambda key: proportion_history.get(key, fallback_proportion))
    test_dataset["total_num_comments_by_" + col] = test_dataset[col].apply(
        lambda key: total_comments_history.get(key, fallback_total))

    return test_dataset

### Remove Punctuation

In [None]:
import re
def remove_punctuation(text):
    """Drop every character that is neither a word character nor whitespace."""
    non_word_or_space = re.compile(r'[^\w\s]')
    return non_word_or_space.sub('', text)

In [None]:
def lemmatize(tokens):
    """Return a new list with each token passed through the shared ``lemmatizer``."""
    lemmas = []
    for token in tokens:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

### Lowercasing data

In [None]:
def lower_token(tokens):
    """Return a new list containing the lower-cased form of every token."""
    lowered = []
    for token in tokens:
        lowered.append(token.lower())
    return lowered

### Removing Stop Words

In [None]:
from nltk.corpus import stopwords
stoplist = stopwords.words('english')
# Set view of the stop words: membership tests run once per token, so a
# constant-time lookup avoids scanning the whole list each time.
_stop_word_set = frozenset(stoplist)


def remove_stop_words(tokens):
    """Return the tokens that are not English stop words, in their original order."""
    return [word for word in tokens if word not in _stop_word_set]

In [None]:
def change_data(isRemovePunctuation, isLowerCase, isRemoveStopWords, data, isParent):
    """Clean, tokenize and lemmatize one text column of ``data``.

    Parameters:
        isRemovePunctuation: strip punctuation before tokenizing.
        isLowerCase: lower-case the lemmatized tokens.
        isRemoveStopWords: drop English stop words from the tokens.
        data: DataFrame holding ``"comment"`` (or ``"parent_comment"``).
        isParent: operate on the ``parent_``-prefixed columns instead.

    Adds three columns to ``data`` (each prefixed with ``parent_`` when
    ``isParent`` is true): ``comment_cleaned``, ``processed_comment`` (tokens
    re-joined with spaces) and ``tokens``. Progress markers are printed along
    the way.

    Returns:
        ``(data, tokens)`` where ``tokens`` is the lemmatized token lists
        *before* lower-casing and stop-word removal.
    """
    prefix = "parent_" if isParent else ""
    cleaned_col = prefix + "comment_cleaned"
    processed_col = prefix + "processed_comment"
    source_col = prefix + "comment"
    tokens_col = prefix + "tokens"

    if isRemovePunctuation:
        data[cleaned_col] = data[source_col].apply(remove_punctuation)
    else:
        data[cleaned_col] = data[source_col]
    print("punctuation")

    # Tokenize, then lemmatize, one sentence at a time.
    tokens = [lemmatize(word_tokenize(sentence)) for sentence in data[cleaned_col]]

    adapted_tokens = [lower_token(words) for words in tokens] if isLowerCase else tokens
    print("lowercase")

    if isRemoveStopWords:
        final_tokens = [remove_stop_words(words) for words in adapted_tokens]
    else:
        final_tokens = adapted_tokens

    data[processed_col] = [' '.join(words) for words in final_tokens]
    if not isRemoveStopWords:
        print("processed")
    data[tokens_col] = final_tokens
    print("stopwords")
    return data, tokens

# Preprocess the comment text first, then the parent-comment text:
# punctuation removed, lower-cased, stop words kept.
data_cnn, tokens = change_data(
    isRemovePunctuation=True, isLowerCase=True, isRemoveStopWords=False,
    data=data_cnn, isParent=False)
data_cnn, parent_tokens = change_data(
    isRemovePunctuation=True, isLowerCase=True, isRemoveStopWords=False,
    data=data_cnn, isParent=True)
data_cnn.head()

punctuation
lowercase
processed
stopwords
punctuation
lowercase
processed
stopwords


Unnamed: 0,c_neg,c_neu,c_pos,pc_neg,pc_neu,pc_pos,label,comment,author,subreddit,...,downs,date,created_utc,parent_comment,comment_cleaned,processed_comment,tokens,parent_comment_cleaned,parent_processed_comment,parent_tokens
0,0.125524,0.740467,0.13401,0.208965,0.730244,0.060791,0,NC and NH.,Trumpbart,politics,...,-1,2016-10,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ...",NC and NH,nc and nh,"[nc, and, nh]",Yeah I get that argument At this point Id pref...,yeah i get that argument at this point id pref...,"[yeah, i, get, that, argument, at, this, point..."
1,0.155209,0.810037,0.034754,0.75465,0.222954,0.022396,0,You do know west teams play against west teams...,Shbshb906,nba,...,-1,2016-11,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...,You do know west teams play against west teams...,you do know west team play against west team m...,"[you, do, know, west, team, play, against, wes...",The blazers and Mavericks The wests 5 and 6 se...,the blazer and mavericks the west 5 and 6 seed...,"[the, blazer, and, mavericks, the, west, 5, an..."
2,0.071656,0.868689,0.059655,0.007329,0.296819,0.695852,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,...,0,2016-09,2016-09-22 21:45:37,They're favored to win.,They were underdogs earlier today but since Gr...,they were underdog earlier today but since gro...,"[they, were, underdog, earlier, today, but, si...",Theyre favored to win,theyre favored to win,"[theyre, favored, to, win]"
3,0.92333,0.069511,0.007159,0.633449,0.327936,0.038615,0,"This meme isn't funny none of the ""new york ni...",icebrotha,BlackPeopleTwitter,...,-1,2016-10,2016-10-18 21:03:47,deadass don't kill my buzz,This meme isnt funny none of the new york nigg...,this meme isnt funny none of the new york nigg...,"[this, meme, isnt, funny, none, of, the, new, ...",deadass dont kill my buzz,deadass dont kill my buzz,"[deadass, dont, kill, my, buzz]"
4,0.065801,0.66114,0.273059,0.06733,0.776531,0.15614,0,I could use one of those tools.,cush2push,MaddenUltimateTeam,...,-1,2016-12,2016-12-30 17:00:13,Yep can confirm I saw the tool they use for th...,I could use one of those tools,i could use one of those tool,"[i, could, use, one, of, those, tool]",Yep can confirm I saw the tool they use for th...,yep can confirm i saw the tool they use for th...,"[yep, can, confirm, i, saw, the, tool, they, u..."


In [None]:
def make_processed_dataset(data, isParent):
    """Add one-hot label columns and select the columns used for modelling.

    Parameters:
        data: DataFrame with a ``'label'`` column (0 or 1) plus the processed
            text, token, subreddit and sentiment columns.
        isParent: when true, the parent-comment text and token columns are
            included in the result as well.

    Side effect: ``data`` gains the columns ``'sarcasm'`` (1 where label is 1)
    and ``'non_sarcasm'`` (1 where label is 0).

    Returns:
        A DataFrame restricted to the modelling columns, in a fixed order.

    Raises:
        ValueError: if any label is neither 0 nor 1.
    """
    labels = data['label']
    if not labels.isin([0, 1]).all():
        # Fail with a clear message instead of a length-mismatch error.
        raise ValueError("make_processed_dataset expects every label to be 0 or 1")

    # Vectorised one-hot encoding of the binary label.
    data['sarcasm'] = (labels == 1).astype('int64')
    data['non_sarcasm'] = (labels == 0).astype('int64')

    if isParent:
        text_columns = ['processed_comment', 'parent_processed_comment', 'tokens', 'parent_tokens']
    else:
        text_columns = ['processed_comment', 'tokens']
    other_columns = ['label', 'sarcasm', 'non_sarcasm', 'subreddit',
                     'c_pos', 'c_neu', 'c_neg', 'pc_pos', 'pc_neu', 'pc_neg']
    data_processed = data[text_columns + other_columns]
    return data_processed


# Keep the parent-comment columns so both text branches are available later.
data_cnn_processed = make_processed_dataset(data=data_cnn, isParent=True)

### Split into training and testing data

In [None]:
def split_train_test(data_processed):
    """Split ``data_processed`` into a 75% training and 25% test portion.

    The split uses a fixed ``random_state`` of 1000, so it is reproducible.
    Returns ``(data_train, data_test)``.
    """
    train_part, test_part = train_test_split(
        data_processed, test_size=0.25, random_state=1000)
    return train_part, test_part

# Fixed-seed 75/25 split of the processed comments.
data_cnn_train, data_cnn_test = split_train_test(data_processed=data_cnn_processed)

In [None]:
SUBREDDIT = "subreddit"

# Per-subreddit statistics come from the training split only; the test split
# reuses them (with the mean as fallback for unseen subreddits).
proportion_history_subreddit, total_comments_history_subreddit = feature_history(
    data_cnn_train, SUBREDDIT)
data_cnn_train = add_feature_history_to_train(data_cnn_train, SUBREDDIT)
data_cnn_test = add_feature_history_to_test(
    data_cnn_test,
    SUBREDDIT,
    proportion_history_subreddit,
    total_comments_history_subreddit,
)

In [None]:
print(data_cnn_train.columns.get_loc("total_num_comments_by_subreddit"))
from sklearn.preprocessing import MinMaxScaler

# Columns from position 8 onward are the numeric features (sentiment scores
# and the two subreddit statistics); everything before is text or labels.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(data_cnn_train.iloc[:, 8:])
data_cnn_train.iloc[:,8:] = scaled_values

# Apply the ranges learned from the training split. Re-fitting on the test
# split would scale the two splits differently and leak test statistics.
scaled_values = scaler.transform(data_cnn_test.iloc[:, 8:])
data_cnn_test.iloc[:,8:] = scaled_values

15


In [None]:
# Shrink every numeric feature by a further factor of 100, in both splits.
scaled_feature_columns = ['c_pos', 'c_neu', 'c_neg', 'pc_pos', 'pc_neu', 'pc_neg',
                          'sarcasm_proportion_by_subreddit',
                          'total_num_comments_by_subreddit']
for split_frame in (data_cnn_train, data_cnn_test):
    split_frame[scaled_feature_columns] = split_frame[scaled_feature_columns].apply(lambda x: x/100)

In [None]:
# Sanity check: sizes of both splits, then a peek at the training rows.
for split_frame in (data_cnn_train, data_cnn_test):
    print(split_frame.shape)
data_cnn_train.head()

(758119, 16)
(252707, 16)


Unnamed: 0,processed_comment,parent_processed_comment,tokens,parent_tokens,label,sarcasm,non_sarcasm,subreddit,c_pos,c_neu,c_neg,pc_pos,pc_neu,pc_neg,sarcasm_proportion_by_subreddit,total_num_comments_by_subreddit
987945,this is not fucking foreshadowing,re watching superbad and found a bit of foresh...,"[this, is, not, fucking, foreshadowing]","[re, watching, superbad, and, found, a, bit, o...",0,0,1,movies,0.003025,0.020896,0.986569,0.070298,0.544439,0.405272,0.483779,0.100207
591874,yeah the fact he doe it every single league on...,poe community is one of the worst i have ever ...,"[yeah, the, fact, he, doe, it, every, single, ...","[poe, community, is, one, of, the, worst, i, h...",1,1,0,pathofexile,0.063181,0.579354,0.379834,0.004294,0.043181,0.962369,0.483137,0.024081
260829,left wing inspector coming thru,another day in tier 1,"[left, wing, inspector, coming, thru]","[another, day, in, tier, 1]",0,0,1,Warthunder,0.069767,0.911826,0.048112,0.110416,0.802874,0.111909,0.530176,0.021866
98631,dear lord,they preordered tgt pack,"[dear, lord]","[they, preordered, tgt, pack]",0,0,1,hearthstone,0.178047,0.661987,0.183138,0.06546,0.90008,0.062097,0.520276,0.067631
232927,shocking that the middle of nowhere would rank...,alberta would rank 4th in global quality of li...,"[shocking, that, the, middle, of, nowhere, wou...","[alberta, would, rank, 4th, in, global, qualit...",1,1,0,canada,0.008718,0.137173,0.867134,0.422987,0.585976,0.009083,0.597545,0.06123


## Download Pretrained Word Embeddings (GloVe Twitter, 200-d)

In [None]:
# Active choice: pretrained 'glove-twitter-200' vectors fetched through
# gensim's downloader (presumably 200-dimensional, matching EMBEDDING_DIM —
# confirm against the gensim-data catalogue). The commented-out lines below
# are alternative embeddings that were tried: other pretrained sets, or
# Word2Vec models trained on this notebook's own tokens.
word2vec = gensim.downloader.load('glove-twitter-200')
##word2vec = gensim.downloader.load('glove-twitter-100')
##word2vec = gensim.downloader.load('word2vec-google-news-300')
##wiki_word2vec = gensim.downloader.load('fasttext-wiki-news-subwords-300')
##word2vec = Word2Vec(sentences=data_cnn_train["tokens"], vector_size=100, window=5, min_count=1, workers=4).wv
##word2vec = Word2Vec(sentences=data_cnn_train["tokens"], vector_size=200, window=5, min_count=1, workers=4).wv
##word2vec = Word2Vec(sentences=data_cnn_train["tokens"], vector_size=300, window=5, min_count=1, workers=4).wv
# The parent-comment branch shares the same embedding table as the comment branch.
parent_word2vec = word2vec
##parent_word2vec = Word2Vec(sentences=data_cnn_train["parent_tokens"], vector_size=300, window=5, min_count=1, workers=4).wv
##word2vec = Word2Vec(sentences=data_cnn_train["tokens"], vector_size=400, window=5, min_count=1, workers=4).wv

### Hyperparameters

In [None]:
MAX_SEQUENCE_LENGTH = 200  # length every token-id sequence is padded/truncated to
EMBEDDING_DIM = 200  # width of one row of the embedding matrices
num_epochs = 10  # NOTE(review): not passed to model_alt.fit in this notebook (it uses epochs=1)
batch_size = 34  # NOTE(review): not passed to model_alt.fit in this notebook (it uses batch_size=128)
NUM_FILTERS = 300  # Conv1D filters per kernel size, comment branch
PARENT_NUM_FILTERS = 100  # Conv1D filters per kernel size, parent-comment branch
FILTER_SIZES = [2,3,4]  # Conv1D kernel sizes, comment branch
PARENT_FILTER_SIZES = [2,3,4]  # Conv1D kernel sizes, parent-comment branch

### Vocab Assembly

In [None]:
# Collect every training-comment token and the per-comment token counts.
all_training_words = []
training_sentence_lengths = []
for train_comment_tokens in data_cnn_train["tokens"]:
    all_training_words.extend(train_comment_tokens)
    training_sentence_lengths.append(len(train_comment_tokens))
TRAINING_VOCAB = sorted(set(all_training_words))
print("%s words total, with a vocabulary size of %s" % (len(all_training_words), len(TRAINING_VOCAB)))
print("Max sentence length is %s" % max(training_sentence_lengths))

7909196 words total, with a vocabulary size of 169575
Max sentence length is 2222


In [None]:
# Same vocabulary statistics for the test comments.
all_test_words = []
test_sentence_lengths = []
for test_comment_tokens in data_cnn_test["tokens"]:
    all_test_words.extend(test_comment_tokens)
    test_sentence_lengths.append(len(test_comment_tokens))
TEST_VOCAB = sorted(set(all_test_words))
print("%s words total, with a vocabulary size of %s" % (len(all_test_words), len(TEST_VOCAB)))
print("Max sentence length is %s" % max(test_sentence_lengths))

2634403 words total, with a vocabulary size of 87612
Max sentence length is 2000


### Parent Vocab

In [None]:
# Vocabulary statistics for the parent comments of the training split.
parent_all_training_words = []
parent_training_sentence_lengths = []
for train_parent_tokens in data_cnn_train["parent_tokens"]:
    parent_all_training_words.extend(train_parent_tokens)
    parent_training_sentence_lengths.append(len(train_parent_tokens))
PARENT_TRAINING_VOCAB = sorted(set(parent_all_training_words))
print("%s words total, with a vocabulary size of %s" % (len(parent_all_training_words), len(PARENT_TRAINING_VOCAB)))
print("Max sentence length is %s" % max(parent_training_sentence_lengths))

18409070 words total, with a vocabulary size of 262366
Max sentence length is 4198


In [None]:
# Vocabulary statistics for the parent comments of the test split.
parent_all_test_words = []
parent_test_sentence_lengths = []
for test_parent_tokens in data_cnn_test["parent_tokens"]:
    parent_all_test_words.extend(test_parent_tokens)
    parent_test_sentence_lengths.append(len(test_parent_tokens))
PARENT_TEST_VOCAB = sorted(set(parent_all_test_words))
print("%s words total, with a vocabulary size of %s" % (len(parent_all_test_words), len(PARENT_TEST_VOCAB)))
print("Max sentence length is %s" % max(parent_test_sentence_lengths))

6144465 words total, with a vocabulary size of 134517
Max sentence length is 3055


### Tokenization

In [None]:
# Fit the comment tokenizer on the training text only, then turn each comment
# into a fixed-length row of token ids followed by five numeric features.
tokenizer = Tokenizer(num_words=len(TRAINING_VOCAB), lower=True, char_level=False)
train_comment_texts = data_cnn_train["processed_comment"].tolist()
tokenizer.fit_on_texts(train_comment_texts)
training_sequences = tokenizer.texts_to_sequences(train_comment_texts)
train_word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(train_word_index))

padded_train_comments = pad_sequences(training_sequences, maxlen=MAX_SEQUENCE_LENGTH).astype("float64")
# (An all-zero array of the same shape was used as an ablation of these features.)
sentiment_train = data_cnn_train[["c_pos", "c_neu", "c_neg", "sarcasm_proportion_by_subreddit", "total_num_comments_by_subreddit"]].to_numpy()
data_cnn_train_final = np.concatenate([padded_train_comments, sentiment_train], axis=1)

Found 168893 unique tokens.


In [None]:
# Encode the test comments with the tokenizer fitted on the training split
# and append the same five numeric features.
test_sequences = tokenizer.texts_to_sequences(data_cnn_test["processed_comment"].tolist())
padded_test_comments = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH).astype("float64")
# (An all-zero array of the same shape was used as an ablation of these features.)
sentiment_test = data_cnn_test[["c_pos", "c_neu", "c_neg", "sarcasm_proportion_by_subreddit", "total_num_comments_by_subreddit"]].to_numpy()
data_cnn_test_final = np.concatenate([padded_test_comments, sentiment_test], axis=1)

### Parent Tokenization

In [None]:
# Separate tokenizer for parent comments, again fitted on training text only.
parent_tokenizer = Tokenizer(num_words=len(PARENT_TRAINING_VOCAB), lower=True, char_level=False)
train_parent_texts = data_cnn_train["parent_processed_comment"].tolist()
parent_tokenizer.fit_on_texts(train_parent_texts)
parent_training_sequences = parent_tokenizer.texts_to_sequences(train_parent_texts)
parent_train_word_index = parent_tokenizer.word_index
print('Found %s unique tokens.' % len(parent_train_word_index))

padded_train_parents = pad_sequences(parent_training_sequences, maxlen=MAX_SEQUENCE_LENGTH).astype("float64")
# (An all-zero array of the same shape was used as an ablation of these features.)
parent_sentiment_train = data_cnn_train[["pc_pos", "pc_neu", "pc_neg", "sarcasm_proportion_by_subreddit", "total_num_comments_by_subreddit"]].to_numpy()
parent_data_cnn_train_final = np.concatenate([padded_train_parents, parent_sentiment_train], axis=1)

Found 261376 unique tokens.


In [None]:
# Encode the test parent comments with the training-fitted parent tokenizer.
parent_test_sequences = parent_tokenizer.texts_to_sequences(data_cnn_test["parent_processed_comment"].tolist())
padded_test_parents = pad_sequences(parent_test_sequences, maxlen=MAX_SEQUENCE_LENGTH).astype("float64")
# (An all-zero array of the same shape was used as an ablation of these features.)
parent_sentiment_test = data_cnn_test[["pc_pos", "pc_neu", "pc_neg", "sarcasm_proportion_by_subreddit", "total_num_comments_by_subreddit"]].to_numpy()
parent_data_cnn_test_final = np.concatenate([padded_test_parents, parent_sentiment_test], axis=1)

### End of Parent Tokenizer

In [1]:
# Embedding matrix for the comment vocabulary: row i holds the vector of the
# word with tokenizer index i (row 0 stays zero). Words missing from the
# pretrained vectors get a random vector instead.
train_embedding_weights = np.zeros((len(train_word_index) + 1, EMBEDDING_DIM))
for word, index in train_word_index.items():
    if word in word2vec:
        word_vector = word2vec[word]
    else:
        word_vector = np.random.rand(EMBEDDING_DIM)
    train_embedding_weights[index, :] = word_vector
print(train_embedding_weights.shape)

NameError: ignored

In [None]:
# Embedding matrix for the parent-comment vocabulary, built the same way:
# pretrained vector when available, random vector otherwise, row 0 left zero.
parent_train_embedding_weights = np.zeros((len(parent_train_word_index) + 1, EMBEDDING_DIM))
for word, index in parent_train_word_index.items():
    if word in parent_word2vec:
        parent_word_vector = parent_word2vec[word]
    else:
        parent_word_vector = np.random.rand(EMBEDDING_DIM)
    parent_train_embedding_weights[index, :] = parent_word_vector
print(parent_train_embedding_weights.shape)

In [None]:
def Parent_Convolution_Net_Alt5(embeddings, parent_embeddings, max_sequence_length, num_words, parent_num_words, 
                           embedding_dim, labels_index, filters, parent_filters, 
                           filter_sizes, parent_filter_sizes):
    """Build and compile the two-branch (comment + parent comment) CNN.

    The model takes one tensor of shape ``(2, max_sequence_length + 5)`` per
    sample: row 0 is the comment, row 1 the parent comment. Each row is
    ``max_sequence_length`` token ids followed by 5 numeric features.

    Parameters:
        embeddings / parent_embeddings: pretrained embedding matrices for the
            comment and parent vocabularies (kept frozen).
        max_sequence_length: number of token ids at the start of each row.
        num_words / parent_num_words: number of rows in each embedding matrix.
        embedding_dim: width of an embedding vector.
        labels_index: number of output units.
        filters / parent_filters: Conv1D filter count per kernel size.
        filter_sizes / parent_filter_sizes: Conv1D kernel sizes per branch.

    Returns:
        A compiled Keras ``Model`` (binary cross-entropy loss, RMSprop
        optimizer, accuracy metric).
    """
    # Number of numeric features appended after the token ids in every row.
    num_tail_features = 5
    # Only the first 3 parent features are used; the remaining 2 are dropped.
    # In this notebook's data they are the subreddit statistics, which the
    # comment row already carries.
    num_parent_features = 3

    embedding_layer = Embedding(num_words,
                            embedding_dim,
                            weights=[embeddings],
                            input_length=max_sequence_length,
                            trainable=False)

    parent_embedding_layer = Embedding(parent_num_words,
                            embedding_dim,
                            weights=[parent_embeddings],
                            input_length=max_sequence_length,
                            trainable=False)

    sequence_input = Input(shape=(2, max_sequence_length + num_tail_features))
    comment_row, parent_row = tf.split(sequence_input, 2, 1)
    comment_row = tf.squeeze(comment_row, axis=1)
    parent_row = tf.squeeze(parent_row, axis=1)

    # Split sizes follow max_sequence_length rather than a literal 200, so the
    # model still builds when the sequence length is changed.
    comment_sequence_input, non_text_comment_input = tf.split(
        comment_row, [max_sequence_length, num_tail_features], 1)
    comment_sequence_input = tf.cast(comment_sequence_input, dtype="int64")
    parent_sequence_input, non_text_parent_input, _ = tf.split(
        parent_row,
        [max_sequence_length, num_parent_features, num_tail_features - num_parent_features],
        1)
    parent_sequence_input = tf.cast(parent_sequence_input, dtype="int64")

    embedded_sequences = embedding_layer(comment_sequence_input)
    parent_embedded_sequences = parent_embedding_layer(parent_sequence_input)

    # One Conv1D + global max pool per kernel size, for each branch.
    # (Earlier experiments with explicit initializers/regularizers are disabled.)
    convs = []

    for filter_size in filter_sizes:
        l_conv = Conv1D(filters=filters, kernel_size=filter_size,
                        activation='relu')(embedded_sequences)
        l_pool = GlobalMaxPooling1D()(l_conv)
        convs.append(l_pool)

    for filter_size in parent_filter_sizes:
        parent_l_conv = Conv1D(filters=parent_filters, kernel_size=filter_size,
                               activation='relu')(parent_embedded_sequences)
        parent_l_pool = GlobalMaxPooling1D()(parent_l_conv)
        convs.append(parent_l_pool)

    l_merge = concatenate(convs, axis=1)
    ## best 128,64
    x = Dropout(0.40)(l_merge)
    x = Dense(128, activation='relu')(x)
    x = Dense(64, activation='relu')(x)
    x = Dropout(0.2)(x)
    # The numeric features bypass the dense stack and join just before the output.
    x = concatenate([non_text_comment_input, non_text_parent_input, x], axis=1)
    preds = Dense(labels_index, activation='sigmoid')(x)

    # NOTE(review): both clipnorm and clipvalue are set; some Keras releases
    # may accept only one of them — confirm against the installed version.
    rmsprop_optimizer = RMSprop(learning_rate=0.0001, momentum=0.5, clipnorm=0.1, clipvalue=0.1)
    model = Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy',
                  optimizer=rmsprop_optimizer,
                  metrics=['acc'])
    #model.summary()
    return model

In [None]:
# Target column names, in the order of the model's output units, and the class
# label that corresponds to each position (index 0 -> 1, index 1 -> 0).
label_names = ['sarcasm', 'non_sarcasm']
labels = [1, 0]

In [None]:
# Targets: one column per entry of label_names.
y_train = data_cnn_train[label_names].values
# Inputs: pair each comment row with its parent row -> (samples, 2, features).
x_train = np.stack([data_cnn_train_final, parent_data_cnn_train_final], axis=1)
y_tr = y_train
print(x_train.shape)

In [None]:
# Display the last five columns (the appended numeric features) of the first five training rows.
data_cnn_train_final[:5, -5:]

In [None]:
# Instantiate the two-branch CNN; each embedding matrix has one more row than
# its word index because row 0 is reserved.
model_alt = Parent_Convolution_Net_Alt5(
    embeddings=train_embedding_weights,
    parent_embeddings=parent_train_embedding_weights,
    max_sequence_length=MAX_SEQUENCE_LENGTH,
    num_words=len(train_word_index) + 1,
    parent_num_words=len(parent_train_word_index) + 1,
    embedding_dim=EMBEDDING_DIM,
    labels_index=len(label_names),
    filters=NUM_FILTERS,
    parent_filters=PARENT_NUM_FILTERS,
    filter_sizes=FILTER_SIZES,
    parent_filter_sizes=PARENT_FILTER_SIZES,
)

In [None]:
# Train for a single epoch, holding out 10% of the training data for validation.
hist = model_alt.fit(
    x_train,
    y_tr,
    batch_size=128,
    epochs=1,
    validation_split=0.1,
    shuffle=True,
)

In [None]:
# evaluate the model
x_test = np.swapaxes(np.array([data_cnn_test_final, parent_data_cnn_test_final]), 0, 1)
# Use the same two-column [sarcasm, non_sarcasm] targets the model was trained
# on; the 1-D 'label' column does not match the model's two outputs.
y_test = data_cnn_test[label_names].values

# evaluate() returns [loss, accuracy] because the model is compiled with metrics=['acc'].
train_err = model_alt.evaluate(x_train, y_tr, verbose=0)
test_err = model_alt.evaluate(x_test, y_test, verbose=0)
print('Train loss: %.3f, Test loss: %.3f' % (train_err[0], test_err[0]))
print('Train acc: %.3f, Test acc: %.3f' % (train_err[1], test_err[1]))

In [None]:
# plot loss during training
# pyplot is imported here because it is not in scope at this point in the notebook.
from matplotlib import pyplot

pyplot.title('Binary CrossEntropy')
pyplot.plot(hist.history['loss'], label='train')
# val_loss comes from the validation_split hold-out, not from the test set.
pyplot.plot(hist.history['val_loss'], label='validation')
pyplot.legend()
pyplot.show()

In [None]:
# Pair each test comment row with its parent row and score the whole test set.
test_model_input = np.swapaxes(
    np.array([data_cnn_test_final, parent_data_cnn_test_final]), 0, 1)
predictions_test = model_alt.predict(test_model_input, batch_size=1024, verbose=1)

In [None]:
# Turn each pair of output scores into a class label: the position of the
# larger score selects the matching entry of `labels`.
prediction_labels_test = [labels[np.argmax(scores)] for scores in predictions_test]

print(data_cnn_test.shape)
print(len(prediction_labels_test))

In [None]:
# Test accuracy: fraction of predicted labels equal to the true label.
correct_predictions = data_cnn_test.label == prediction_labels_test
print(sum(correct_predictions)/len(prediction_labels_test))

In [None]:
# Save the model trained above (model_alt); the name `model` is only bound
# later in the notebook, by load_model.
model_alt.save("twitter200_200dim_200len_3.1_epochs_parent_300_100_model.h5")

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt

# F1 score on the test split, followed by the confusion matrix plot.
test_f1 = f1_score(data_cnn_test.label, prediction_labels_test)
print(test_f1)
mat = confusion_matrix(data_cnn_test.label, prediction_labels_test)
disp = ConfusionMatrixDisplay(mat)
disp.plot()

In [None]:
from tensorflow.keras.models import load_model
# Reload a previously saved model and display its weights.
# NOTE(review): this file name ("..._3_epochs_...") differs from the one used
# by the save cell ("..._3.1_epochs_...") — confirm which checkpoint is intended.
model = load_model("twitter200_200dim_200len_3_epochs_parent_300_100_model.h5")
model.get_weights()

In [None]:
# Class balance of the test split, then of the training split.
for split_frame in (data_cnn_test, data_cnn_train):
    print(split_frame.label.value_counts())

In [None]:
# Display one weight array of the trained model.
# NOTE(review): index 18 is a magic number — presumably one of the final Dense
# layer's arrays; confirm against model_alt.summary().
model_alt.get_weights()[18]