In [1]:
from keras.models import Model, load_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

import numpy as np
import pandas as pd

def comments_to_seq2(c):
    """Turn each comment into a sequence of three-flag arrays, one per token.

    Every comment in ``c`` is split on single spaces. For each resulting
    token a length-3 NumPy array of 0/1 values is produced:

    * position 0 -- 0 if the token contains ``"..."``, otherwise 1
    * position 1 -- 0 if the token is equal to its own upper-cased form
      (this also holds for tokens without cased characters, such as ``""``
      or ``"123"``), otherwise 1
    * position 2 -- 0 if the token contains ``"!"``, otherwise 1

    Parameters
    ----------
    c : iterable of str
        The comments to encode.

    Returns
    -------
    list of list of numpy.ndarray
        One inner list per comment, holding one flag array per token.
    """
    def _token_flags(token):
        # Note the inverted encoding: 0 marks "feature present".
        has_ellipsis = "..." in token
        is_all_upper = token == token.upper()
        has_bang = "!" in token
        return np.asarray([
            int(not has_ellipsis),
            int(not is_all_upper),
            int(not has_bang),
        ])

    encoded = []
    for text in c:
        encoded.append([_token_flags(token) for token in text.split(" ")])
    return encoded

# Vocabulary cap handed to the Tokenizer (its `num_words` argument).
MAX_NB_WORDS = 50000
# Length (in tokens) that every encoded comment is padded/truncated to
# when passed as `maxlen` to pad_sequences.
MAX_SEQUENCE_LENGTH = 30

# key.csv only contributes its column names: the balanced train file is read
# with header=None and those names applied.
key_df = pd.read_csv('data/key.csv', sep='\t')
train_df = pd.read_csv(
    'data/train-balanced.csv',
    sep='\t',
    header=None,
    names=list(key_df),
)
tweet_df = pd.read_csv('clean_tweet_data.tsv', sep='\t')
print("Read in CSVs")

# str() guards against non-string cells (e.g. NaN for an empty comment).
comments = [str(x) for x in train_df["comment"]]  # add in parent comment later

# `num_words` is the Keras 2 name of this argument; the former `nb_words`
# spelling is deprecated and only accepted with a warning.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(comments)
print("Fitted Tokenizer to text")


Using TensorFlow backend.


Read in CSVs




Fitted Tokenizer to text


In [7]:
import re
# Encode the cleaned tweets with the tokenizer fitted on the training comments.
input_comments = [str(x) for x in tweet_df["clean_tweet_text"]]

sequences = tokenizer.texts_to_sequences(input_comments)

# The model takes two inputs: padded word-index sequences and padded
# per-token flag triples from comments_to_seq2.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
data2 = pad_sequences(comments_to_seq2(input_comments), maxlen=MAX_SEQUENCE_LENGTH)

model = load_model("large_feature_model_2.h5")

# Hashtag-derived labels: 1 when the raw tweet text contains #sarcasm or
# #sarcastic (case-insensitive). na=False maps a missing tweet_text to False;
# without it the result is NaN, which is truthy and would be labelled 1.
st_ser = tweet_df['tweet_text'].str.contains(
    "#sarcasm|#sarcastic", flags=re.IGNORECASE, na=False
)
labels_bin_hash = [1 if x else 0 for x in st_ser]
# num_classes=2 keeps the one-hot width fixed even if only one class occurs.
# NOTE(review): assumes the loaded model has a two-unit output -- confirm.
labels_hash = to_categorical(labels_bin_hash, num_classes=2)

# Labels taken from the `sarcastic` column, plus the one-hot form that
# model.evaluate needs (previously `labels` was used without being defined).
labels_bin = np.asarray(tweet_df["sarcastic"])
labels = to_categorical(labels_bin, num_classes=2)

results = model.predict(x=[data, data2])
# Predicted class = index of the largest score in each row.
y_predict = np.argmax(results, axis=1)
print(labels_bin)
print(y_predict)

# Same report for both label sources; the predictions are shared.
for title, y_true, y_true_onehot in (
    ("Hashtag scores:", labels_bin_hash, labels_hash),
    ("Annotated scores:", labels_bin, labels),
):
    print(title)
    print("Precision:", precision_score(y_true, y_predict))
    print("Recall:", recall_score(y_true, y_predict))
    print("F1:", f1_score(y_true, y_predict))

    print(model.evaluate(x=[data, data2], y=y_true_onehot))
    print(model.metrics_names)

[0 0 0 ..., 0 0 0]
[0 1 0 ..., 0 0 0]
Hashtag scores:
Precision: 0.666666666667
Recall: 0.229787234043
F1: 0.341772151899
['loss', 'acc']
Annotated scores:
Precision: 0.172839506173
Recall: 0.172839506173
F1: 0.172839506173
['loss', 'acc']


In [5]:
# Balanced test split, read headerless with the column names from key.csv.
test_df = pd.read_csv(
    'data/test-balanced.csv',
    sep='\t',
    header=None,
    names=list(key_df),
)
test_comments = list(map(str, test_df["comment"]))

# Build the same two model inputs as for the tweets: word indices and
# per-token flag triples, both padded to MAX_SEQUENCE_LENGTH.
sequences_test = tokenizer.texts_to_sequences(test_comments)
data_test = pad_sequences(sequences_test, maxlen=MAX_SEQUENCE_LENGTH)
data2_test = pad_sequences(
    comments_to_seq2(test_comments),
    maxlen=MAX_SEQUENCE_LENGTH,
)

labels_bin_test = np.asarray(test_df["label"])

results_test = model.predict(x=[data_test, data2_test])
# Predicted class = index of the largest score in each row.
y_predict_test = np.argmax(results_test, axis=1)

for metric_label, metric_fn in (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1:", f1_score),
):
    print(metric_label, metric_fn(labels_bin_test, y_predict_test))

print(labels_bin_test)
print(y_predict_test)

Precision: 0.751416769855
Recall: 0.660837493243
F1: 0.703222341114
[0 0 0 ..., 1 1 1]
[0 1 0 ..., 0 0 1]


In [21]:
pd.set_option("display.max_colwidth", 50)
# Number of tweet rows whose `sarcastic` column equals 1.
len(tweet_df.loc[tweet_df["sarcastic"] == 1, "clean_tweet_text"])

324

In [45]:
import re
# Row positions of the tweets the model assigned to class 1.
sarcastic_predicted_indices = [
    pos for pos, predicted in enumerate(y_predict) if predicted == 1
]
s_df = tweet_df.iloc[sarcastic_predicted_indices]
# For those rows, test whether the raw text contains #sarcasm / #sarcastic
# (case-insensitive).
s_ser = s_df['tweet_text'].str.contains("#sarcasm|#sarcastic", flags=re.IGNORECASE)
s_ar = np.asarray(s_ser)
# Explicit comparison with True so that only real matches are counted.
np.sum(s_ar == True)

216