In [1]:
#dependencies
import pickle
from keras.models import model_from_json
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))
import json
from keras.preprocessing.sequence import pad_sequences
import keras
import numpy as np
import csv
from sklearn import metrics

Using TensorFlow backend.


In [2]:
# --- File locations and sequence-length settings ---

# CSV to score; of format headline, body.
input_csv = 'data_stance/combined_test_stance.csv'

# JSON file holding the word -> integer-id vocabulary.
vocab_dir = 'vocab.json'

# Pickled TF-IDF vectorizers and the text file storing the TF-IDF width.
tfidf_vectorizer_headlines_dir = "best_stance_model_parameters/headline_tfidf_vectorizer.pk"
tfidf_vectorizer_articles_dir = "best_stance_model_parameters/articles_tfidf_vectorizer.pk"
tfidf_dims_file = "best_stance_model_parameters/tf_idf_dims.txt"

# Trained model: architecture (JSON) and weights (HDF5).
final_dl_model_json = "best_stance_model_parameters/final_tfidf_bilstm.json"
final_dl_model_h5 = "best_stance_model_parameters/final_tfidf_bilstm.h5"

# Lengths the integer sequences are padded to.
max_head_length = 30      # assumption: a headline has at most 30 words
max_article_length = 200  # TODO: store this in a txt file later

# Word vectors are not loaded here (already stored by keras):
# word_vectors_dir = "../word_vectors/corpus_relevant_vectors.txt"

In [3]:
"""Loading tfidf vectorizer and word vocab"""
# Load the fitted TF-IDF vectorizers and the expected width of the combined
# TF-IDF feature vector.
# - Pickle files are binary, so they are opened with 'rb'.
# - Each vectorizer is read from the path whose name matches the variable
#   (headlines <- headline pickle, articles <- articles pickle).
#   NOTE(review): if the model was trained with the two vectorizers paired
#   the other way round, predictions depend on that pairing -- confirm
#   against the training notebook.
# - pickle.load can execute arbitrary code; only load trusted files.
with open(tfidf_vectorizer_headlines_dir, 'rb') as fin:
    tfidf_vectorizer_headlines = pickle.load(fin)

with open(tfidf_vectorizer_articles_dir, 'rb') as fin:
    tfidf_vectorizer_articles = pickle.load(fin)

# The dims file holds a single integer (surrounding whitespace is tolerated by int()).
with open(tfidf_dims_file, 'r') as fin:
    tfidf_dims = int(fin.read())

def load_word_vocab(path="vocab.json"):
    """Read the word -> integer vocabulary from a JSON file and build its inverse.

    Args:
        path: Location of the JSON file containing the word -> integer mapping.

    Returns:
        tuple: ``(word_to_int, int_to_word)`` where the second dict maps each
        integer back to its word.
    """
    with open(path, 'r') as vocab_file:
        forward = json.load(vocab_file)
    # Invert the mapping once the file has been parsed.
    backward = dict((index, token) for token, index in forward.items())
    return forward, backward

word_to_int, int_to_word = load_word_vocab(vocab_dir)

# Give the unknown-word token the next free slot in each mapping.
unk_token = "<UNK>"
word_to_int[unk_token] = len(word_to_int)
int_to_word[len(int_to_word)] = unk_token

# View of every token currently known to the vocabulary.
word_tokens_in_corpus = word_to_int.keys()
print("Loaded vocab with {} tokens ".format(len(word_tokens_in_corpus)))


Loaded vocab with 155776 tokens 


In [4]:
# Rebuild the trained stance model: architecture from the JSON description,
# weights from the HDF5 file.
# The context manager closes the JSON file even if reading it raises.
with open(final_dl_model_json, 'r') as json_file:
    loaded_model_json = json_file.read()
stance_model = model_from_json(loaded_model_json)
stance_model.load_weights(final_dl_model_h5)
print("Succesfully loaded model")


Succesfully loaded model


In [5]:
'''Text processing functions'''
def tokenize_text(q, lower= True):
    """Split raw text into alphabetic, stop-word-filtered word tokens.

    Args:
        q: Text to tokenize. Byte strings are decoded as UTF-8; text that is
            already unicode is passed to the tokenizer unchanged.
        lower: When true (the default) the surviving tokens are lower-cased.

    Returns:
        list: The word tokens. Empty when ``q`` cannot be decoded or
        tokenized (for example a non-string value).
    """
    try:
        # Only byte strings need decoding; calling .decode on text that is
        # already unicode fails (or does an implicit ASCII round trip), which
        # used to discard the whole input.
        text = q.decode('utf-8') if isinstance(q, bytes) else q
        tokens = word_tokenize(text)
    except Exception:
        # Best effort: undecodable or non-string input contributes no tokens
        # rather than aborting the whole batch.
        return []
    # Keep purely alphabetic tokens, then drop stop words. Stop words are
    # matched case-sensitively, before lower-casing.
    # NOTE(review): presumably this order must match the preprocessing used
    # when the model was trained -- confirm before changing it.
    word_tokens = [word for word in tokens
                   if word.isalpha() and word not in stop_words]
    if lower:
        # A real list (not a lazy map object) so callers can take len() of it.
        word_tokens = [word.lower() for word in word_tokens]
    return word_tokens


# Length of every sequence produced by get_word_to_int_sequence, in call order.
seq_length_list = []

def get_word_to_int_sequence(tokens):
    """Convert tokens to integer ids, registering unseen tokens in the vocab.

    Args:
        tokens: Iterable of word tokens.

    Returns:
        list: The integer id of each token, in order.

    Side effects:
        Adds every unseen token to the module-level ``word_to_int`` and
        ``int_to_word`` mappings, and appends the sequence length to
        ``seq_length_list``.
    """
    seq = []
    for token in tokens:
        if token not in word_to_int:
            # Take the id *before* inserting so both mappings use the same
            # value; measuring the dict after insertion is one too high.
            # NOTE(review): ids assigned here lie beyond the vocabulary that
            # was loaded from disk; presumably the model's embedding layer
            # must be able to accept them -- confirm.
            new_index = len(word_to_int)
            word_to_int[token] = new_index
            int_to_word[new_index] = token
        seq.append(word_to_int[token])
    seq_length_list.append(len(seq))
    return seq

def get_predictions_metrics(predictions, test_labels):
    """Score predictions against ground-truth labels.

    Args:
        predictions: Sequence of per-class score rows; the argmax of each row
            is taken as the predicted class id.
        test_labels: Sequence of label rows; the argmax of each row is taken
            as the true class id (presumably one-hot rows -- confirm).

    Returns:
        tuple: ``(accuracy, precision, recall, f1, conf_matrix)``. Precision,
        recall and F1 are micro-averaged. ``conf_matrix`` is computed with the
        true labels as the first argument and the predicted labels as the
        second, which is the ``(y_true, y_pred)`` order sklearn documents.
    """
    # Concrete lists: each is reused by five metric calls, which a one-shot
    # iterator could not support.
    predicted_labels = [int(np.argmax(row)) for row in predictions]
    ground_truth_labels = [int(np.argmax(row)) for row in test_labels]
    # sklearn metric functions take (y_true, y_pred) in that order.
    accuracy = metrics.accuracy_score(ground_truth_labels, predicted_labels)
    precision = metrics.precision_score(ground_truth_labels, predicted_labels, average = 'micro')
    recall = metrics.recall_score(ground_truth_labels, predicted_labels, average = 'micro')
    f1 = metrics.f1_score(ground_truth_labels, predicted_labels, average = 'micro')
    conf_matrix = metrics.confusion_matrix(ground_truth_labels, predicted_labels)
    return accuracy, precision, recall, f1, conf_matrix

# Stance class name -> integer class id, and the reverse lookup used to turn
# a predicted class id back into its name.
label_map = dict(agree=0, disagree=1, discuss=2, unrelated=3)
inverse_label_map = dict((num, string) for string, num in label_map.items())

In [6]:
def get_stance(headlines, bodies):
    """Predict the stance of each body towards its headline.

    Args:
        headlines: List of headline texts.
        bodies: List of body texts, aligned with ``headlines``.

    Returns:
        tuple: ``(predictions, predicted_labels)`` where ``predictions`` is
        the array returned by ``stance_model.predict`` and
        ``predicted_labels`` is its argmax along axis 1.

    Raises:
        AssertionError: If the concatenated TF-IDF vector width differs from
            ``tfidf_dims``.
    """
    # TF-IDF features: headline vector followed by article vector.
    # np.asarray turns the np.matrix returned by todense() into a plain ndarray.
    headlines_tfidf_vec = np.asarray(tfidf_vectorizer_headlines.transform(headlines).todense())
    articles_tfidf_vec = np.asarray(tfidf_vectorizer_articles.transform(bodies).todense())
    concatenated_tfidf_vec = np.concatenate((headlines_tfidf_vec, articles_tfidf_vec), axis=-1)
    assert concatenated_tfidf_vec.shape[-1] == tfidf_dims, \
        "TF-IDF width {} does not match expected {}".format(concatenated_tfidf_vec.shape[-1], tfidf_dims)

    # Integer word sequences. Explicit lists (not lazy map objects) so that
    # pad_sequences receives a sized sequence on Python 3 as well.
    headline_sequences = [get_word_to_int_sequence(tokenize_text(text)) for text in headlines]
    article_sequences = [get_word_to_int_sequence(tokenize_text(text)) for text in bodies]

    headline_sequences = pad_sequences(headline_sequences, maxlen = max_head_length)
    article_sequences = pad_sequences(article_sequences, maxlen = max_article_length)

    # Model inputs, in order: headline ids, article ids, TF-IDF features.
    predictions = stance_model.predict([headline_sequences, article_sequences, concatenated_tfidf_vec], batch_size = 128)
    predicted_labels = np.argmax(predictions, axis=1)
    return predictions, predicted_labels

## Stance for testing set

In [9]:
# Read the CSV of headline/body pairs for which predictions are required.
# After loading, the two lists look like:
#   headlines = ["Headline1", "Headline2"], bodies = ["Body1", "Body2"]

name = "test_set"  # prefix of the output file name
df_input_csv = pd.read_csv(input_csv)
headlines, bodies = (df_input_csv[column].tolist() for column in ('Headline', 'Body'))




In [None]:
# Run the stance model over the headline/body lists loaded above; keeps both
# the raw model output and the argmax class id per pair.
prediction_dist, predicted_labels = get_stance(headlines, bodies)

In [11]:
# Write one CSV row per input pair: both texts, the four class scores and the
# name of the predicted class.
output_path = '{}_output_predictions.csv'.format(name)
with open(output_path,'w') as f:
    writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
    writer.writerow(["Headline","Body","Agree","Disagree","Discuss","Unrelated","Classification"])
    for i, label in enumerate(predicted_labels):
        scores = prediction_dist[i]
        writer.writerow([headlines[i], bodies[i], scores[0], scores[1], scores[2], scores[3], inverse_label_map[label]])
print("Outputs written to: {}".format(output_path))

Outputs written to: test_set_output_predictions.csv


## Stance for Kaggle subset

In [7]:
# Switch the inputs to the first 1000 rows of the Kaggle CSV.
# Note: this rebinds name, input_csv, df_input_csv, headlines and bodies.
name = "kaggle_first_1000"
input_csv = "../final_combined_kaggle_scraped_pruned.csv"
df_input_csv = pd.read_csv(input_csv)
headlines = df_input_csv['title'].head(1000).tolist()
bodies = df_input_csv['text'].head(1000).tolist()

In [8]:
# Run the stance model over the current headline/body lists; this rebinds
# prediction_dist and predicted_labels.
prediction_dist, predicted_labels = get_stance(headlines, bodies)

In [9]:
# Write the current predictions to "<name>_output_predictions.csv": one row
# per pair with both texts, the four class scores and the predicted class name.
predictions_csv = '{}_output_predictions.csv'.format(name)
header_row = ["Headline","Body","Agree","Disagree","Discuss","Unrelated","Classification"]
with open(predictions_csv,'w') as f:
    writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
    writer.writerow(header_row)
    for i, class_id in enumerate(predicted_labels):
        dist = prediction_dist[i]
        output_line = [headlines[i], bodies[i], dist[0], dist[1], dist[2], dist[3], inverse_label_map[class_id]]
        writer.writerow(output_line)
print("Outputs written to: {}".format(predictions_csv))

Outputs written to: kaggle_first_1000_output_predictions.csv


In [17]:
# Number of inputs whose predicted class id is 1 ('disagree' in label_map).
len(np.where(predicted_labels==1)[0])

13

In [32]:
# Disabled scratch cell: the prediction code below sits inside a string
# literal, so none of it is executed.
'''
Use model to predict
predictions = stance_model.predict([headline_sequences, article_sequences, concatenated_tfidf_vec], batch_size = 128)
predicted_labels = np.argmax(predictions, axis=1)
#predicted_labels = 
'''

In [None]:

# Sample article body for an ad-hoc check; no other visible cell reads `body`.
body = "The name Barack Obama infuriates President Trump. I believe he hates Obama more than he hates crooked Hillary, mainly because Obama was a successful and very popular president. So it was no surprise when one of the first things Trump wanted to do after taking office is tear apart Obamacare. If it had simply been called Health Care for Everyone or something like that, Trump might not have noticed. But it had Obama's name on it, and Trump couldn't tolerate that. So he wanted it torn down. So far, his efforts have totally failed, but that's nothing new. Next, Trump wants to erase all the good Obama did with easing U.S.-Cuba relations. The embargo is still in place. And yes, human rights violations are still taking place all the time in Cuba. But people are traveling there. It has been opened up to trade and to some news agencies. It is a relationship that will help the Cuban people in the long run. And it gives Americans a chance to see a nearby country that has been a mystery for decades."


In [None]:
# NOTE(review): `title` is not assigned anywhere in the visible notebook, so
# this cell would raise NameError on a fresh kernel -- define it or remove the cell.
title