In [8]:
import csv
import io
import json
import sys

import keras
import numpy as np
import pandas as pd
from gensim.models.keyedvectors import KeyedVectors
from nltk import word_tokenize
from nltk.corpus import stopwords

stop_words = set(stopwords.words("english"))

### Note:
The functions below have already been run to produce the relevant corpora, vocabulary, word vectors, etc. for our experiments, so there is no need to run them again.

## Making combined stance corpus

In [None]:
# Load the raw article bodies and headline/stance tables for both splits.
train_bodies = pd.read_csv("./data/train_bodies.csv")
test_bodies = pd.read_csv("./data/test_bodies.csv")
train_headlines = pd.read_csv("./data/train.csv")
test_headlines = pd.read_csv("./data/test.csv")
# Stack train and test bodies into one table with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
# the result is the same on older pandas versions.
combined_bodies = pd.concat([train_bodies, test_bodies], ignore_index=True)

def combined_dataset(pd_headlines, combined_bodies, output_file_name, train=True):
    '''Join each headline with its article body and write the rows to a CSV file.

    The output file has the header row "Headline,Body,Stance" followed by one
    row per entry of pd_headlines.

    Parameters:
        pd_headlines: DataFrame with one row per headline. When train is True
            the row's index label is unpacked as (headline, body_id); otherwise
            the headline is read from the `body` column and the body id from
            the `article` column. The stance label is always read from the
            `header` column.
        combined_bodies: DataFrame with a 'Body ID' column and an
            'articleBody' column.
        output_file_name: path of the CSV file to write.
        train: selects how headline and body id are extracted (see above).

    Raises:
        IndexError: if a body id has no matching row in combined_bodies
            (same exception type the original positional lookup raised).
    '''
    # Build the id -> text lookup once instead of scanning the whole bodies
    # table for every headline. setdefault keeps the FIRST body seen for an
    # id, matching the original `.values[0]` selection.
    body_by_id = {}
    for known_id, body_text in zip(combined_bodies['Body ID'], combined_bodies['articleBody']):
        body_by_id.setdefault(known_id, body_text)

    combined = [["Headline", "Body", "Stance"]]
    for index in range(len(pd_headlines)):
        row = pd_headlines.iloc[index]  # fetch the row once and reuse it
        if train:
            headline, body_id = row.name
        else:
            headline, body_id = row.body, row.article
        label = row.header
        if body_id not in body_by_id:
            raise IndexError("no article body found for Body ID {!r}".format(body_id))
        combined.append([headline, body_by_id[body_id], label])

    # csv.writer needs a binary file on Python 2 but a text file opened with
    # newline='' on Python 3; opening with 'wb' on Python 3 raises TypeError.
    if sys.version_info[0] < 3:
        out_file = open(output_file_name, 'wb')
    else:
        out_file = open(output_file_name, 'w', newline='', encoding='utf-8')
    with out_file as f:
        writer = csv.writer(f)
        writer.writerows(combined)
    
# Write the merged headline/body/stance CSV for the training split ...
combined_dataset(
    pd_headlines=train_headlines,
    combined_bodies=combined_bodies,
    output_file_name="combined_train_stance.csv",
    train=True,
)
# ... and for the test split, whose headline table uses the non-train layout.
combined_dataset(
    pd_headlines=test_headlines,
    combined_bodies=combined_bodies,
    output_file_name="combined_test_stance.csv",
    train=False,
)


## Pruning Word Vectors for task


In [10]:
# Maps each stance label string to its integer class id.
label_map = {'agree':0,'disagree':1, 'discuss':2, 'unrelated':3}
def tokenize_text(q, lower= False):
    '''Split a text into alphabetic, non-stop-word tokens.

    Parameters:
        q: the text to tokenize. Byte strings are decoded as UTF-8 first;
           text strings are passed to word_tokenize unchanged.
        lower: when True, the returned tokens are lower-cased.

    Returns:
        A list of tokens. If q cannot be decoded or is not a string at all,
        the value is printed and ["<UNK>"] is returned instead.
    '''
    try:
        # Only byte strings have to be decoded. Calling .decode on a text
        # string fails on Python 3 (and on non-ASCII unicode on Python 2),
        # which used to turn every such text into <UNK>.
        if isinstance(q, bytes):
            q = q.decode('utf-8')
        tokens = word_tokenize(q)
    except (UnicodeError, TypeError, AttributeError):
        # Best-effort fallback for undecodable or non-string input only;
        # unrelated failures are no longer swallowed by a bare except.
        print(q)
        print("assigning as UNK due to error")
        return ["<UNK>"]
    # Keep purely alphabetic tokens. The stop-word check is done on the
    # lower-cased form so capitalised stop words such as "The" or "I" are
    # removed as well (the entries of stop_words are lower-case).
    word_tokens = [word for word in tokens
                   if word.isalpha() and word.lower() not in stop_words]
    if lower:
        # A list comprehension returns a real list on Python 2 and 3 alike.
        word_tokens = [word.lower() for word in word_tokens]
    return word_tokens


# Shared vocabulary tables filled in by make_word_vocab():
# word_to_int maps token -> integer id, int_to_word maps integer id -> token.
word_to_int = {}
int_to_word = {}
def make_word_vocab(all_texts):
    '''Add every previously unseen token of all_texts to the vocabulary.

    Each text is tokenized with tokenize_text(text, lower=True). A new token
    receives the next free integer id (the current vocabulary size) and is
    recorded in both module-level tables, word_to_int and int_to_word.

    Parameters:
        all_texts: iterable of texts to scan.

    Returns:
        None; the module-level dictionaries are updated in place.
    '''
    for text in all_texts:
        for token in tokenize_text(text, lower=True):
            if token not in word_to_int:
                # Take the id BEFORE inserting so both tables use the same
                # number; using len(word_to_int) after the insert shifted
                # int_to_word by one relative to word_to_int.
                token_id = len(word_to_int)
                word_to_int[token] = token_id
                int_to_word[token_id] = token
                
                
seq_length_list = []

# --- Stance training set -----------------------------------------------
# Split the combined CSV into headlines, bodies and integer stance labels.
# NOTE(review): combined_dataset() is called with a bare file name
# ("combined_train_stance.csv"), while this cell reads from the
# "Stance_Detection/" directory - confirm where the files are expected.
training_df = pd.read_csv("Stance_Detection/combined_train_stance.csv")
train_headlines = training_df['Headline'].tolist()
train_articles = training_df['Body'].tolist()  # one article body per row
# List comprehensions instead of map(): on Python 3 map() returns an
# iterator, which breaks both the `+=` extensions below and np.asarray().
train_labels = [label_map[stance] for stance in training_df['Stance'].tolist()]
# Each text is the headline and the article joined by a single space.
all_texts = [headline + " " + article
             for headline, article in zip(train_headlines, train_articles)]
# One-hot encode the labels. num_classes pins the width to the number of
# entries in label_map even if a split happens to lack the highest class.
train_labels = keras.utils.to_categorical(np.asarray(train_labels),
                                          num_classes=len(label_map))


# --- Stance testing set ------------------------------------------------
test_df = pd.read_csv("Stance_Detection/combined_test_stance.csv")
test_headlines = test_df['Headline'].tolist()
test_articles = test_df['Body'].tolist()  # one article body per row
test_labels = [label_map[stance] for stance in test_df['Stance'].tolist()]
all_texts += [headline + " " + article
              for headline, article in zip(test_headlines, test_articles)]
test_labels = keras.utils.to_categorical(np.asarray(test_labels),
                                         num_classes=len(label_map))


# --- Kaggle dataset ----------------------------------------------------
# Its titles and texts only contribute to the vocabulary / word vectors.
# str() is applied because the values are not guaranteed to be strings
# (presumably missing values are read as NaN - TODO confirm).
fake_dataset_df = pd.read_csv("final_kaggle_combined_dataset.csv")
all_texts += [str(title) + " " + str(text)
              for title, text in zip(fake_dataset_df['title'].tolist(),
                                     fake_dataset_df['text'].tolist())]


# Build the vocabulary over every collected text.
make_word_vocab(all_texts)



# Original plan noted by the author: learn word vectors from the pretrained
# model and continue training to obtain data-specific embeddings (resolves
# the issue with unknown words), then use the WMD implementation and a simple
# averaged vector representation. This cell only loads the pretrained vectors.
word_vectors_loaded = False  # stays False if the load below raises

word_vectors = KeyedVectors.load_word2vec_format(
    "word_vectors/GoogleNews-vectors-negative300.bin",
    binary=True
)
# Remember the embedding dimensionality and mark the vectors as available.
word_vectors_loaded, ndims = True, word_vectors.vector_size
print("Loaded word vectors")

def obtain_only_relevant_vectors(word_vectors, vocab, output_path='corpus_relevant_vectors_.txt'):
    '''Write the vectors of the vocabulary words to a word2vec-style text file.

    The first line of the file is "<number of vectors> <dimensions>"; every
    following line is a word followed by its space-separated vector
    components. Words that have no vector in word_vectors are skipped.

    Parameters:
        word_vectors: object exposing `vector_size` and `word_vectors[word]`
            lookup that raises KeyError for unknown words.
        vocab: iterable of words; duplicates are ignored.
        output_path: file to write (defaults to the original hard-coded name).

    Returns:
        None. The number of vectors written and a completion message are
        printed.
    '''
    ndims = word_vectors.vector_size
    res_vectors = []
    for word in set(vocab):
        try:
            res_vectors.append((word, list(word_vectors[word])))
        except KeyError:
            # Word has no pretrained vector; leave it out of the pruned file.
            continue
    # Always write UTF-8 text. The original relied on the platform default
    # encoding and fell back to word.encode('utf-8'), which on Python 3
    # writes a b'...' repr into the file instead of the word itself.
    with io.open(output_path, 'w', encoding='utf-8') as f:
        f.write(u"{} {}\n".format(len(res_vectors), ndims))
        print(len(res_vectors))
        for word, vector in res_vectors:
            if isinstance(word, bytes):
                word = word.decode('utf-8')
            components = u' '.join(str(value) for value in vector)
            f.write(u"{} {}\n".format(word, components))
        print("Finished writing corpus relevant vectors")
# Prune the loaded vectors down to the words present in the vocabulary.
obtain_only_relevant_vectors(word_vectors=word_vectors, vocab=word_to_int.keys())


UnicodeDecodeError: 'utf8' codec can't decode byte 0x92 in position 81: invalid start byte

## Token vocabulary

In [1]:
def save_word_vocab(vocab, path='vocab_.json'):
    '''Serialize the vocabulary mapping to a JSON file.

    Parameters:
        vocab: JSON-serializable mapping to store.
        path: destination file. Defaults to the original hard-coded
            'vocab_.json'; note that load_word_vocab() defaults to
            "vocab.json", so pass the same path to both to round-trip.
    '''
    with open(path, 'w') as fp:
        json.dump(vocab, fp)
# Persist the token -> id table built by make_word_vocab().
save_word_vocab(vocab=word_to_int)


In [2]:
def load_word_vocab(path="vocab.json"):
    '''Read a vocabulary JSON file and derive its inverse mapping.

    Parameters:
        path: JSON file holding the word -> id mapping.

    Returns:
        A (word_to_int, int_to_word) tuple: the mapping as loaded from the
        file and a dictionary with its keys and values swapped.
    '''
    with open(path, 'r') as vocab_file:
        forward = json.load(vocab_file)
    # Swap keys and values to obtain the id -> word direction.
    reverse = dict((index, token) for token, index in forward.items())
    return forward, reverse
