In [None]:
from textblob import TextBlob

In [18]:
import numpy as np
import tensorflow as tf
import pandas as pd
import html
import re
import string
from fncbaseline.utils import dataset, generate_test_splits, score
from fncbaseline import feature_engineering
from nltk.corpus import stopwords
from gensim.summarization import summarize

In [19]:
# Load the default split and the 'test' split via the project's DataSet reader
# (each construction prints its stance/body counts, see the cell output).
train_dataset = dataset.DataSet()
test_dataset = dataset.DataSet('test')
# Return value is discarded here.
# NOTE(review): presumably this produces the hold-out split as a side effect — confirm in fncbaseline.
generate_test_splits.generate_hold_out_split(train_dataset)
# Module-level memoization cache read and written by preprocess().
global_map = dict()

Reading dataset
Total stances: 49972
Total bodies: 1683
Reading dataset
Total stances: 25413
Total bodies: 904


In [23]:
def preprocess(text,stop=True,do_summ=False):
    """Normalize a raw headline/body string into lower-cased, cleaned text.

    Results are memoized in the module-level ``global_map`` dict. The cache
    key is ``(text, stop, do_summ)`` so that calling with different flags on
    the same input never returns a result computed for other flags.

    Args:
        text: Raw input string.
        stop: When true, pass the cleaned text through ``remove_stopwords``.
        do_summ: When true, and the text has more than ``SUMMARY_LEN``
            whitespace-separated tokens, replace it with
            ``summarize(..., word_count=SUMMARY_LEN)`` before cleaning.
            NOTE(review): ``SUMMARY_LEN`` is not defined in the visible
            notebook; it must exist at module level before using this flag.

    Returns:
        The cleaned, stripped, lower-cased string.
    """
    cache_key = (text, stop, do_summ)
    if cache_key in global_map:
        return global_map[cache_key]

    if do_summ:
        # Runs of periods become newlines before counting tokens / summarizing.
        sentences = re.sub(r'[.]+', "\n", text)
        if len(sentences.split()) > SUMMARY_LEN:
            text = summarize(sentences, word_count=SUMMARY_LEN)

    text = html.unescape(text)
    # Literal backslash-n sequences (not real newlines) become spaces.
    text = text.replace("\\n", " ")
    # Strip the longer marker first; removing "_NEG" first would turn
    # "_NEGFIRST" into a stray "FIRST" token.
    text = text.replace("_NEGFIRST", "")
    text = text.replace("_NEG", "")
    text = re.sub(r"#", "", text)
    text = re.sub(r"\*", "", text)
    # Expand / drop common English contractions.
    text = re.sub(r"\'s", "", text)
    text = re.sub(r"\'m", " am", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r",", "", text)
    text = re.sub(r"!", " !", text)
    text = re.sub(r"\(", "", text)
    text = re.sub(r"\)", "", text)
    text = re.sub(r"\?", " ?", text)
    # Non-ASCII characters, then any remaining punctuation, become spaces.
    text = re.sub(r'[^\x00-\x7F]', ' ', text)
    text = re.sub(r'[^\w\s]', ' ', text)
    # Digits are dropped; raw string avoids the invalid "\d" escape warning.
    text = re.sub(r"\d+", " ", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = text.rstrip(',|.|;|:|\'|\"')
    text = text.lstrip('\'|\"')

    cleaned = text.strip().lower()
    if stop:
        cleaned = remove_stopwords(cleaned)
    global_map[cache_key] = cleaned
    return cleaned

def remove_stopwords(text):
    """Drop English stopwords from whitespace-tokenized text.

    Args:
        text: Input string; tokens are obtained with ``str.split()`` and
            compared as-is against ``stopwords.words('english')``.

    Returns:
        The remaining tokens joined by single spaces (empty string if none).
    """
    # A frozenset gives O(1) membership per token instead of scanning the list.
    stop_set = frozenset(stopwords.words('english'))
    return " ".join(word for word in text.split() if word not in stop_set)



In [26]:
def create_sentiment_features(data):
    """Build per-stance sentiment polarity features.

    For every entry in ``data.stances`` the headline text and the article
    body (``data.articles[int(stance['Body ID'])]``) are run through
    ``preprocess`` and scored with ``TextBlob(...).sentiment.polarity``.
    Scores are memoized per raw text so each distinct string is scored once.

    Args:
        data: Object exposing ``stances`` (iterable of dicts with
            ``'Headline'`` and ``'Body ID'`` keys) and ``articles``
            (indexable by integer body id).

    Returns:
        ``np.array`` with one row per stance:
        ``[headline_polarity, body_polarity, body_polarity - headline_polarity]``.
    """
    polarity_cache = {}

    def polarity_of(raw_text):
        # Score each distinct raw string only once.
        if raw_text not in polarity_cache:
            polarity_cache[raw_text] = TextBlob(preprocess(raw_text)).sentiment.polarity
        return polarity_cache[raw_text]

    rows = []
    for stance in data.stances:
        headline_polarity = polarity_of(stance['Headline'])
        body_polarity = polarity_of(data.articles[int(stance['Body ID'])])
        rows.append([headline_polarity, body_polarity, body_polarity - headline_polarity])

    return np.array(rows)


In [27]:
# Compute the [headline, body, body - headline] polarity rows for each split.
# Both calls share preprocess()'s global_map cache, so texts seen in the first
# call are not re-cleaned in the second.
train_features = create_sentiment_features(train_dataset)
test_features = create_sentiment_features(test_dataset)

In [30]:
# Persist the feature arrays in the current working directory; np.save appends
# the ".npy" extension because the given names do not already have it.
np.save('train_features_sentiment',train_features)
np.save('test_features_sentiment',test_features)

In [None]:
# NOTE(review): `train_fe` is not defined anywhere in the visible notebook and this
# cell was never executed (In [None]); it looks like an unfinished
# `train_features` expression and would raise NameError on Restart & Run All.
# TODO: complete or delete this cell.
train_fe