In [3]:
from FeatureGenerator import *
import ngram
import pickle
import pandas as pd
from nltk.tokenize import sent_tokenize
from helpers import *
import hashlib

class CountFeatureGenerator(FeatureGenerator):
    """Build and persist basic counting features for headline/body pairs.

    ``process`` adds n-gram count, unique-count, ratio, headline/body overlap,
    sentence-count and refuting-word columns to a DataFrame and pickles the
    resulting feature matrices. ``read`` loads one of those pickles back.
    """

    def __init__(self, name='countFeatureGenerator'):
        """Initialise the generator, forwarding ``name`` to the base class."""
        super(CountFeatureGenerator, self).__init__(name)

    @staticmethod
    def _dump_features(filename, feat_names, values):
        """Pickle the feature names, then the feature matrix, into ``filename``."""
        with open(filename, "wb") as outfile:
            pickle.dump(feat_names, outfile, -1)
            pickle.dump(values, outfile, -1)

    def process(self, df):
        """Add counting features to ``df`` in place and pickle them.

        Args:
            df: DataFrame that provides the columns ``Headline``,
                ``articleBody``, ``target``, ``Body ID`` and, for each of
                those two text fields, ``<field>_unigram``,
                ``<field>_bigram`` and ``<field>_trigram`` holding the
                row's n-grams.

        Side effects:
            Adds the feature columns to ``df``, prints progress/debug
            output, writes ``train.basic.pkl`` (rows whose ``target`` is not
            null) and, when rows with a null ``target`` exist,
            ``test.basic.pkl``. Each file holds two consecutive pickles:
            the list of feature names and the feature value matrix.
        """
        grams = ["unigram", "bigram", "trigram"]
        feat_names = ["Headline", "articleBody"]
        print("generate counting features")

        # Per-field n-gram statistics: total count, distinct count and their
        # ratio (computed by the helper ``try_divide``, defined elsewhere).
        for feat_name in feat_names:
            for gram in grams:
                source_col = feat_name + "_" + gram
                count_col = "count_of_%s_%s" % (feat_name, gram)
                unique_col = "count_of_unique_%s_%s" % (feat_name, gram)
                ratio_col = "ratio_of_unique_%s_%s" % (feat_name, gram)
                # Iterate the single source column directly rather than
                # building a full row object per record via df.apply(axis=1).
                df[count_col] = [len(tokens) for tokens in df[source_col]]
                df[unique_col] = [len(set(tokens)) for tokens in df[source_col]]
                df[ratio_col] = list(map(try_divide, df[unique_col], df[count_col]))

        # Overlapping n-grams: how many headline n-grams (duplicates counted)
        # also occur in the body, and that count relative to the headline's
        # total n-gram count.
        for gram in grams:
            overlap_col = "count_of_Headline_%s_in_articleBody" % gram
            overlap_counts = []
            for head_grams, body_grams in zip(df["Headline_" + gram], df["articleBody_" + gram]):
                # Build the lookup set once per row, not once per headline n-gram.
                body_lookup = set(body_grams)
                overlap_counts.append(sum(1. for w in head_grams if w in body_lookup))
            df[overlap_col] = overlap_counts
            df["ratio_of_Headline_%s_in_articleBody" % gram] = list(
                map(try_divide, df[overlap_col], df["count_of_Headline_%s" % gram]))

        # Number of sentences in headline and body.
        for feat_name in feat_names:
            df['len_sent_%s' % feat_name] = df[feat_name].apply(lambda x: len(sent_tokenize(x)))

        # Collect every column that looks like a counting feature; the
        # refuting-word columns are appended to this list below.
        feat_names = [n for n in df.columns if "count" in n or "ratio" in n or "len_sent" in n]

        # Binary refuting features.
        _refuting_words = [
            'fake',
            'fraud',
            'hoax',
            'false',
            'deny', 'denies',
            # 'refute',
            'not',
            'despite',
            'nope',
            'doubt', 'doubts',
            'bogus',
            'debunk',
            'pranks',
            'retract'
        ]

        for rf in _refuting_words:
            fname = '%s_exist' % rf
            feat_names.append(fname)
            # NOTE(review): ``in`` is a containment test on the raw headline
            # value; if headlines are plain strings this is a case-sensitive
            # substring match (so 'not' would also hit 'nothing') -- confirm
            # this is intended.
            df[fname] = [1 if rf in headline else 0 for headline in df['Headline']]

        print('BasicCountFeatures:')
        print(df)

        # Rows with a non-null target form the training set.
        train = df[~df['target'].isnull()]
        print('train:')
        print(train[['Headline_unigram','Body ID', 'count_of_Headline_unigram']])
        xBasicCountsTrain = train[feat_names].values
        outfilename_bcf_train = "train.basic.pkl"
        self._dump_features(outfilename_bcf_train, feat_names, xBasicCountsTrain)
        print('basic counting features for training saved in %s' % outfilename_bcf_train)

        # Rows with a null target form the test set.
        test = df[df['target'].isnull()]
        print('test:')
        print(test[['Headline_unigram','Body ID', 'count_of_Headline_unigram']])
        if test.shape[0] > 0:
            # test set exists
            print('saving test set')
            xBasicCountsTest = test[feat_names].values
            outfilename_bcf_test = "test.basic.pkl"
            self._dump_features(outfilename_bcf_test, feat_names, xBasicCountsTest)
            print('basic counting features for test saved in %s' % outfilename_bcf_test)

    def read(self, header='train'):
        """Load the pickled counting features written by ``process``.

        Args:
            header: File prefix; the features are read from
                ``"<header>.basic.pkl"``.

        Returns:
            A one-element list containing the loaded feature matrix.

        Side effects:
            Prints the feature names and the matrix shape, and saves the
            returned list with ``np.save`` under the name ``counts_test``.
        """
        filename_bcf = "%s.basic.pkl" % header
        # NOTE: unpickling can execute arbitrary code; only read files that
        # were produced by ``process`` from a trusted run.
        with open(filename_bcf, "rb") as infile:
            feat_names = pickle.load(infile)
            xBasicCounts = pickle.load(infile)

        print('feature names: ')
        print(feat_names)
        print('xBasicCounts.shape:')
        print(xBasicCounts.shape)
        # NOTE(review): the output name is fixed to 'counts_test' whatever
        # ``header`` is, so read('train') overwrites the same file -- confirm
        # this is intended. ``np`` is not imported explicitly in this file;
        # presumably it arrives through one of the star imports.
        np.save('counts_test', [xBasicCounts])
        return [xBasicCounts]
if __name__ == '__main__':
    # Script entry point: load the previously pickled test-set counting
    # features; ``read`` prints their names and shape as it goes.
    generator = CountFeatureGenerator()
    generator.read('test')



feature names: 
['count_of_Headline_unigram', 'count_of_unique_Headline_unigram', 'ratio_of_unique_Headline_unigram', 'count_of_Headline_bigram', 'count_of_unique_Headline_bigram', 'ratio_of_unique_Headline_bigram', 'count_of_Headline_trigram', 'count_of_unique_Headline_trigram', 'ratio_of_unique_Headline_trigram', 'count_of_articleBody_unigram', 'count_of_unique_articleBody_unigram', 'ratio_of_unique_articleBody_unigram', 'count_of_articleBody_bigram', 'count_of_unique_articleBody_bigram', 'ratio_of_unique_articleBody_bigram', 'count_of_articleBody_trigram', 'count_of_unique_articleBody_trigram', 'ratio_of_unique_articleBody_trigram', 'count_of_Headline_unigram_in_articleBody', 'ratio_of_Headline_unigram_in_articleBody', 'count_of_Headline_bigram_in_articleBody', 'ratio_of_Headline_bigram_in_articleBody', 'count_of_Headline_trigram_in_articleBody', 'ratio_of_Headline_trigram_in_articleBody', 'len_sent_Headline', 'len_sent_articleBody', 'fake_exist', 'fraud_exist', 'hoax_exist', 'false