In [201]:
import re
import nltk
import math
import csv
import numpy as np
#import feature_matrix
#import tensorflow
import string
from itertools import chain
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from nltk import word_tokenize, pos_tag
import RandomTree as rt
from sklearn.ensemble import RandomForestClassifier
from keras.preprocessing.text import one_hot
from sklearn import svm
from sklearn.svm import SVC
from sklearn import tree
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import LabelBinarizer
%run /home/rashi/Desktop/Fakenews/utils.ipynb


In [202]:
EN_STOPWORDS = stopwords.words('english') + list(string.punctuation)
def signatures(ambiguous_word, pos=None,
               hyperhypo=True, adapted=False,
               remove_stopwords=True, to_lemmatize=True):
    # Ensure that the POS is supported.
    pos = pos if pos in ['a', 'r', 's', 'n', 'v', None] else None
    # Holds the synset->signature dictionary.
    synsets_signatures = {}
    for ss in wn.synsets(ambiguous_word, pos=pos):
        signature = []
        # Adds the definition, example sentences and lemma_names.
        signature += word_tokenize(ss.definition())
        signature += chain(*[word_tokenize(eg) for eg in ss.examples()])
        signature += ss.lemma_names()
        # Optional: includes lemma_names of hyper-/hyponyms.
        if hyperhypo:
            hyperhypo = set(ss.hyponyms() + ss.hypernyms() + ss.instance_hyponyms() + ss.instance_hypernyms())
            signature += set(chain(*[i.lemma_names() for i in hyperhypo]))
        # Optional: Includes signatures from related senses as in Adapted Lesk.
        if adapted:
            # Includes lemma_names from holonyms, meronyms and similar_tos
            related_senses = set(ss.member_holonyms() + ss.part_holonyms() + ss.substance_holonyms() +
                                 ss.member_meronyms() + ss.part_meronyms() + ss.substance_meronyms() +
                                 ss.similar_tos())
            signature += set(chain(*[i.lemma_names() for i in related_senses]))
        # Optional: removes stopwords.
        if remove_stopwords:
            signature = [i for i in signature if i not in EN_STOPWORDS]
        # Lemmatized context is preferred over stemmed context.
        if to_lemmatize:
            signature = [lemmatize(i) for i in signature]
        synsets_signatures[ss] = signature
    return synsets_signatures

In [203]:
def compare_overlaps_greedy(context, synsets_signatures):
    """
    Calculate overlaps between the context sentence and the synset_signature
    and returns the synset with the highest overlap.
    Note: Greedy algorithm only keeps the best sense,
    see https://en.wikipedia.org/wiki/Greedy_algorithm
    Only used by original_lesk(). Keeping greedy algorithm for documentary sake,
    because original_lesks is greedy.
    """
    max_overlaps = 0; lesk_sense = None
    for ss in synsets_signatures:
        overlaps = set(synsets_signatures[ss]).intersection(context)
        if len(overlaps) > max_overlaps:
            lesk_sense = ss
            max_overlaps = len(overlaps)
    return lesk_sense


In [204]:
def compare_overlaps(context, synsets_signatures, \
                     nbest=False, keepscore=False, normalizescore=False):
    """
    Calculates overlaps between the context sentence and the synset_signture
    and returns a ranked list of synsets from highest overlap to lowest.
    """
    overlaplen_synsets = [] # a tuple of (len(overlap), synset).
    for ss in synsets_signatures:
        overlaps = set(synsets_signatures[ss]).intersection(context)
        overlaplen_synsets.append((len(overlaps), ss))

    # Rank synsets from highest to lowest overlap.
    ranked_synsets = sorted(overlaplen_synsets, reverse=True)

    # Normalize scores such that it's between 0 to 1.
    if normalizescore:
        total = float(sum(i[0] for i in ranked_synsets))
        ranked_synsets = [(i/total,j) for i,j in ranked_synsets]

    if not keepscore: # Returns a list of ranked synsets without scores
        ranked_synsets = [i[1] for i in sorted(overlaplen_synsets, \
                                               reverse=True)]

    if nbest: # Returns a ranked list of synsets.
        return ranked_synsets
    else: # Returns only the best sense.
        return ranked_synsets[1]

In [205]:
def simple_signature(ambiguous_word, pos=None, lemma=True, stem=False, \
                     hyperhypo=True, stop=True):
    """
    Returns a synsets_signatures dictionary that includes signature words of a
    sense from its:
    (i)   definition
    (ii)  example sentences
    (iii) hypernyms and hyponyms
    """
    synsets_signatures = {}
    for ss in wn.synsets(ambiguous_word):
        try: # If POS is specified.
            if pos and str(ss.pos()) != pos:
                continue
        except:
            if pos and str(ss.pos) != pos:
                continue
        signature = []
        # Includes definition.
        ss_definition = synset_properties(ss, 'definition')
        signature+=ss_definition
        # Includes examples
        ss_examples = synset_properties(ss, 'examples')
        signature+=list(chain(*[i.split() for i in ss_examples]))
        # Includes lemma_names.
        ss_lemma_names = synset_properties(ss, 'lemma_names')
        signature+= ss_lemma_names

        # Optional: includes lemma_names of hypernyms and hyponyms.
        if hyperhypo == True:
            ss_hyponyms = synset_properties(ss, 'hyponyms')
            ss_hypernyms = synset_properties(ss, 'hypernyms')
            ss_hypohypernyms = ss_hypernyms+ss_hyponyms
            signature+= list(chain(*[i.lemma_names() for i in ss_hypohypernyms]))

        # Optional: removes stopwords.
        if stop == True:
            signature = [i for i in signature if i not in EN_STOPWORDS]
        # Lemmatized context is preferred over stemmed context.
        if lemma == True:
            signature = [lemmatize(i) for i in signature]
        # Matching exact words may cause sparsity, so optional matching for stems.
        if stem == True:
            signature = [porter.stem(i) for i in signature]
        synsets_signatures[ss] = signature

    return synsets_signatures

In [206]:
def adapted_lesk(context_sentence, ambiguous_word, \
                pos=None, lemma=True, stem=True, hyperhypo=True, \
                stop=True, context_is_lemmatized=False, \
                nbest=False, keepscore=False, normalizescore=False):
    """
    This function is the implementation of the Adapted Lesk algorithm,
    described in Banerjee and Pederson (2002). It makes use of the lexical
    items from semantically related senses within the wordnet
    hierarchies and to generate more lexical items for each sense.
    see www.d.umn.edu/~tpederse/Pubs/cicling2002-b.pdf‎
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    # Get the signatures for each synset.
    ss_sign = simple_signature(ambiguous_word, pos, lemma, stem, hyperhypo)
    signature = []
    tt = ""
    for ss in ss_sign:
        # Includes holonyms.
        ss_mem_holonyms = synset_properties(ss, 'member_holonyms')
        ss_part_holonyms = synset_properties(ss, 'part_holonyms')
        ss_sub_holonyms = synset_properties(ss, 'substance_holonyms')
        # Includes meronyms.
        ss_mem_meronyms = synset_properties(ss, 'member_meronyms')
        ss_part_meronyms = synset_properties(ss, 'part_meronyms')
        ss_sub_meronyms = synset_properties(ss, 'substance_meronyms')
        # Includes similar_tos
        ss_simto = synset_properties(ss, 'similar_tos')

        related_senses = list(set(ss_mem_holonyms+ss_part_holonyms+
                                  ss_sub_holonyms+ss_mem_meronyms+
                                  ss_part_meronyms+ss_sub_meronyms+ ss_simto))

        signature = list([j for j in chain(*[synset_properties(i, 'lemma_names')
                                             for i in related_senses])
                          if j not in EN_STOPWORDS])
        tt = ss

    # Lemmatized context is preferred over stemmed context
    if lemma == True:
        signature = [lemmatize(i) for i in signature]
    # Matching exact words causes sparsity, so optional matching for stems.
    if stem == True:
        signature = [porter.stem(i) for i in signature]
    # Adds the extended signature to the simple signatures.
    ss_sign[tt]+=signature

    # Disambiguate the sense in context.
    if context_is_lemmatized:
        context_sentence = context_sentence.split()
    else:
        context_sentence = lemmatize_sentence(context_sentence)
    best_sense = compare_overlaps(context_sentence, ss_sign, \
                                    nbest=nbest, keepscore=keepscore, \
                                    normalizescore=normalizescore)
    return best_sense

In [207]:
#Function to extract adjectives from the contents. Return a list of strings
#Need nltk library: import nltk
def extract_adjective(sentences):
    adj_sentences = list()
    for sentence in sentences:
        words = nltk.word_tokenize(sentence)
        adj_tags = nltk.pos_tag(words)
        one_adj_sentence = ""
        for index, tag in enumerate(adj_tags, start = 0):
            one_tag = tag[1]
            if one_tag in ['JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS']:
                sense = adapted_lesk(sentence, words[index], one_tag)
                #print('\n\n')
                #print(sense)
                if not sense:
                    one_adj_sentence += words[index]
                if sense:
                    one_adj_sentence += sense.lemmas()[0].name()
                    #print(sense.lemmas()[0].name())
                one_adj_sentence += " "
        adj_sentences.append(one_adj_sentence)
        #print(one_adj_sentence)
    return adj_sentences

In [208]:
def clean_sentence(s):
    c = s.lower().strip()
    return re.sub('[^a-z ]', '', c)

In [209]:
def encode_words(sentences):
    encoded = list()
    for sentence in sentences:
        words = list()
        mapping = list()
        words = nltk.word_tokenize(sentence)
        for word in words:
            mapping.append(one_hot(word,10000)[0])
        encoded.append(mapping);
    return encoded

In [210]:
def append_with_max_len(encoded,length):
    for e in encoded:
        for count in range(len(e), length):
            e.append(0)
    return encoded;

In [211]:
def set_label(fake_size, real_size):
    label = list()
    for counter in range(0,fake_size):
        label.append(0)
    for counter in range(0,real_size):
        label.append(1)
    return label

In [212]:
train_news = list()
fake_size = 0
real_size = 0
with open('./data/titles/fake_news_training.txt') as train1:
    with open('./data/titles/real_news_training.txt') as train2:
        for line in train1:
            train_news.append(clean_sentence(line))
            fake_size = fake_size+1
        for line in train2:
            train_news.append(clean_sentence(line))
            real_size = real_size+1
predict_news = list()
count = 0
count1 = 0
with open('./data/real2.txt') as predict1:
    with open('./data/fake2.txt') as predict2:
        for line in predict1:
            predict_news.append(clean_sentence(line))
            count = count+1
        for line in predict2:
            predict_news.append(clean_sentence(line))
            count1 = count1+1
words = encode_words(extract_adjective(train_news))    
#print(words)

#print("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa")
#print(count)

#print(count1)
labels = set_label(fake_size, real_size)
#converted = convert_matrix(words)
#print(len(converted))
appended = append_with_max_len(words,len(max(words, key=len)))
#print(appended)
predict_words = encode_words(predict_news)
classif = OneVsRestClassifier(estimator=SVC(random_state=0))
#classif = tree.DecisionTreeClassifier()
#classif = RandomForestClassifier(max_depth=2, random_state=0)
predict_words = encode_words(extract_adjective(predict_news))
item = classif.fit(appended, labels)
#print(item)
item.predict(append_with_max_len(predict_words,len(max(words, key=len))))

KeyError: ''