In [2]:
import numpy as np
import nltk
from nltk.corpus import stopwords
import re
from copy import deepcopy
import gensim
import os
from keras.preprocessing.sequence import pad_sequences
import json
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [3]:
def str_to_wordlist(tweettext, tweet, remove_stopwords=False):
    str_text = re.sub("[^a-zA-Z]", " ", tweettext)
    words = nltk.word_tokenize(str_text.lower())
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    return(words)

def loadW2vModel():
    # LOAD PRETRAINED MODEL
    global model_GN
    print ("Loading the model")
    model_GN = gensim.models.KeyedVectors.load_word2vec_format(
                    '/content/drive/MyDrive/CAPSTONE PROJECT - PRAJNA SIRISHA SUKANYA/code/Stance_Veracity/GoogleNews-vectors-negative300.bin', binary=True)
    print ("Done!")

def sumw2v(tweet, avg=True):
    global model_GN
    model = model_GN
    num_features = 300
    temp_rep = np.zeros(num_features)
    wordlist = str_to_wordlist(tweet['text'], tweet, remove_stopwords=False)
    for w in range(len(wordlist)):
        if wordlist[w] in model:
            temp_rep += model[wordlist[w]]
    if avg and len(wordlist) != 0:
        sumw2v = temp_rep/len(wordlist)
    else:
        sumw2v = temp_rep
    return sumw2v

def getW2vCosineSimilarity(words, wordssrc):
    global model_GN
    model = model_GN
    words2 = []
    for word in words:
        if word in model.wv.vocab:  # change to model.wv.vocab
            words2.append(word)
    wordssrc2 = []
    for word in wordssrc:
        if word in model.wv.vocab:  # change to model.wv.vocab
            wordssrc2.append(word)
    if len(words2) > 0 and len(wordssrc2) > 0:
        return model.n_similarity(words2, wordssrc2)
    return 0

In [4]:
from copy import deepcopy


def tree2branches(root):
    node = root
    parent_tracker = []
    parent_tracker.append(root)
    branch = []
    branches = []
    i = 0
    siblings = None
    while True:
        node_name = list(node.keys())[i]
        branch.append(node_name)
        # get children of the node
        # actually all chldren, all tree left under this node
        first_child = list(node.values())[i]
        if first_child != []:  # if node has children
            node = first_child  # walk down
            parent_tracker.append(node)
            siblings = list(first_child.keys())
            i = 0  # index of a current node
        else:
            branches.append(deepcopy(branch))
            if siblings is not None:
                i = siblings.index(node_name)  # index of a current node
                # if node doesnt have next siblings
                while i+1 >= len(siblings):
                    if node is parent_tracker[0]:  # if it is a root node
                        return branches
                    del parent_tracker[-1]
                    del branch[-1]
                    node = parent_tracker[-1]  # walk up ... one step
                    node_name = branch[-1]
                    siblings = list(node.keys())
                    i = siblings.index(node_name)
                i = i+1    # ... walk right
    #            node =  parent_tracker[-1].values()[i]
                del branch[-1]
            else:
                return branches

In [5]:
def transform_feature_dict(thread_feature_dict, conversation, feature_set):
    thread_features_array = []
    # thread_stance_labels = []
    clean_branches = []

    branches = conversation['branches']

    for branch in branches:
        branch_rep = []
        clb = []
        # branch_stance_lab = []
        for twid in branch:
            if twid in thread_feature_dict.keys():
                tweet_rep = dict_to_array(thread_feature_dict[twid],
                                          feature_set)
                branch_rep.append(tweet_rep)

                if twid == branch[0]:
                    # if 'label' in list(conversation['source'].keys()):
                    #     branch_stance_lab.append(convert_label(
                    #                         conversation['source']['label']))
                    clb.append(twid)
                else:
                    for r in conversation['replies']:
                        if r['id_str'] == twid:
                            # if 'label' in list(r.keys()):

                            #     branch_stance_lab.append(
                            #                         convert_label(r['label']))
                            clb.append(twid)
        if branch_rep != []:
            branch_rep = np.asarray(branch_rep)
            # branch_stance_lab = np.asarray(branch_stance_lab)
            thread_features_array.append(branch_rep)
            # thread_stance_labels.append(branch_stance_lab)
            clean_branches.append(clb)
     
    return thread_features_array, clean_branches

#%%
def dict_to_array(feature_dict, feature_set):

    tweet_rep = []
    for feature_name in feature_set:

        if np.isscalar(feature_dict[feature_name]):
            tweet_rep.append(feature_dict[feature_name])
        else:
            tweet_rep.extend(feature_dict[feature_name])
    tweet_rep = np.asarray(tweet_rep)
    return tweet_rep

In [6]:
"""
Function in this file extracts features from conversations(threads) and returns
dictionary with features
"""

def extract_thread_features(conversation):
    feature_dict = {}

    tw = conversation['src']
    print(tw)
    tokens = nltk.word_tokenize(re.sub(r'([^\s\w]|_)+', '',
                                       tw['text'].lower()))

    otherthreadtweets = ''
    for response in conversation['replies']:
        
        otherthreadtweets += ' ' + response['text']
            
    otherthreadtokens = nltk.word_tokenize(re.sub(r'([^\s\w]|_)+', '',
                                                  otherthreadtweets.lower()))


    raw_txt = tw['text']
    feature_dict['hasqmark'] = 0
    if tw['text'].find('?') >= 0:
        feature_dict['hasqmark'] = 1
    feature_dict['hasemark'] = 0
    if tw['text'].find('!') >= 0:
        feature_dict['hasemark'] = 1
    feature_dict['hasperiod'] = 0
    if tw['text'].find('.') >= 0:
        feature_dict['hasperiod'] = 1
    feature_dict['hashashtag'] = 0
    if tw['text'].find('#') >= 0:
        feature_dict['hashashtag'] = 1
    feature_dict['hasurl'] = 0
    if tw['text'].find('urlurlurl') >= 0 or tw['text'].find('http') >= 0:
        feature_dict['hasurl'] = 1
    feature_dict['haspic'] = 0
    if (tw['text'].find('picpicpic') >= 0) or (
            tw['text'].find('pic.twitter.com') >= 0) or (
            tw['text'].find('instagr.am') >= 0):
        feature_dict['haspic'] = 1
    feature_dict['hasnegation'] = 0
    negationwords = ['not', 'no', 'nobody', 'nothing', 'none', 'never',
                     'neither', 'nor', 'nowhere', 'hardly', 'scarcely',
                     'barely', 'don', 'isn', 'wasn', 'shouldn', 'wouldn',
                     'couldn', 'doesn']
    for negationword in negationwords:
        if negationword in tokens:
            feature_dict['hasnegation'] += 1
    feature_dict['charcount'] = len(tw['text'])
    feature_dict['wordcount'] = len(nltk.word_tokenize(re.sub(r'([^\s\w]|_)+',
                                    '', tw['text'].lower())))
    swearwords = []
    with open('/content/drive/MyDrive/CAPSTONE PROJECT - PRAJNA SIRISHA SUKANYA/code/badwords.txt', 'r') as f:
        for line in f:
            swearwords.append(line.strip().lower())
    feature_dict['hasswearwords'] = 0
    for token in tokens:
        if token in swearwords:
            feature_dict['hasswearwords'] += 1
    uppers = [l for l in raw_txt if l.isupper()]
    feature_dict['capitalratio'] = float(len(uppers))/len(raw_txt)
    feature_dict['Word2VecSimilarityWrtOther'] = getW2vCosineSimilarity(
                                                 tokens, otherthreadtokens)
    feature_dict['avgw2v'] = sumw2v(tw, avg=True)
    postag_tuples = nltk.pos_tag(tokens)
    postag_list = [x[1] for x in postag_tuples]
    possible_postags = ['WRB', 'WP$', 'WP', 'WDT', 'VBZ', 'VBP', 'VBN', 'VBG',
                        'VBD', 'VB', 'UH', 'TO', 'SYM', 'RP', 'RBS', 'RBR',
                        'RB', 'PRP$', 'PRP',  'POS', 'PDT', 'NNS', 'NNPS',
                        'NNP', 'NN', 'MD', 'LS', 'JJS', 'JJR', 'JJ', 'IN',
                        'FW', 'EX', 'DT', 'CD', 'CC']
    postag_binary = np.zeros(len(possible_postags))
    for tok in postag_list:
        postag_binary[possible_postags.index(tok)] = 1
    feature_dict['pos'] = postag_binary
    false_synonyms = ['false',  'bogus',  'deceitful',  'dishonest',
                      'distorted',  'erroneous',  'fake', 'fanciful',
                      'faulty',  'fictitious',  'fraudulent',
                      'improper',  'inaccurate',  'incorrect',
                      'invalid', 'misleading', 'mistaken', 'phony',
                      'specious', 'spurious', 'unfounded', 'unreal',
                      'untrue',  'untruthful',  'apocryphal',  'beguiling',
                      'casuistic',  'concocted', 'cooked-up',
                      'counterfactual', 'deceiving', 'delusive', 'ersatz',
                      'fallacious', 'fishy',  'illusive',  'imaginary',
                      'inexact',  'lying',  'mendacious',
                      'misrepresentative', 'off the mark', 'sham',
                      'sophistical', 'trumped up', 'unsound']
    false_antonyms = ['accurate', 'authentic', 'correct', 'fair', 'faithful',
                      'frank', 'genuine', 'honest', 'moral', 'open', 'proven',
                      'real', 'right', 'sincere', 'sound', 'true',
                      'trustworthy', 'truthful', 'valid', 'actual', 'factual',
                      'just', 'known', 'precise', 'reliable', 'straight',
                      'substantiated']
    feature_dict['src_num_false_synonyms'] = 0
    for token in tokens:
        if token in false_synonyms:
            feature_dict['src_num_false_synonyms'] += 1
    feature_dict['src_num_false_antonyms'] = 0
    for token in tokens:
        if token in false_antonyms:
            feature_dict['src_num_false_antonyms'] += 1
    feature_dict['thread_num_false_synonyms'] = 0
    for token in otherthreadtokens:
        if token in false_synonyms:
            feature_dict['thread_num_false_synonyms'] += 1
    feature_dict['thread_num_false_antonyms'] = 0
    for token in otherthreadtokens:
        if token in false_antonyms:
            feature_dict['thread_num_false_antonyms'] += 1
    feature_dict['src_unconfirmed'] = 0
    feature_dict['src_rumour'] = 0
    feature_dict['thread_unconfirmed'] = 0
    feature_dict['thread_rumour'] = 0
    if 'unconfirmed' in tokens:
        feature_dict['src_unconfirmed'] = 1
    if 'unconfirmed' in otherthreadtokens:
        feature_dict['thread_unconfirmed'] = 1
    if 'rumour' in tokens or 'gossip' in tokens or 'hoax' in tokens:
        feature_dict['src_rumour'] = 1
    if ('rumour' in otherthreadtokens) or ('gossip' in otherthreadtokens) or (
                                                'hoax' in otherthreadtokens):
        feature_dict['thread_rumour'] = 1
    whwords = ['what', 'when', 'where', 'which', 'who', 'whom', 'whose', 'why',
               'how']
    feature_dict['src_num_wh'] = 0
    for token in tokens:
        if token in whwords:
            feature_dict['src_num_wh'] += 1
    feature_dict['thread_num_wh'] = 0
    for token in otherthreadtokens:
        if token in whwords:
            feature_dict['thread_num_wh'] += 1
    SpeechAct = {}
    SpeechAct['SpeechAct_ORDER'] = ['command', 'demand', 'tell', 'direct',
                                    'instruct', 'require', 'prescribe',
                                    'order']
    SpeechAct['SpeechAct_ASK1'] = ['ask', 'request', 'beg', 'bespeech',
                                   'implore',
                                   'appeal', 'plead', 'intercede', 'apply',
                                   'urge', 'persuade', 'dissuade', 'convince']
    SpeechAct['SpeechAct_ASK2'] = ['ask', 'inquire', 'enquire', 'interrogate',
                                   'question', 'query']
    SpeechAct['SpeechAct_CALL'] = ['call', 'summon', 'invite', 'call on',
                                   'call for', 'order', 'book', 'reserve']
    SpeechAct['SpeechAct_FORBID'] = ['forbid', 'prohibit', 'veto', 'refuse',
                                     'decline', 'reject', 'rebuff', 'renounce',
                                     'cancel', 'resign', 'dismiss']
    SpeechAct['SpeechAct_PERMIT'] = ['permit', 'allow', 'consent', 'accept',
                                     'agree', 'approve', 'disapprove',
                                     'authorize', 'appoint']
    SpeechAct['SpeechAct_ARGUE'] = ['argue', 'disagree', 'refute', 'contradict',
                                    'counter', 'deny', 'recant', 'retort',
                                    'quarrel']
    SpeechAct['SpeechAct_REPRIMAND'] = ['reprimand', 'rebuke', 'reprove',
                                        'admonish', 'reproach', 'nag',
                                        'scold', 'abuse', 'insult']
    SpeechAct['SpeechAct_MOCK'] = ['ridicule', 'joke']
    SpeechAct['SpeechAct_BLAME'] = ['blame', 'criticize', 'condemn',
                                    'denounce', 'deplore', 'curse']
    SpeechAct['SpeechAct_ACCUSE'] = ['accuse', 'charge', 'challenge', 'defy',
                                     'dare']
    SpeechAct['SpeechAct_ATTACK'] = ['attack', 'defend']
    SpeechAct['SpeechAct_WARN '] = ['warn', 'threaten', 'blackmail']
    SpeechAct['SpeechAct_ADVISE '] = ['advise', 'councel', 'consult',
                                      'recommend', 'suggest', 'propose',
                                      'advocate']
    SpeechAct['SpeechAct_OFFER '] = ['offer', 'volunteer', 'grant', 'give']
    SpeechAct['SpeechAct_PRAISE '] = ['praise', 'commend', 'compliment',
                                      'boast', 'credit']
    SpeechAct['SpeechAct_PROMISE '] = ['promise', 'pledge', 'vow', 'swear',
                                       'vouch for', 'guarante']
    SpeechAct['SpeechAct_THANK '] = ['thank', 'apologise', 'greet', 'welcome',
                                     'farewell', 'goodbye', 'introduce',
                                     'bless', 'wish', 'congratulate']
    SpeechAct['SpeechAct_FORGIVE '] = ['forgive', 'excuse', 'justify',
                                       'absolve', 'pardon', 'convict',
                                       'acquit', 'sentence']
    SpeechAct['SpeechAct_COMPLAIN'] = ['complain', 'protest', 'object',
                                       'moan', 'bemoan', 'lament', 'bewail']
    SpeechAct['SpeechAct_EXCLAIM'] = ['exclaim', 'enthuse', 'exult', 'swear',
                                      'blaspheme']
    SpeechAct['SpeechAct_GUESS'] = ['guess', 'bet', 'presume', 'suspect',
                                    'suppose', 'wonder', 'speculate',
                                    'conjecture', 'predict', 'forecast',
                                    'prophesy']
    SpeechAct['SpeechAct_HINT'] = ['hint', 'imply', 'insinuate']
    SpeechAct['SpeechAct_CONCLUDE'] = ['conclude', 'deduce', 'infer', 'gather',
                                       'reckon', 'estimate', 'calculate',
                                       'count', 'prove', 'compare']
    SpeechAct['SpeechAct_TELL'] = ['tell', 'report', 'narrate', 'relate',
                                   'recount', 'describe', 'explain', 'lecture']
    SpeechAct['SpeechAct_INFORM'] = ['inform', 'notify', 'announce',
                                     'inform on', 'reveal']
    SpeechAct['SpeechAct_SUMUP'] = ['sum up', 'summarize', 'recapitulate']
    SpeechAct['SpeechAct_ADMIT'] = ['admit', 'acknowledge', 'concede',
                                    'confess', 'confide']
    SpeechAct['SpeechAct_ASSERT'] = ['assert', 'affirm', 'claim', 'maintain',
                                     'contend', 'state', 'testify']
    SpeechAct['SpeechAct_CONFIRM'] = ['confirm', 'assure', 'reassure']
    SpeechAct['SpeechAct_STRESS'] = ['stress', 'emphasize', 'insist', 'repeat',
                                     'point out', 'note', 'remind', 'add']
    SpeechAct['SpeechAct_DECLARE'] = ['declare', 'pronounce', 'proclaim',
                                      'decree', 'profess', 'vote', 'resolve',
                                      'decide']
    SpeechAct['SpeechAct_BAPTIZE'] = ['baptize', 'chirsten', 'name',
                                      'excommunicate']
    SpeechAct['SpeechAct_REMARK'] = ['remark', 'comment', 'observe']
    SpeechAct['SpeechAct_ANSWER'] = ['answer', 'reply']
    SpeechAct['SpeechAct_DISCUSS'] = ['discuss', 'debate', 'negotiate',
                                      'bargain']
    SpeechAct['SpeechAct_TALK'] = ['talk', 'converse', 'chat', 'gossip']
    for k in SpeechAct.keys():
        feature_dict[k] = 0
        for verb in SpeechAct[k]:
            if verb in tw['text'].lower():
                feature_dict[k] += 1
    for k in SpeechAct.keys():
        feature_dict['thread_'+k] = 0
        for verb in SpeechAct[k]:
            for resptw in conversation['replies']:
                if verb in resptw['text'].lower():
                    feature_dict['thread_'+k] += 1

    return feature_dict


def extract_thread_features_incl_response(conversation):
    source_features = extract_thread_features(conversation)
    source_features['issource'] = 1
    source_features['Word2VecSimilarityWrtSource'] = 0
    source_features['Word2VecSimilarityWrtPrev'] = 0
    srctokens = nltk.word_tokenize(re.sub(
                                   r'([^\s\w]|_)+', '',
                                   conversation['src']['text'].lower()))
    fullthread_featdict = {}
    fullthread_featdict[conversation['src']['id_str']] = source_features
    
    for tw in conversation['replies']:
        feature_dict = {}
        feature_dict['issource'] = 0
        tokens = nltk.word_tokenize(re.sub(r'([^\s\w]|_)+', '',
                                           tw['text'].lower()))
        otherthreadtweets = ''
        otherthreadtweets += conversation['src']['text'] 
   
            
        for response in conversation['replies']:
            otherthreadtweets += ' ' + response['text']
                
        otherthreadtokens = nltk.word_tokenize(re.sub(
                                               r'([^\s\w]|_)+', '',
                                               otherthreadtweets.lower()))
        branches = tree2branches(conversation['structure'])
        for branch in branches:
            if tw['id_str'] in branch:
                if branch.index(tw['id_str'])-1 == 0:
                    prevtokens = srctokens
                else:
                    prev_id = branch[branch.index(tw['id_str'])-1]
                    for ptw in conversation['replies']:
                        if ptw['id_str'] == prev_id:
                            prevtokens = nltk.word_tokenize(re.sub(
                                                       r'([^\s\w]|_)+',
                                                       '',
                                                       ptw['text'].lower()))
                            break
            else:
                prevtokens = []
            break
        raw_txt = tw['text']
        feature_dict['hasqmark'] = 0
        if tw['text'].find('?') >= 0:
            feature_dict['hasqmark'] = 1
        feature_dict['hasemark'] = 0
        if tw['text'].find('!') >= 0:
            feature_dict['hasemark'] = 1
        feature_dict['hasperiod'] = 0
        if tw['text'].find('.') >= 0:
            feature_dict['hasperiod'] = 1
        feature_dict['hashashtag'] = 0
        if tw['text'].find('#') >= 0:
            feature_dict['hashashtag'] = 1
        feature_dict['hasurl'] = 0
        if tw['text'].find('urlurlurl') >= 0 or tw['text'].find('http') >= 0:
            feature_dict['hasurl'] = 1
        feature_dict['haspic'] = 0
        if (tw['text'].find('picpicpic') >= 0) or (
                tw['text'].find('pic.twitter.com') >= 0) or (
                tw['text'].find('instagr.am') >= 0):
                feature_dict['haspic'] = 1
        feature_dict['hasnegation'] = 0
        negationwords = ['not', 'no', 'nobody', 'nothing', 'none', 'never',
                         'neither', 'nor', 'nowhere', 'hardly', 'scarcely',
                         'barely', 'don', 'isn', 'wasn', 'shouldn', 'wouldn',
                         'couldn', 'doesn']
        for negationword in negationwords:
            if negationword in tokens:
                feature_dict['hasnegation'] += 1
        feature_dict['charcount'] = len(tw['text'])
        feature_dict['wordcount'] = len(nltk.word_tokenize(re.sub(
                                                         r'([^\s\w]|_)+',
                                                         '',
                                                         tw['text'].lower())))
        swearwords = []
        with open('/content/drive/MyDrive/CAPSTONE PROJECT - PRAJNA SIRISHA SUKANYA/code/badwords.txt', 'r') as f:
            for line in f:
                swearwords.append(line.strip().lower())
        feature_dict['hasswearwords'] = 0
        for token in tokens:
            if token in swearwords:
                feature_dict['hasswearwords'] += 1
        uppers = [l for l in raw_txt if l.isupper()]
        
        l = len(raw_txt)
        if l!=0:
            feature_dict['capitalratio'] = float(len(uppers))/l
        else:
            feature_dict['capitalratio'] = 0
        feature_dict['Word2VecSimilarityWrtOther'] = getW2vCosineSimilarity(
                                                     tokens, otherthreadtokens)
        feature_dict['Word2VecSimilarityWrtSource'] = getW2vCosineSimilarity(
                                                             tokens, srctokens)
        feature_dict['Word2VecSimilarityWrtPrev'] = getW2vCosineSimilarity(
                                                            tokens, prevtokens)
        feature_dict['avgw2v'] =sumw2v(
                                                        tw,
                                                        avg=True)

        feature_dict['src_usr_hasurl'] = 0

        postag_tuples = nltk.pos_tag(tokens)
        postag_list = [x[1] for x in postag_tuples]
        possible_postags = ['WRB', 'WP$', 'WP', 'WDT', 'VBZ', 'VBP', 'VBN',
                            'VBG', 'VBD', 'VB', 'UH', 'TO', 'SYM', 'RP', 'RBS',
                            'RBR', 'RB', 'PRP$', 'PRP',  'POS', 'PDT', 'NNS',
                            'NNPS', 'NNP', 'NN', 'MD', 'LS', 'JJS', 'JJR',
                            'JJ', 'IN', 'FW', 'EX', 'DT', 'CD', 'CC', '$']
        postag_binary = np.zeros(len(possible_postags))
        for tok in postag_list:
            if tok in possible_postags:
                postag_binary[possible_postags.index(tok)] = 1
        feature_dict['pos'] = postag_binary
        false_synonyms = ['false', 'bogus', 'deceitful', 'dishonest',
                          'distorted', 'erroneous', 'fake', 'fanciful',
                          'faulty', 'fictitious', 'fraudulent',
                          'improper', 'inaccurate', 'incorrect',
                          'invalid', 'misleading', 'mistaken', 'phony',
                          'specious', 'spurious', 'unfounded', 'unreal',
                          'untrue',  'untruthful', 'apocryphal',
                          'beguiling', 'casuistic', 'concocted',
                          'cooked-up', 'counterfactual',
                          'deceiving', 'delusive', 'ersatz',
                          'fallacious', 'fishy', 'illusive', 'imaginary',
                          'inexact', 'lying', 'mendacious',
                          'misrepresentative', 'off the mark', 'sham',
                          'sophistical', 'trumped up', 'unsound']
        false_antonyms = ['accurate', 'authentic', 'correct', 'fair',
                          'faithful', 'frank', 'genuine', 'honest', 'moral',
                          'open', 'proven', 'real', 'right', 'sincere',
                          'sound', 'true', 'trustworthy', 'truthful',
                          'valid', 'actual', 'factual', 'just', 'known',
                          'precise', 'reliable', 'straight', 'substantiated']
        feature_dict['src_num_false_synonyms'] = 0
        for token in tokens:
            if token in false_synonyms:
                feature_dict['src_num_false_synonyms'] += 1
        feature_dict['src_num_false_antonyms'] = 0
        for token in tokens:
            if token in false_antonyms:
                feature_dict['src_num_false_antonyms'] += 1
        feature_dict['thread_num_false_synonyms'] = 0
        for token in otherthreadtokens:
            if token in false_synonyms:
                feature_dict['thread_num_false_synonyms'] += 1
        feature_dict['thread_num_false_antonyms'] = 0
        for token in otherthreadtokens:
            if token in false_antonyms:
                feature_dict['thread_num_false_antonyms'] += 1
        feature_dict['src_unconfirmed'] = 0
        feature_dict['src_rumour'] = 0
        feature_dict['thread_unconfirmed'] = 0
        feature_dict['thread_rumour'] = 0
        if 'unconfirmed' in tokens:
            feature_dict['src_unconfirmed'] = 1
        if 'unconfirmed' in otherthreadtokens:
            feature_dict['thread_unconfirmed'] = 1
        if 'rumour' in tokens or 'gossip' in tokens or 'hoax' in tokens:
            feature_dict['src_rumour'] = 1
        if ('rumour' in otherthreadtokens) or (
                                            'gossip' in otherthreadtokens) or (
                                            'hoax' in otherthreadtokens):
            feature_dict['thread_rumour'] = 1
        whwords = ['what', 'when', 'where', 'which', 'who', 'whom', 'whose',
                   'why', 'how']
        feature_dict['src_num_wh'] = 0
        for token in tokens:
            if token in whwords:
                feature_dict['src_num_wh'] += 1
        feature_dict['thread_num_wh'] = 0
        for token in otherthreadtokens:
            if token in whwords:
                feature_dict['thread_num_wh'] += 1
        SpeechAct = {}
        SpeechAct['SpeechAct_ORDER'] = ['command', 'demand', 'tell', 'direct',
                                        'instruct', 'require', 'prescribe',
                                        'order']
        SpeechAct['SpeechAct_ASK1'] = ['ask', 'request', 'beg', 'bespeech',
                                       'implore', 'appeal', 'plead',
                                       'intercede', 'apply', 'urge',
                                       'persuade', 'dissuade', 'convince']
        SpeechAct['SpeechAct_ASK2'] = ['ask', 'inquire', 'enquire',
                                       'interrogate', 'question', 'query']
        SpeechAct['SpeechAct_CALL'] = ['call', 'summon', 'invite', 'call on',
                                       'call for', 'order', 'book', 'reserve']
        SpeechAct['SpeechAct_FORBID'] = ['forbid', 'prohibit', 'veto',
                                         'refuse', 'decline', 'reject',
                                         'rebuff', 'renounce', 'cancel',
                                         'resign', 'dismiss']
        SpeechAct['SpeechAct_PERMIT'] = ['permit', 'allow', 'consent',
                                         'accept', 'agree', 'approve',
                                         'disapprove', 'authorize', 'appoint']
        SpeechAct['SpeechAct_ARGUE'] = ['argue', 'disagree', 'refute',
                                        'contradict', 'counter', 'deny',
                                        'recant', 'retort', 'quarrel']
        SpeechAct['SpeechAct_REPRIMAND'] = ['reprimand', 'rebuke', 'reprove',
                                            'admonish', 'reproach', 'nag',
                                            'scold', 'abuse', 'insult']
        SpeechAct['SpeechAct_MOCK'] = ['ridicule', 'joke']
        SpeechAct['SpeechAct_BLAME'] = ['blame', 'criticize', 'condemn',
                                        'denounce', 'deplore', 'curse']
        SpeechAct['SpeechAct_ACCUSE'] = ['accuse', 'charge', 'challenge',
                                         'defy', 'dare']
        SpeechAct['SpeechAct_ATTACK'] = ['attack', 'defend']
        SpeechAct['SpeechAct_WARN '] = ['warn', 'threaten', 'blackmail']
        SpeechAct['SpeechAct_ADVISE '] = ['advise', 'councel', 'consult',
                                          'recommend', 'suggest', 'propose',
                                          'advocate']
        SpeechAct['SpeechAct_OFFER '] = ['offer', 'volunteer', 'grant', 'give']
        SpeechAct['SpeechAct_PRAISE '] = ['praise', 'commend', 'compliment',
                                          'boast', 'credit']
        SpeechAct['SpeechAct_PROMISE '] = ['promise', 'pledge', 'vow', 'swear',
                                           'vouch for', 'guarante']
        SpeechAct['SpeechAct_THANK '] = ['thank', 'apologise', 'greet',
                                         'welcome', 'farewell', 'goodbye',
                                         'introduce', 'bless', 'wish',
                                         'congratulate']
        SpeechAct['SpeechAct_FORGIVE '] = ['forgive', 'excuse', 'justify',
                                           'absolve', 'pardon', 'convict',
                                           'acquit', 'sentence']
        SpeechAct['SpeechAct_COMPLAIN'] = ['complain', 'protest', 'object',
                                           'moan', 'bemoan', 'lament',
                                           'bewail']
        SpeechAct['SpeechAct_EXCLAIM'] = ['exclaim', 'enthuse', 'exult',
                                          'swear', 'blaspheme']
        SpeechAct['SpeechAct_GUESS'] = ['guess', 'bet', 'presume', 'suspect',
                                        'suppose', 'wonder', 'speculate',
                                        'conjecture', 'predict', 'forecast',
                                        'prophesy']
        SpeechAct['SpeechAct_HINT'] = ['hint', 'imply', 'insinuate']
        SpeechAct['SpeechAct_CONCLUDE'] = ['conclude', 'deduce', 'infer',
                                           'gather', 'reckon', 'estimate',
                                           'calculate', 'count', 'prove',
                                           'compare']
        SpeechAct['SpeechAct_TELL'] = ['tell', 'report', 'narrate', 'relate',
                                       'recount', 'describe', 'explain',
                                       'lecture']
        SpeechAct['SpeechAct_INFORM'] = ['inform', 'notify', 'announce',
                                         'inform on', 'reveal']
        SpeechAct['SpeechAct_SUMUP'] = ['sum up', 'summarize', 'recapitulate']
        SpeechAct['SpeechAct_ADMIT'] = ['admit', 'acknowledge', 'concede',
                                        'confess', 'confide']
        SpeechAct['SpeechAct_ASSERT'] = ['assert', 'affirm', 'claim',
                                         'maintain', 'contend',
                                         'state', 'testify']
        SpeechAct['SpeechAct_CONFIRM'] = ['confirm', 'assure', 'reassure']
        SpeechAct['SpeechAct_STRESS'] = ['stress', 'emphasize', 'insist',
                                         'repeat', 'point out', 'note',
                                         'remind', 'add']
        SpeechAct['SpeechAct_DECLARE'] = ['declare', 'pronounce', 'proclaim',
                                          'decree', 'profess', 'vote',
                                          'resolve', 'decide']
        SpeechAct['SpeechAct_BAPTIZE'] = ['baptize', 'chirsten', 'name',
                                          'excommunicate']
        SpeechAct['SpeechAct_REMARK'] = ['remark', 'comment', 'observe']
        SpeechAct['SpeechAct_ANSWER'] = ['answer', 'reply']
        SpeechAct['SpeechAct_DISCUSS'] = ['discuss', 'debate', 'negotiate',
                                          'bargain']
        SpeechAct['SpeechAct_TALK'] = ['talk', 'converse', 'chat', 'gossip']
        for k in SpeechAct.keys():
            feature_dict[k] = 0
            for verb in SpeechAct[k]:
                if verb in tw['text'].lower():
                    feature_dict[k] += 1
                    
        fullthread_featdict[tw['id_str']] = feature_dict     
    return fullthread_featdict


In [7]:
def load_dataset():

    # Load labels and split for task A and task B
    # tweet_label_dict, veracity_label_dict = load_true_labels()
    # dev = tweet_label_dict['dev']
    # train = tweet_label_dict['train']
    # dev_tweets = dev.keys()
    # train_tweets = train.keys()
    # Load folds and conversations
    # path_to_folds = '../rumoureval-2019-training-data/twitter-english'
    path_to_folds = '/content/drive/MyDrive/CAPSTONE PROJECT - PRAJNA SIRISHA SUKANYA/demo/Tweet_data/ottawa'
    folds = sorted(os.listdir(path_to_folds))
    newfolds = [i for i in folds if i[0] != '.']
    folds = newfolds
    cvfolds = {}
    allconv = []
    train_dev_split = {}
    # train_dev_split['dev'] = []
    # train_dev_split['train'] = []
    train_dev_split['test'] = []
    # for nfold, fold in enumerate(folds):
        # path_to_tweets = os.path.join(path_to_folds, newfolds[0])
        # tweet_data = sorted(os.listdir(path_to_tweets))
        # newfolds = [i for i in tweet_data if i[0] != '.']
        # tweet_data = newfolds
    conversation = {}
    for foldr in folds:
        flag = 0
        conversation['id'] = foldr
        tweets = []
        path_repl = path_to_folds+'/'+foldr+'/replies'
        files_t = sorted(os.listdir(path_repl))
        newfolds = [i for i in files_t if i[0] != '.']
        files_t = newfolds
        if files_t!=[]:
            for repl_file in files_t:
                with open(os.path.join(path_repl, repl_file)) as f:
                    for line in f:
                        tw = json.loads(line)
                        # tw = json.loads(line)
                        tw['used'] = 0
                        replyid = tw['id_str']
                        # if replyid in dev_tweets:
                        tw['set'] = 'test'
                        # tw['label'] = dev[replyid]
                        # train_dev_split['test'].append(tw)
                        if flag == 'train':
                            print ("The tree is split between sets", foldr)
                        flag='test'
    #                     elif replyid in train_tweets:
    #                         tw['set'] = 'train'
    #                         # tw['label'] = train[replyid]
    # #                        train_dev_tweets['train'].append(tw)
    #                         if flag == 'dev':
    #                             print ("The tree is split between sets", foldr)
    #                         flag='train'
                        tweets.append(tw)
                        if tw['text'] is None:
                            print ("Tweet has no text", tw['id'])
            conversation['replies'] = tweets

            path_src = path_to_folds+'/'+foldr+'/source-tweet'
            files_t = sorted(os.listdir(path_src))
            with open(os.path.join(path_src, files_t[0])) as f:
                    for line in f:
                        src = json.loads(line)
                        src['used'] = 0
                        scrcid = src['id_str']
                        src['set'] = flag
                        # src['label'] = tweet_label_dict[flag][scrcid]

            conversation['src'] = src
            print(src)
            # conversation['veracity'] = veracity_label_dict[flag][scrcid]
            if src['text'] is None:
                print ("Tweet has no text", src['id'])
            path_struct = path_to_folds+'/'+foldr+'/structure.json'
            with open(path_struct) as f:
                    for line in f:
                        struct = json.loads(line)
            if len(struct) > 1:
                # I had to alter the structure of this conversation
                # if foldr=='553480082996879360':
                #     new_struct = {}
                #     new_struct[foldr] = struct[foldr]
                #     new_struct[foldr]['553495625527209985'] = struct['553485679129534464']['553495625527209985']
                #     new_struct[foldr]['553495937432432640'] = struct['553490097623269376']['553495937432432640']
                #     struct = new_struct
                # else:
                  new_struct = {}
                  new_struct[foldr] = struct[foldr]
                  struct = new_struct
                # Take item from structure if key is same as source tweet id
            conversation['structure'] = struct

            branches = tree2branches(conversation['structure'])
            conversation['branches'] = branches
            train_dev_split[flag].append(conversation.copy())
            allconv.append(conversation.copy())
            print(train_dev_split)
        else:
            flag='test'
            path_src = path_to_folds+'/'+foldr+'/source-tweet'
            files_t = sorted(os.listdir(path_src))
            with open(os.path.join(path_src, files_t[0])) as f:
                    for line in f:
                        src = json.loads(line)
                        src['used'] = 0
                        scrcid = src['id_str']
                        src['set'] = flag
                        # src['label'] = tweet_label_dict[flag][scrcid]

            conversation['src'] = src
            print(src)
            # conversation['veracity'] = veracity_label_dict[flag][scrcid]
            if src['text'] is None:
                print ("Tweet has no text", src['id'])
            
            path_struct = path_to_folds+'/'+foldr+'/structure.json'
            with open(path_struct) as f:
                    for line in f:
                        struct = json.loads(line)
            if len(struct) > 1:
                # print "Structure has more than one root"
                new_struct = {}
                new_struct[foldr] = struct[foldr]
                struct = new_struct
                # Take item from structure if key is same as source tweet id
            conversation['structure'] = struct
            branches = tree2branches(conversation['structure'])
            
            conversation['branches'] = branches
            train_dev_split[flag].append(conversation.copy())
            allconv.append(conversation.copy())
            
            print(conversation)
            
    cvfolds[foldr] = allconv
    allconv = []

    return train_dev_split

In [10]:
def prep_pipeline( feature_set=['avgw2v']):
    
    path = '/content/drive/MyDrive/CAPSTONE PROJECT - PRAJNA SIRISHA SUKANYA/demo'
    folds = {}
    folds = load_dataset()
    print(folds)

    loadW2vModel()

#%%
    for fold in folds.keys():
        
        # print(fold)
        feature_fold = []
        tweet_ids = []
        # fold_stance_labels = []
        # labels = []
        ids = []
        for conversation in folds[fold]:
            print(conversation)
            
            thread_feature_dict = extract_thread_features_incl_response(conversation)

            thread_features_array, branches = transform_feature_dict(
                                   thread_feature_dict, conversation,
                                   feature_set=feature_set)
            

            
            # fold_stance_labels.extend(thread_stance_labels)
            tweet_ids.extend(branches)
            feature_fold.extend(thread_features_array)
            for i in range(len(thread_features_array)):
                # labels.append(convert_label(conversation['veracity']))
                ids.append(conversation['id'])
                
#%
        if feature_fold!=[]:

            feature_fold = pad_sequences(feature_fold, maxlen=None,
                                         dtype='float32',
                                         padding='post',
                                         truncating='post', value=0.)
    
            # fold_stance_labels = pad_sequences(fold_stance_labels, maxlen=None,
            #                                    dtype='float32',
            #                                    padding='post', truncating='post',
            #                                    value=0.)

            # labels = np.asarray(labels)
            path_fold = os.path.join(path, fold)
            if not os.path.exists(path_fold):
                os.makedirs(path_fold)
    
            np.save(os.path.join(path_fold, 'test__array'), feature_fold)
            print(path_fold)
            # np.save(os.path.join(path_fold, 'labels'), labels)
            # np.save(os.path.join(path_fold, 'fold_stance_labels'),
            #         fold_stance_labels)
            np.save(os.path.join(path_fold, 'ids'), ids) 
            np.save(os.path.join(path_fold, 'tweet_ids'), tweet_ids)
                
#%%        
def main(data ='RumEval2019', feats = 'SemEvalfeatures'):
  
    if feats == 'text':
        prep_pipeline(feature_set=['avgw2v'])
    elif feats == 'SemEvalfeatures':
        SemEvalfeatures = ['avgw2v', 'hasnegation', 'hasswearwords',
                           'capitalratio', 'hasperiod', 'hasqmark',
                           'hasemark', 'hasurl', 'haspic',
                           'charcount', 'wordcount', 'issource',
                           'Word2VecSimilarityWrtOther',
                           'Word2VecSimilarityWrtSource',
                           'Word2VecSimilarityWrtPrev']
        prep_pipeline(feature_set=SemEvalfeatures)



if __name__ == '__main__':
    main()

{'contributors': None, 'truncated': False, 'text': 'BREAKING: Michael Zehaf-Bebeau had been designated "high-risk traveller" by CDN govt, which confiscated his passport http://t.co/dPeMQ78jzm', 'in_reply_to_status_id': None, 'id': 525032872647065600, 'favorite_count': 19, 'source': '<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>', 'retweeted': False, 'coordinates': None, 'entities': {'user_mentions': [], 'symbols': [], 'trends': [], 'hashtags': [], 'urls': [{'url': 'http://t.co/dPeMQ78jzm', 'indices': [117, 139], 'expanded_url': 'http://www.theglobeandmail.com/news/national/parliament-shooting/article21217602/', 'display_url': 'theglobeandmail.com/news/national/…'}]}, 'in_reply_to_screen_name': None, 'id_str': '525032872647065600', 'retweet_count': 121, 'in_reply_to_user_id': None, 'favorited': False, 'user': {'follow_request_sent': None, 'profile_use_background_image': True, 'default_profile_image': False, 'id': 635608354, 'verified': False, 'profile_image_url_http



/content/drive/MyDrive/CAPSTONE PROJECT - PRAJNA SIRISHA SUKANYA/demo/test


  return array(a, dtype, copy=False, order=order, subok=True)


In [9]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
