In [None]:
import spacy
import pandas as pd
import numpy as np

from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from collections import Counter
from heapq import nlargest

In [None]:
import gensim.downloader
# Show the names of the pretrained models gensim can download.
print([*gensim.downloader.info()['models'].keys()])
# Fetch (or reuse the cached copy of) the vectors bound to `wv` for later cells.
wv = gensim.downloader.load('word2vec-google-news-300')

**Data Preprocessing**

In [None]:
#text summarization
def summarize(doc):
    """Return the highest-scoring sentence of ``doc`` as a standalone Doc.

    Tokens that are not stop words or punctuation and whose part of speech is
    PROPN, ADJ, NOUN or VERB are counted as keywords. Each keyword's count is
    divided by the highest count, and every sentence is scored by the sum of
    the normalized counts of the keywords it contains.

    Args:
        doc: A parsed document that yields tokens (``.text``, ``.pos_``) and
            exposes sentences through ``.sents``.

    Returns:
        ``as_doc()`` of the best-scoring sentence (the earliest one on ties).
        If no keyword or no scorable sentence is found, ``doc`` itself is
        returned instead of raising ``IndexError``.
    """
    stopwords = set(STOP_WORDS)  # set lookup instead of scanning a list per token
    pos_tag = ('PROPN', 'ADJ', 'NOUN', 'VERB')
    keyword = [
        token.text
        for token in doc
        if token.text not in stopwords
        and token.text not in punctuation
        and token.pos_ in pos_tag
    ]
    if not keyword:
        # Nothing to rank (empty text, or only stop words / punctuation).
        return doc

    freq_word = Counter(keyword)
    max_freq = max(freq_word.values())
    for word in freq_word:
        freq_word[word] = freq_word[word] / max_freq

    sent_strength = {}
    for sent in doc.sents:
        for word in sent:
            if word.text in freq_word:
                sent_strength[sent] = sent_strength.get(sent, 0) + freq_word[word.text]
    if not sent_strength:
        return doc

    # Only the single best sentence is used, so max() replaces nlargest(3)[0].
    best_sentence = max(sent_strength, key=sent_strength.get)
    return best_sentence.as_doc()

#text lemmatization
def lemmatize(text):
    """Join the ``lemma_`` of every token in ``text`` with single spaces."""
    lemmas = (token.lemma_ for token in text)
    return ' '.join(lemmas)

#stopword removal
def stop_removal(text):
    """Return ``text`` with every token spaCy flags as a stop word removed.

    Args:
        text: The string to filter.

    Returns:
        The concatenation of ``text_with_ws`` for each remaining token, so the
        original spacing after kept tokens is preserved.
    """
    # Loading the pipeline is expensive, so load it on the first call only and
    # keep it on the function object for later calls.
    nlp = getattr(stop_removal, "_nlp", None)
    if nlp is None:
        nlp = stop_removal._nlp = spacy.load("en_core_web_sm")

    return ''.join(token.text_with_ws for token in nlp(text) if not token.is_stop)

In [None]:
def preprocess(db_location="data/Fake.csv",limit=0,exportName="fake.txt"):
    """Summarize, lemmatize and stop-word-filter articles from a CSV into a text file.

    Args:
        db_location: Path of a CSV file that has a ``text`` column. The default
            uses a forward slash, which is accepted on every OS and avoids the
            invalid ``\\F`` escape of the previous literal.
        limit: Maximum number of articles to convert; ``0`` converts all of them.
        exportName: Path of the text file to write (overwritten if it exists).

    Each usable article is passed through ``summarize``, ``lemmatize`` and
    ``stop_removal``; the results are written to ``exportName`` separated by
    newlines so the end of one article is not fused with the start of the next.
    Rows whose ``text`` is missing or blank are skipped.
    """
    df = pd.read_csv(db_location)
    texts = df["text"]
    total = len(texts)
    target = total if limit == 0 else min(limit, total)

    print("converting "+str(target)+" out of "+str(total))

    # Load the pipeline once; reloading it for every article dominated the runtime.
    nlp = spacy.load("en_core_web_sm")

    articles = []
    for text in texts:
        # Stop at exactly `limit` articles (the old `i > limit` check ran one extra).
        if limit != 0 and len(articles) >= limit:
            break
        # NaN cells arrive as floats and blank strings carry nothing to summarize.
        if not isinstance(text, str) or not text.strip():
            continue

        doc = nlp(text)
        summary = summarize(doc)
        lemmas = lemmatize(summary)
        articles.append(stop_removal(lemmas))

        print(str(len(articles))+"/"+str(total))

    # `with` closes the file even if the write fails.
    with open(exportName, "w") as text_file:
        text_file.write("\n".join(articles))

In [None]:
# Forward slashes are accepted on every OS and avoid the invalid "\F" / "\T"
# escape sequences that the backslash literals contained.
preprocess("data/Fake.csv",0,"fake.txt")
preprocess("data/True.csv",0,"true.txt")

**Data Processing**

In [None]:
def get_subject_phrase(doc):
    """Return the subtree span of the first token whose ``dep_`` contains "subj".

    Args:
        doc: A Doc or Span to scan, token by token.

    Returns:
        A span covering the matching token's whole subtree, or ``None`` when no
        token's dependency label contains "subj".
    """
    for token in doc:
        if "subj" in token.dep_:
            subtree = list(token.subtree)
            start = subtree[0].i
            end = subtree[-1].i + 1
            # Token.i indexes the parent Doc, so slice token.doc. Slicing `doc`
            # itself is wrong when it is a Span, whose slices are relative to
            # the span's own start.
            return token.doc[start:end]
    return None
        
def get_object_phrase(doc):
    """Return the subtree span of the first token whose ``dep_`` contains "dobj".

    Args:
        doc: A Doc or Span to scan, token by token.

    Returns:
        A span covering the matching token's whole subtree, or ``None`` when no
        token's dependency label contains "dobj".
    """
    for token in doc:
        if "dobj" in token.dep_:
            subtree = list(token.subtree)
            start = subtree[0].i
            end = subtree[-1].i + 1
            # Token.i indexes the parent Doc, so slice token.doc. Slicing `doc`
            # itself is wrong when it is a Span, whose slices are relative to
            # the span's own start.
            return token.doc[start:end]
    return None

def get_verb_phrase(doc):
    """Return the first token whose ``dep_`` contains "ROOT", or None if there is none."""
    root_tokens = (candidate for candidate in doc if "ROOT" in candidate.dep_)
    return next(root_tokens, None)
    
def get_whole_phrase(doc,index):
    """Return the span covering the subtree of ``doc[index]`` if it is a ROOT token.

    Args:
        doc: A Doc or Span.
        index: Position of the candidate token within ``doc``.

    Returns:
        A span over the token's whole subtree, or ``None`` when the token's
        dependency label is not exactly "ROOT".
    """
    root = doc[index]
    if root.dep_ != "ROOT":
        return None

    subtree = list(root.subtree)
    start = subtree[0].i
    end = subtree[-1].i + 1
    # Token.i indexes the parent Doc, so slice root.doc; slicing a Span with
    # these indices would be offset by the span's start.
    return root.doc[start:end]

def docToSentences(doc): #list of sentences
    """Build one ``[subject phrase, root token, object phrase]`` triple per ROOT token."""

    def _triple(root):
        # The clause is the root's whole subtree; subject and object come from it.
        clause = get_whole_phrase(doc, root.i)
        return [get_subject_phrase(clause), root, get_object_phrase(clause)]

    return [_triple(candidate) for candidate in doc if "ROOT" in candidate.dep_]


def sentenceToParts(sentence): # list of parts of one sentence
    """Split a parsed sentence into ``[subject phrase, root token, object phrase]``.

    Each entry is whatever the matching helper returns, which is ``None`` when
    the sentence has no such part.
    """
    return [
        get_subject_phrase(sentence),
        get_verb_phrase(sentence),
        get_object_phrase(sentence),
    ]

In [None]:
def split_distance(word1,word2): #strings
    """Return the word-vector distance between two words, or 1 if either is unknown.

    Args:
        word1, word2: Plain strings, or objects exposing the word as ``.text``
            (such as spaCy tokens).

    Returns:
        ``wv.distance`` of the two words, or ``1`` when either word is missing
        from ``wv.key_to_index``.
    """
    # Use the same text for the vocabulary check and the distance call. The old
    # code checked the raw objects, so tokens were never found and always got 1.
    text1 = getattr(word1, "text", word1)
    text2 = getattr(word2, "text", word2)

    if text1 not in wv.key_to_index or text2 not in wv.key_to_index:
        return 1
    return wv.distance(text1, text2)

def partDistance(a,b):
    """Sum, over the items of ``a``, the smallest distance to any item of ``b``.

    Args:
        a, b: A single spaCy token, an iterable of tokens, or ``None``.

    Returns:
        ``1`` if either argument is ``None``. Otherwise the sum over items of
        ``a`` of the minimum ``split_distance`` to the items of ``b``, with each
        per-item minimum capped at 1.
    """
    if a is None or b is None:
        return 1 #nice number chosen by team

    # Treat a lone token as a one-item phrase so both inputs can be iterated.
    if isinstance(a, spacy.tokens.Token):
        a = [a]
    if isinstance(b, spacy.tokens.Token):
        b = [b]

    totalmin = 0
    for item_a in a:
        # Start the minimum once per item_a. It used to be reset for every
        # item_b, so only the last item of `b` ever counted.
        localmin = 1
        for item_b in b:
            localmin = min(localmin, split_distance(item_a, item_b))
        totalmin += localmin
    return totalmin

def sentenceDistance(doc,doc2):
    """Return the mean part-wise distance between two sentences given as strings.

    Both strings are parsed with the ``en_core_web_sm`` pipeline and split into
    parts by ``sentenceToParts``; the ``partDistance`` of corresponding parts
    is summed and divided by 3.

    Args:
        doc: First sentence, as a string.
        doc2: Second sentence, as a string.
    """
    # This runs once per sentence pair, so load the pipeline on the first call
    # only and keep it on the function object.
    nlp = getattr(sentenceDistance, "_nlp", None)
    if nlp is None:
        nlp = sentenceDistance._nlp = spacy.load("en_core_web_sm")

    parts1 = sentenceToParts(nlp(doc))
    parts2 = sentenceToParts(nlp(doc2))

    totalDistance = sum(partDistance(p1, p2) for p1, p2 in zip(parts1, parts2))
    return totalDistance/3

In [None]:
#Data Processing pipeline
from spacy.lang.en import English

def sentenceRatio(sentence,db): # two strings
    """Return the mean ``sentenceDistance`` between ``sentence`` and each sentence of ``db``.

    Args:
        sentence: The sentence to compare, as a string.
        db: Reference text, as a string; it is split with spaCy's sentencizer.

    Returns:
        The average distance over all sentences of ``db``. If ``db`` contains
        no sentence, ``1`` is returned (the value this notebook uses when there
        is nothing to compare) instead of raising ``ZeroDivisionError``.
    """
    # A blank English pipeline with the sentencizer is all that is used to
    # split `db`; the statistical model loaded here before was discarded.
    nlp = English()
    nlp.add_pipe("sentencizer")

    # Materialize the sentences once instead of rebuilding the list per iteration.
    db_sentences = list(nlp(db).sents)
    if not db_sentences:
        return 1

    total = 0
    for i, sentenceFromdb in enumerate(db_sentences):
        # Keep only letters, digits and whitespace before comparing.
        cleaned = "".join(ch for ch in str(sentenceFromdb) if ch.isalnum() or ch.isspace())

        total += sentenceDistance(sentence,cleaned)
        print(str(total)+" "+str(i)+"/"+str(len(db_sentences)))

    return total/len(db_sentences)


def articleRatio(article,db): #two strings
    """Return the mean ``sentenceRatio`` of every sentence of ``article`` against ``db``.

    Args:
        article: Article text, as a string; it is split with spaCy's sentencizer.
        db: Reference text; it is passed to ``sentenceRatio`` as ``str(db)``.

    Returns:
        The average over the article's sentences. If the article contains no
        sentence, ``1`` is returned (the value this notebook uses when there is
        nothing to compare) instead of raising ``ZeroDivisionError``.
    """
    # A blank English pipeline with the sentencizer is all that is used here;
    # the statistical model loaded before was discarded.
    nlp = English()
    nlp.add_pipe("sentencizer")

    sentences = list(nlp(article).sents)
    if not sentences:
        return 1

    db_text = str(db)
    total = sum(sentenceRatio(str(sentence), db_text) for sentence in sentences)
    return total / len(sentences)

                       
def start(fakedb,article):
    """Print the ``articleRatio`` of an article file against a reference text file.

    Args:
        fakedb: Path of the reference text file.
        article: Path of the article text file to score.
    """
    # `with` closes each file even if reading fails.
    with open(fakedb, "r") as f:
        textfile = f.read()

    with open(article, "r") as a:
        articlefile = a.read()

    # Compare the file contents; the paths themselves used to be passed here.
    ratio = articleRatio(articlefile,textfile)

    print(ratio)
    

In [None]:
# Score the test article against the preprocessed fake-news text.
start(fakedb="fake.txt", article="testArticle.txt")