In [1]:
import pickle
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
import re
import ftfy
import humanize
import datetime

In [2]:
# Load the article table plus the pickled nearest-neighbour model and its
# vectorizer. Every file is opened in a `with` block so the handle is closed
# as soon as unpickling finishes.
# SECURITY: pickle.load can execute arbitrary code -- only load files that
# come from a trusted source.
with open('../Data/articles.pkl', 'rb') as f:
    data = pickle.load(f)

with open('../Data/knn_model.p', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

with open('../Data/knn_vectorizer.p', 'rb') as vectorizer_file:
    loaded_vectorizer = pickle.load(vectorizer_file)

In [3]:
# Module-level resources shared by clean(): built once, reused on every call.
lemma = WordNetLemmatizer()               # lemmatizer applied to each token
exclude = {*string.punctuation}           # characters stripped from the text
stop = {*stopwords.words('english')}      # NLTK English stop words, as a set

In [4]:
# Normalise raw text: drop stop words, punctuation and digits, then lemmatize.
def clean(doc):
    """Return a cleaned, space-separated version of ``doc``.

    Steps, in order:
      1. repair the text with ``ftfy.fix_text``;
      2. lower-case, split on whitespace and drop tokens found in ``stop``;
      3. delete every character found in ``exclude``;
      4. lemmatize each remaining token with ``lemma``;
      5. strip all digit runs and collapse the leftover whitespace.

    Stop words are filtered *before* punctuation is removed, so a token that
    still carries punctuation (e.g. ``"the,"``) is not matched in step 2.
    """
    repaired = ftfy.fix_text(doc)
    kept_tokens = [token for token in repaired.lower().split() if token not in stop]
    without_punct = ''.join(filter(lambda ch: ch not in exclude, " ".join(kept_tokens)))
    lemmas = [lemma.lemmatize(token) for token in without_punct.split()]
    digitless = re.sub(r"\d+","", " ".join(lemmas))
    # split()/join() collapses any gaps left behind by the removed digits
    return ' '.join(digitless.split())

In [10]:
def similar_journalists(text, max_results=5):
    """Print the authors of the articles nearest to ``text`` and return the raw result.

    ``text`` is cleaned with ``clean``, vectorised with ``loaded_vectorizer``
    and looked up in ``loaded_model``. For each of the first ``max_results``
    neighbours the author, topic, site name and a humanised article age are
    printed, followed by a blank line.

    Parameters
    ----------
    text : str
        Free text to match against the stored articles.
    max_results : int, default 5
        Maximum number of neighbours to print. How many neighbours are
        available is decided by ``loaded_model``; if it returns fewer than
        ``max_results``, only those are printed.

    Returns
    -------
    The unmodified return value of ``loaded_model.kneighbors`` (indexed below
    as ``results[1][0]`` for the neighbour row positions of the single query).
    """
    query = loaded_vectorizer.transform([clean(text)])
    # toarray() gives a plain ndarray; todense() returns np.matrix, which
    # recent scikit-learn releases refuse as estimator input.
    results = loaded_model.kneighbors(query.toarray())

    # Take "now" once so every printed age is relative to the same instant.
    now = datetime.datetime.now()
    for index in results[1][0][:max_results]:
        # Neighbour indices are row positions, so use positional lookup for
        # every field (including the date) rather than mixing in label lookup.
        row = data.iloc[index]
        print(row['clean_author'].title())
        print(row['topic'])
        print(row['site_name'])
        # Dates are parsed as '-'-separated integers (year-month-day).
        published = datetime.datetime(*map(int, row['date'].split('-')))
        print(humanize.naturaltime(now - published))
        print()
    return results

In [11]:
# Example query: look up journalists for a short football-related quote.
# The call prints the matches; the cell's displayed value is the function's return value.
similar_journalists("When everyone is fit in both squads? I think it’s very difficult to say, they’re both outstanding at creating and scoring goals, you probably couldn’t split them. But at the moment Liverpool certainly have the upper hand.")

Daniel Blackham
sport
Express.co.uk
3 months ago

Elle May Rice
sport
Liverpool Echo
2 months ago

Theo Squires
sport
Liverpool Echo
a month ago

Andrew Beasley
sport
Liverpool Echo
a month ago

Joe Bray
sport
Manchester Evening News
2 months ago



(array([[0.6756754 , 0.81357383, 0.8195141 , 0.82151046, 0.82367986]]),
 array([[ 91610, 118840,    602,  54486, 102547]], dtype=int64))

In [7]:
# Display the loaded article table for inspection.
# NOTE(review): this cell's execution count is lower than the cells above it,
# i.e. it was run out of order -- re-run the notebook top to bottom before sharing.
data

Unnamed: 0,author,article_count,site_name,topic,date,url,title,title_sentiment,description,full_text,clean_author,text
0,Aja Styles,14.0,Brisbane Times,entertainment,2020-12-23,https://www.brisbanetimes.com.au/national/west...,'Pack Lego': Perth family caught in hard borde...,-9.09,Perth mother Clare has found herself mostly co...,Perth mother Clare* has found herself mostly ...,aja styles,'Pack Lego': Perth family caught in hard borde...
1,Jake Johnson,33.0,Truthout,politics,2020-12-23,https://truthout.org/articles/congress-passes-...,Congress Passes COVID Relief With Billions in ...,18.18,The billâs gifts to the wealthy underscore t...,In late-night votes just hours after nearly 5...,jake johnson,Congress Passes COVID Relief With Billions in ...
2,Christine Favocci,19.0,The Western Journal,tech,2020-12-23,https://www.westernjournal.com/pa-man-facing-c...,PA Man Facing Charges of Unlawful Voting After...,-38.46,It is naive to think that either party is free...,The left has insisted that voter fraud is jus...,christine favocci,PA Man Facing Charges of Unlawful Voting After...
3,William Rivers Pitt,14.0,Truthout,politics,2020-12-23,https://truthout.org/articles/what-will-trump-...,What Will Trump Attempt in His Last Days? We M...,0.00,What Trump may do in his waning days is only u...,"The endgame being played out by Donald Trump,...",william rivers pitt,What Will Trump Attempt in His Last Days? We M...
4,Amy Goodman,19.0,Truthout,business,2020-12-23,https://truthout.org/video/the-insufficient-co...,The Insufficient COVID Stimulus Must Not Be Fo...,-20.00,Critics say the $900 billion relief package do...,As Congress passes a $900 billion coronavirus...,amy goodman,The Insufficient COVID Stimulus Must Not Be Fo...
...,...,...,...,...,...,...,...,...,...,...,...,...
119957,Olivia Tobin,56.0,Liverpool Echo,tech,2020-12-05,https://www.liverpoolecho.co.uk/news/liverpool...,"Boy, five, battling rare brain cancer will be ...",4.55,\n Five-year-old Aaron Wharton had surgery at ...,When you subscribe we will use the informatio...,olivia tobin,"Boy, five, battling rare brain cancer will be ..."
119958,Victoria Jones,92.0,WalesOnline,tech,2020-12-05,https://www.walesonline.co.uk/news/uk-news/how...,How to teach saving and spending to kids as yo...,0.00,Now might be a perfect time to involve childre...,When you subscribe we will use the informatio...,victoria jones,How to teach saving and spending to kids as yo...
119959,Victoria Jones,92.0,WalesOnline,tech,2020-12-05,https://www.walesonline.co.uk/news/uk-news/spa...,Space experiment could unlock resources for mi...,0.00,Experimenting on the ISS allows scientists to ...,When you subscribe we will use the informatio...,victoria jones,Space experiment could unlock resources for mi...
119960,Nisha Mal,66.0,WalesOnline,tech,2020-12-05,https://www.walesonline.co.uk/news/uk-news/wom...,Woman's home is in Tier 2 while her garden fal...,-10.00,"'It's all one big conundrum,' says Sheila Herbert",Woman's home is in Tier 2 while her garden fa...,nisha mal,Woman's home is in Tier 2 while her garden fal...
