# My Sources Generation

In [5]:
from faker import Faker
# Shared Faker instance configured with three English locales; the Post dataclass
# uses it to invent default author names and publication dates.
fake = Faker(['en_AU', 'en_US', 'en_UK'])

In [6]:
import json
import os
import random
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from pprint import pprint
from typing import Any, List, Optional
from urllib.parse import quote

import matplotlib.pyplot as plt
import pymongo
import requests
from dotenv import load_dotenv
from textblob import TextBlob
from tqdm import tqdm
from transformers import pipeline

# Load variables from a local .env file into the process environment; the Mongo
# connection string is read from os.environ further down.
load_dotenv()

True

## Text Generation

In [7]:
# Text-generation pipeline on the PyTorch backend ('pt'). No model is named, so
# the library's default for this task is used.
generator = pipeline('text-generation', framework='pt')

In [31]:
def generate_article_from_title(title: str) -> str:
    """Generate article text by prompting the text-generation pipeline with a title.

    The generated text is trimmed so that it starts after the first full stop
    that follows the title. When the generated text begins with the prompt
    (as the stored articles in this notebook do), the search for that full stop
    starts after the title, so a headline that itself contains '.' is not cut
    in half.

    Args:
        title: Headline used as the generation prompt.

    Returns:
        The generated text following the first '.', or the complete generated
        text when no '.' is found.
    """
    # A random max_length makes the generated articles vary in size.
    article = generator(title, max_length=random.randint(500, 1500))[0]['generated_text']

    # Skip the echoed title before looking for the sentence end; fall back to
    # searching from the start if the output does not begin with the prompt.
    search_from = len(title) if article.startswith(title) else 0
    cut = article.find('.', search_from)
    if cut == -1:
        return article
    return article[cut + 1:]

## Summarisation

In [9]:
# Summarisation pipeline with the library's default model for this task.
summariser = pipeline('summarization')

In [15]:
def generate_summary_from_article(article: str) -> str:
    """Summarise an article with the notebook-level `summariser` pipeline.

    Args:
        article: Full article text to condense.

    Returns:
        The `summary_text` entry of the first result returned by the pipeline.
    """
    results = summariser(article)
    first_result = results[0]
    return first_result['summary_text']

## Keyword Finding

In [21]:
def find_keywords_in_article(article: str) -> List[str]:
    """Extract keywords from an article: the words TextBlob tags as "NN".

    Each keyword is returned once, in order of first appearance, so repeated
    nouns in the text do not produce repeated tags.

    Args:
        article: Text to analyse.

    Returns:
        Unique words whose part-of-speech tag is "NN".
    """
    blob = TextBlob(article)
    nouns = (word for word, tag in blob.tags if tag == "NN")
    # dict.fromkeys de-duplicates while preserving first-occurrence order.
    return list(dict.fromkeys(nouns))

In [22]:
# Try the keyword extractor on a hand-written sentence; `kws` is reused for the
# image lookup demo in the next section.
kws = find_keywords_in_article('On saturday, chris went to the hackathon and ate lots of pizza')

## Image Fetching

In [23]:
def image_url_from_keywords(keywords: List[str], timeout: float = 10.0) -> str:
    """Resolve an image URL for the given keywords via source.unsplash.com.

    The request is made against the 1600x900 endpoint and the final URL of the
    response (after any redirects) is returned.

    Args:
        keywords: Search terms; joined with commas in the query string. ``None``
            or an empty list yields a request with an empty query.
        timeout: Seconds to wait for the server before `requests` raises a
            timeout error, so a stalled connection cannot hang the notebook.

    Returns:
        The URL of the final response.
    """
    # Percent-encode each keyword so spaces, '&', '#', etc. cannot break the query.
    query = ','.join(quote(str(keyword), safe='') for keyword in (keywords or []))
    # stream=True: only the final URL is needed, so the image body is not downloaded.
    with requests.get(
        f"https://source.unsplash.com/1600x900/?{query}",
        stream=True,
        timeout=timeout,
    ) as response:
        return response.url

In [24]:
# Look up an image for the sample keywords extracted above.
image_url = image_url_from_keywords(kws)

In [25]:
# Display the resolved image URL.
image_url

'https://images.unsplash.com/photo-1544882907-b914cebddbf4?crop=entropy&cs=tinysrgb&fit=crop&fm=jpg&h=900&ixid=eyJhcHBfaWQiOjF9&ixlib=rb-1.2.1&q=80&w=1600'

## Mongo Connection

In [26]:
# Connect with the URI held in the `mongo` environment variable and keep a handle
# on the `sources` collection of the `news` database.
# NOTE(review): os.environ.get returns None when `mongo` is unset — confirm how
# MongoClient treats a None host before relying on this in a new environment.
mongo_uri = os.environ.get('mongo')
client = pymongo.MongoClient(mongo_uri)
db = client['news']
collection = db['sources']

## Data Model

In [27]:
@dataclass
class Post:
    """A news post, mirroring the documents kept in the Mongo `collection`.

    All content fields default to ``None`` so a post can be built from a partial
    document and completed later with `fill_nones`. `author` and `date` fall
    back to values produced by the notebook-level `fake` generator.
    """

    title: Optional[str] = None
    category: Optional[str] = None
    # Random fake name when the caller does not supply an author.
    author: str = field(default_factory=lambda: fake.name())
    description: Optional[str] = None
    image: Optional[str] = None
    slug: Optional[str] = None
    html: Optional[str] = None
    tags: Optional[List[str]] = None
    # Random ISO-formatted date from the current year when none is supplied.
    date: str = field(default_factory=lambda: fake.date_this_year().isoformat())
    # Database identifier; documents read from Mongo carry an ObjectId here.
    _id: Optional[Any] = None

    def to_dict(self):
        """Return the post's attributes as a new dict.

        A shallow copy is returned so that callers who modify the result do not
        silently change the post itself.
        """
        return dict(self.__dict__)

    def to_json(self):
        """Serialise the post to a JSON string.

        Values that JSON cannot represent natively (such as the ObjectId stored
        in `_id`) are converted with ``str`` instead of raising TypeError.
        """
        return json.dumps(self.to_dict(), default=str)

    def fill_nones(self):
        """Generate any missing content, each step building on the previous one.

        Order matters: the article is generated from the title, the description
        is summarised from the article, tags are extracted from the article, and
        the image is looked up from the tags. An image URL containing "picsum"
        is treated as a placeholder and replaced.
        """
        if self.html is None:
            self.html = generate_article_from_title(title=self.title)
        if self.description is None:
            self.description = generate_summary_from_article(article=self.html)
        if self.tags is None:
            self.tags = find_keywords_in_article(self.html)
        if self.image is None or "picsum" in self.image:
            self.image = image_url_from_keywords(self.tags)

    def update(self):
        """Write this post's fields to the stored document with the same slug."""
        fields = self.to_dict()
        # `_id` is immutable in MongoDB: sending it in `$set` is rejected whenever
        # it differs from the matched document (e.g. None on a locally built post).
        fields.pop('_id', None)
        collection.update_one({"slug": self.slug}, {'$set': fields})

    def pretty(self):
        """Pretty-print the post's fields."""
        pprint(self.to_dict())

## Fill Empties

In [34]:
# List the documents that still need a description (the query matches documents
# whose `description` is null; the fill loop's output below shows documents with
# no `description` key being matched as well).
for empty in collection.find({"description": None}):
    print(empty)

In [39]:
# Clear the description on whichever post `to_fill` currently refers to.
# NOTE(review): `to_fill` is only assigned inside the `In [33]` fill loop further
# down this notebook, so this cell raises NameError on a fresh Restart & Run All.
to_fill.description = None

' In the last series, Lily is the new witch and will be the mother of Hogwarts-Turing when she becomes the new Wizard of the World . She reveals that she is a living witch who came back to her homeland, to save her family and life . She wants to take an oath to defeat evil, and in order to do so, she wanders in the shadows of Hogwarts and captures Lord Voldemort . Unfortunately, after saving her from the Dark Lord, she is captured and killed by Harry Dresden .'

In [40]:
# Regenerate the description of every stored post: load each document into a
# Post, blank its description so fill_nones() re-summarises the article (and
# fills any other missing fields), then write the result back by `_id`.
# NOTE(review): this prints whole documents and updates the collection while the
# cursor is still being iterated — consider materialising the cursor first.
for redo in collection.find({}):
    print(redo)
    to_redo = Post(**redo)
    to_redo.description = None
    to_redo.fill_nones()
    collection.update_one({"_id": to_redo._id}, {'$set': to_redo.to_dict()})    

{'_id': ObjectId('5f8acf7cefb7b81d083f62e1'), 'title': '7% of home owners think their neighbours should be evicted', 'slug': '7-of-home-owners-think-their-neighbours-should-be-evicted', 'category': 'Politics', 'author': 'Sally Dibbert', 'date': '2020-10-17T11:03:23.742Z', 'description': '7% of home owners th', 'html': '7% of home owners think their neighbours should be evicted," said Michael Gagnon, managing director of the think tank, which has advised the government on many issues.\n\n"We think this is going to continue through the next election as voters continue to see that the housing market is still robust and that, by contrast, the numbers of people without health insurance are increasing."\n\nThe figures for home ownership in the G5 were similar, but there is a worrying trend on the rise of households now owning fewer than a second mortgage.\n\nWhile one-third of Australians own two or more properties with no-one else owning, more Australians, 26 per cent and 10 per cent, said 

Your max_length is set to 142, but you input_length is only 96. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)


{'_id': ObjectId('5f8af0025e82c20008d06a7a'), 'title': 'How Mondays are becoming Friday’s in 2020', 'slug': 'how-mondays-are-becoming-fridays-in-2020', 'category': 'Business', 'author': 'Brandi West', 'date': '2020-10-17T13:22:08.531Z', 'image': 'https://images.unsplash.com/photo-1542617450-97d49ea1d30b?crop=entropy&cs=tinysrgb&fit=crop&fm=jpg&h=900&ixid=eyJhcHBfaWQiOjF9&ixlib=rb-1.2.1&q=80&w=1600', 'description': 'How Mondays are beco', 'html': "How Mondays are becoming Friday’s in 2020, this is a time that women and men are increasingly being allowed to earn and pay less.\n\nADVERTISEMENT Thanks for watching! Visit Website\n\nADVERTISEMENT Thanks for watching! Visit Website\n\nEven when women are being empowered not only by earning more and living in greater comfort, but by making more of a living on a daily basis, it's clear they are not just losing their jobs because they are working harder.", 'tags': ['s', 'time', 'living', 'comfort', 'living', 'basis', 'harder']}


Your max_length is set to 142, but you input_length is only 125. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)


{'_id': ObjectId('5f8af00aa4aba60008415b91'), 'title': 'The latest cooking trend for those last minute lock down meals', 'slug': 'the-latest-cooking-trend-for-those-last-minute-lock-down-meals', 'category': 'Business', 'author': 'Felicia Schinner', 'date': '2020-10-17T13:22:17.410Z', 'image': 'https://images.unsplash.com/photo-1513125552702-3de13efeddc6?crop=entropy&cs=tinysrgb&fit=crop&fm=jpg&h=900&ixid=eyJhcHBfaWQiOjF9&ixlib=rb-1.2.1&q=80&w=1600', 'description': 'The latest cooking t', 'html': "The latest cooking trend for those last minute lock down meals seems to be to add a few more bits and dashes to a meal before you're served with it. Well, that may change soon based on what you're about to be served. As per the blog post, the next time you're asked how to make this tasty fried chickpea, you can just dip it in a slice of the crispy chickpea and it will be in your hand. This chickpea is a super simple and nutritious fried chickpea made with all the spices you might be looking to

In [33]:
# Complete every post that has no description yet: build a Post from the stored
# document, generate the missing fields, and write them back by `_id`.
# tqdm wraps the cursor to report per-document timing; `to_fill` stays bound to
# the last processed post after the loop and is referenced by another cell.
for empty in tqdm(collection.find({"description": None})):
    print(empty)
    to_fill = Post(**empty)
    to_fill.fill_nones()
    collection.update_one({"_id": to_fill._id}, {'$set': to_fill.to_dict()})

0it [00:00, ?it/s]Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


{'_id': ObjectId('5f8b93d7b153f90008a430b6'), 'title': 'Daniel Andrews Refuses To Scrap 9pm Curfew But Agrees To Let Melburnians Read In Bed Until 9:30', 'slug': 'daniel-andrews-refuses-to-scrap-9pm-curfew-but-agrees-to-let-melburnians-read-in-bed-until-9-30', 'category': 'Politics', 'author': 'Travis Denesik', 'date': '2020-10-18T01:01:10.039Z', 'image': 'https://picsum.photos/300/180'}


1it [01:17, 77.84s/it]Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


{'_id': ObjectId('5f8b944e8724820008430c78'), 'title': 'Prince Harry is missing and has been ‘captured by wokeness’', 'slug': 'prince-harry-is-missing-and-has-been-captured-by-wokeness', 'category': 'Politics', 'author': 'Ms. Jeannie Stoltenberg', 'date': '2020-10-18T01:03:08.963Z', 'image': 'https://picsum.photos/300/180'}


2it [02:11, 65.93s/it]
