# My Sources Generation

In [3]:
from faker import Faker
fake = Faker(['en_AU', 'en_US', 'en_UK'])

In [20]:
import os
from dataclasses import dataclass, field
from typing import List
from pathlib import Path
from datetime import datetime
import random
import json
from pprint import pprint

from transformers import pipeline
from textblob import TextBlob
from dotenv import load_dotenv
import pymongo
import requests
import matplotlib.pyplot as plt
from tqdm import tqdm

load_dotenv()

True

## Text Generation

In [5]:
generator = pipeline('text-generation', framework='pt')

In [6]:
def generate_article_from_title(title: str) -> str:
    return generator(title, max_length=random.randint(500, 1500))[0]['generated_text']

## Summarisation

In [8]:
summariser = pipeline('summarization')

In [10]:
def generate_summary_from_article(article: str) -> str:
    return article[:20]

## Keyword Finding

In [13]:
def find_keywords_in_article(article: str) -> List[str]:
    t = TextBlob(article)
    return [word for word, tag in  t.tags if tag == "NN"]

In [14]:
kws = find_keywords_in_article('On saturday, chris went to the hackathon and ate lots of pizza')

## Image Fetching

In [15]:
def image_url_from_keywords(keywords: List[str]) -> str:
    r = requests.get(f"https://source.unsplash.com/1600x900/?{','.join(keywords)}")
    return r.url

In [16]:
from IPython.display import Image

In [17]:
image_url = image_url_from_keywords(kws)

In [18]:
image_url

'https://images.unsplash.com/photo-1593246049226-ded77bf90326?crop=entropy&cs=tinysrgb&fit=crop&fm=jpg&h=900&ixid=eyJhcHBfaWQiOjF9&ixlib=rb-1.2.1&q=80&w=1600'

## Mongo Connection

In [21]:
client = pymongo.MongoClient(os.environ.get('mongo'))
db = client.news
collection = db.sources

## DataModel

In [22]:
@dataclass
class Post:
    title: str = None
    category: str = None
    author: str = field(default_factory=lambda: fake.name())
    description: str = None
    image: str = None
    slug: str = None
    html: str = None
    tags: List[str] = None
    date: str = field(default_factory=lambda: fake.date_this_year().isoformat())
    _id: str = None
        
    def to_dict(self):
        return self.__dict__
    
    def to_json(self):
        return json.dumps(self.to_dict())
    
    def fill_nones(self):
        if self.html is None:
            self.html = generate_article_from_title(title=self.title)
        if self.description is None:
            self.description = generate_summary_from_article(article=self.html)
        if self.tags is None:
            self.tags = find_keywords_in_article(self.html)
        if self.image is None or "picsum" in self.image:
            self.image = image_url_from_keywords(self.tags)
            
    def update(self):
        collection.update_one({"slug": self.slug}, {'$set': self.to_dict()})
            
    def pretty(self):
        pprint(self.to_dict())

## Fill Empties

In [28]:
for empty in collection.find({"description": None}):
    print(empty)

{'_id': ObjectId('5f8b821d68fa84000899b826'), 'title': 'Apple ditching chargers saves costs but not the planet', 'slug': 'apple-ditching-chargers-saves-costs-but-not-the-planet', 'category': 'Tech', 'author': 'Kay Marquardt', 'date': '2020-10-17T23:45:32.291Z', 'image': 'https://picsum.photos/300/180'}
{'_id': ObjectId('5f8b85ae8193c43fc46544af'), 'title': '12 civilians killed in Armenian missile attack on Azerbaijan', 'slug': '12-civilians-killed-in-armenian-missile-attack-on-azerbaijan', 'category': 'World', 'author': 'Ervin Wyman', 'date': '2020-10-18T00:00:45.984Z', 'image': 'https://picsum.photos/300/180'}


In [27]:
for empty in tqdm(collection.find({"description": None})):
    print(empty)
    to_fill = Post(**empty)
    to_fill.fill_nones()
    collection.update_one({"_id": to_fill._id}, {'$set': to_fill.to_dict()})

0it [00:00, ?it/s]Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


{'_id': ObjectId('5f8adedcc781a90007466cab'), 'title': 'The next avocado toast', 'slug': 'the-next-avocado-toast', 'category': 'Culture', 'author': 'Roderick Dickens', 'date': '2020-10-17T12:08:58.661Z', 'image': 'https://picsum.photos/300/180'}


1it [00:37, 37.99s/it]Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


{'_id': ObjectId('5f8aeff14a0e4d00076986a0'), 'title': 'Why glasses are better than just human eyes', 'slug': 'why-glasses-are-better-than-just-human-eyes', 'category': 'Culture', 'author': 'Dr. Karen Langworth', 'date': '2020-10-17T13:21:52.272Z', 'image': 'https://picsum.photos/300/180'}


2it [01:48, 47.66s/it]Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


{'_id': ObjectId('5f8aeffa7ff4bc0009f7aea8'), 'title': 'The end of the world already happened and now we have proof', 'slug': 'the-end-of-the-world-already-happened-and-now-we-have-proof', 'category': 'Science', 'author': 'Lillian Kub DDS', 'date': '2020-10-17T13:22:01.165Z', 'image': 'https://picsum.photos/300/180'}


3it [02:04, 38.16s/it]Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


{'_id': ObjectId('5f8af0025e82c20008d06a7a'), 'title': 'How Mondays are becoming Friday’s in 2020', 'slug': 'how-mondays-are-becoming-fridays-in-2020', 'category': 'Business', 'author': 'Brandi West', 'date': '2020-10-17T13:22:08.531Z', 'image': 'https://picsum.photos/300/180'}


4it [02:11, 29.02s/it]Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


{'_id': ObjectId('5f8af00aa4aba60008415b91'), 'title': 'The latest cooking trend for those last minute lock down meals', 'slug': 'the-latest-cooking-trend-for-those-last-minute-lock-down-meals', 'category': 'Business', 'author': 'Felicia Schinner', 'date': '2020-10-17T13:22:17.410Z', 'image': 'https://picsum.photos/300/180'}


5it [02:22, 23.42s/it]Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


{'_id': ObjectId('5f8af014a8a75600084615a0'), 'title': 'Where the aviation industry is going from here', 'slug': 'where-the-aviation-industry-is-going-from-here', 'category': 'Business', 'author': 'Ms. Kimberly Herzog', 'date': '2020-10-17T13:22:26.583Z', 'image': 'https://picsum.photos/300/180'}


6it [02:41, 22.12s/it]Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


{'_id': ObjectId('5f8af01d41fa7700085abd07'), 'title': 'The holiday you don’t need to leave your house for', 'slug': 'the-holiday-you-dont-need-to-leave-your-house-for', 'category': 'Culture', 'author': 'Beatrice Pagac', 'date': '2020-10-17T13:22:36.326Z', 'image': 'https://picsum.photos/300/180'}


7it [03:36, 32.11s/it]Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


{'_id': ObjectId('5f8af029369c4a000596d588'), 'title': 'The same thing happened in august what trend keeps repeating itself', 'slug': 'the-same-thing-happened-in-august-what-trend-keeps-repeating-itself', 'category': 'Culture', 'author': 'Esther Dickens', 'date': '2020-10-17T13:22:48.067Z', 'image': 'https://picsum.photos/300/180'}


8it [03:59, 29.30s/it]Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


{'_id': ObjectId('5f8af0423f11cc0008a51cfa'), 'title': 'The new iphone is the same as the last', 'slug': 'the-new-iphone-is-the-same-as-the-last', 'category': 'Tech', 'author': 'Alyssa Goodwin MD', 'date': '2020-10-17T13:23:12.505Z', 'image': 'https://picsum.photos/300/180'}


9it [04:10, 23.67s/it]Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


{'_id': ObjectId('5f8af05f5c3e0c00083ea0aa'), 'title': 'Bus in middle east blows up killed 89 children', 'slug': 'bus-in-middle-east-blows-up-killed-89-children', 'category': 'World', 'author': 'Merle Lynch', 'date': '2020-10-17T13:23:42.256Z', 'image': 'https://picsum.photos/300/180'}


10it [04:24, 20.86s/it]Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


{'_id': ObjectId('5f8b2754ca1d682824cf99d1'), 'title': 'Sex should be banned for people who are not same sex', 'slug': 'sex-should-be-banned-for-people-who-are-not-same-sex', 'category': 'Culture', 'author': 'Jack Connelly', 'date': '2020-10-17T17:18:11.613Z', 'image': 'https://picsum.photos/300/180'}


11it [05:29, 34.24s/it]Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


{'_id': ObjectId('5f8b2aff53036435980d0764'), 'title': 'Sweden to increase military spending by 40% as tension with Russia grows', 'slug': 'sweden-to-increase-military-spending-by-40-as-tension-with-russia-grows', 'category': 'World', 'author': 'Lindsay Ferry', 'date': '2020-10-17T17:33:50.099Z', 'image': 'https://picsum.photos/300/180'}


12it [06:09, 35.81s/it]Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


{'_id': ObjectId('5f8b2b46f8b02a3858816733'), 'title': 'Queensland election: LNP tries to keep anti-abortion push out of sight', 'slug': 'queensland-election-lnp-tries-to-keep-anti-abortion-push-out-of-sight', 'category': 'Politics', 'author': 'Mr. Marian Leffler', 'date': '2020-10-17T17:35:01.560Z', 'image': 'https://picsum.photos/300/180'}


13it [06:32, 32.06s/it]Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


{'_id': ObjectId('5f8b821d68fa84000899b826'), 'title': 'Apple ditching chargers saves costs but not the planet', 'slug': 'apple-ditching-chargers-saves-costs-but-not-the-planet', 'category': 'Tech', 'author': 'Kay Marquardt', 'date': '2020-10-17T23:45:32.291Z', 'image': 'https://picsum.photos/300/180'}


13it [08:09, 37.65s/it]


IndexError: index out of range in self