# My Sources Generation

In [3]:
from faker import Faker
# Shared fake-data generator; Post uses it for default author names and dates.
# NOTE(review): 'en_UK' is not a standard locale code (Great Britain is
# 'en_GB') — confirm Faker resolves it to the intended locale.
fake = Faker(['en_AU', 'en_US', 'en_UK'])

In [20]:
import os
from dataclasses import dataclass, field
from typing import List
from pathlib import Path
from datetime import datetime
import random
import json
from pprint import pprint

from transformers import pipeline
from textblob import TextBlob
from dotenv import load_dotenv
import pymongo
import requests
import matplotlib.pyplot as plt
from tqdm import tqdm

# Load variables from a .env file into the process environment; the Mongo
# connection cell reads its connection string via os.environ.get('mongo').
load_dotenv()

True

## Text Generation

In [5]:
# Text-generation pipeline on the PyTorch backend. No model is named, so the
# library's default model for this task is used.
generator = pipeline('text-generation', framework='pt')

In [6]:
def generate_article_from_title(title: str) -> str:
    """Generate article text by feeding ``title`` to the text-generation pipeline.

    Args:
        title: Prompt text passed to ``generator``.

    Returns:
        The ``generated_text`` of the first sequence returned by ``generator``.
    """
    # The target length is randomised so generated articles vary in size.
    target_length = random.randint(500, 1500)
    # Asking for more tokens than the model's context window makes generation
    # fail (GPT-2, for instance, has 1024 positions), so cap the request at the
    # tokenizer's advertised limit whenever one is available.
    tokenizer = getattr(generator, 'tokenizer', None)
    context_limit = getattr(tokenizer, 'model_max_length', None)
    if isinstance(context_limit, int) and context_limit > 0:
        target_length = min(target_length, context_limit)
    return generator(title, max_length=target_length)[0]['generated_text']

## Summarisation

In [8]:
# Summarisation pipeline using the library's default model for the task.
# NOTE(review): nothing in the visible cells calls `summariser`;
# generate_summary_from_article truncates the text instead — confirm whether
# this pipeline is still needed.
summariser = pipeline('summarization')

In [10]:
def generate_summary_from_article(article: str, max_chars: int = 20) -> str:
    """Return a short description for ``article`` by truncating it.

    This is plain truncation: no model is invoked, the result is simply the
    leading characters of the article.

    Args:
        article: Full article text.
        max_chars: Maximum number of leading characters to keep. Defaults to
            20, the length that used to be hard-coded.

    Returns:
        The first ``max_chars`` characters of ``article`` (the whole article
        when it is shorter than that).

    Raises:
        ValueError: If ``max_chars`` is negative.
    """
    if max_chars < 0:
        # A negative bound would slice from the end and silently drop the tail.
        raise ValueError("max_chars must be non-negative")
    return article[:max_chars]

## Keyword Finding

In [13]:
def find_keywords_in_article(article: str) -> List[str]:
    """Extract the distinct words of ``article`` that TextBlob tags as ``"NN"``.

    Only the exact tag ``"NN"`` matches; words carrying any other tag are
    excluded. Each keyword appears once, in order of first occurrence, so a
    noun repeated throughout a long article does not flood the tag list or the
    image search query built from it.

    Args:
        article: Text to analyse.

    Returns:
        Unique ``"NN"``-tagged words in first-seen order; empty when none match.
    """
    tagged_words = TextBlob(article).tags
    nouns = (word for word, tag in tagged_words if tag == "NN")
    # dict preserves insertion order, which gives an order-stable de-duplication.
    return list(dict.fromkeys(nouns))

In [14]:
# Try keyword extraction on a sample sentence; `kws` is reused by the
# image-fetching cells further down.
kws = find_keywords_in_article('On saturday, chris went to the hackathon and ate lots of pizza')

## Image Fetching

In [15]:
def image_url_from_keywords(keywords: List[str]) -> str:
    """Resolve an image URL for ``keywords`` via the Unsplash source endpoint.

    The keywords are sent as a comma-separated query to
    ``https://source.unsplash.com/1600x900/`` and the URL the request finally
    lands on is returned.

    Args:
        keywords: Search terms; an empty list sends an empty query.

    Returns:
        The ``url`` attribute of the response.

    Raises:
        requests.exceptions.RequestException: On network failure, including a
            timeout after 10 seconds.
    """
    # Imported locally so the function does not rely on a separate import cell.
    from urllib.parse import quote

    # Escape each keyword on its own so characters such as '&', '#' or ','
    # inside a keyword cannot break the comma-separated query.
    query = ','.join(quote(keyword, safe='') for keyword in keywords)
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(f"https://source.unsplash.com/1600x900/?{query}", timeout=10)
    return response.url

In [16]:
from IPython.display import Image

In [17]:
# Resolve an image URL for the sample keywords extracted earlier (performs a network request).
image_url = image_url_from_keywords(kws)

In [18]:
# Show the resolved URL as the cell output.
image_url

'https://images.unsplash.com/photo-1593246049226-ded77bf90326?crop=entropy&cs=tinysrgb&fit=crop&fm=jpg&h=900&ixid=eyJhcHBfaWQiOjF9&ixlib=rb-1.2.1&q=80&w=1600'

## Mongo Connection

In [21]:
# Connection string comes from the `mongo` environment variable. Fail fast when
# it is missing: MongoClient(None) would otherwise fall back to its default
# host and the notebook would silently read from and write to the wrong server.
mongo_uri = os.environ.get('mongo')
if not mongo_uri:
    raise RuntimeError("Environment variable 'mongo' is not set; cannot connect to MongoDB")
client = pymongo.MongoClient(mongo_uri)
# Posts live in the `sources` collection of the `news` database.
db = client.news
collection = db.sources

## Data Model

In [22]:
@dataclass
class Post:
    """A news post, mirroring the documents kept in the Mongo ``collection``.

    Every field is optional at construction time. ``author`` and ``date`` get
    fake defaults; ``fill_nones`` generates the remaining missing content.
    """

    title: str = None
    category: str = None
    # Random fake name when not supplied.
    author: str = field(default_factory=lambda: fake.name())
    description: str = None
    image: str = None
    slug: str = None
    html: str = None
    tags: List[str] = None
    # ISO-formatted fake date within the current year when not supplied.
    date: str = field(default_factory=lambda: fake.date_this_year().isoformat())
    # Database identifier; set when the post is built from a stored document.
    _id: str = None

    def to_dict(self):
        """Return the post's fields as a new dict.

        A shallow copy is returned so callers can modify the result without
        altering the post itself.
        """
        return dict(self.__dict__)

    def to_json(self):
        """Serialise the post to a JSON string.

        Values that json cannot encode natively (for example a database
        ObjectId held in ``_id``) are rendered with ``str``.
        """
        return json.dumps(self.to_dict(), default=str)

    def fill_nones(self):
        """Generate any missing content in place.

        Order matters: the description and tags are derived from ``html``, and
        the image is looked up from ``tags``. A placeholder image (a URL
        containing "picsum") is treated as missing and replaced.
        """
        if self.html is None:
            self.html = generate_article_from_title(title=self.title)
        if self.description is None:
            self.description = generate_summary_from_article(article=self.html)
        if self.tags is None:
            self.tags = find_keywords_in_article(self.html)
        if self.image is None or "picsum" in self.image:
            self.image = image_url_from_keywords(self.tags)

    def update(self):
        """Write this post's fields to the stored document with the same ``slug``."""
        # ``_id`` is immutable in MongoDB; sending it in $set (notably as None
        # for a post that was never loaded from the database) makes the update
        # fail, so it is left out of the payload.
        fields = {key: value for key, value in self.to_dict().items() if key != "_id"}
        collection.update_one({"slug": self.slug}, {'$set': fields})

    def pretty(self):
        """Pretty-print the post's fields."""
        pprint(self.to_dict())

## Fill Empties

In [31]:
# Preview the documents that still need a description. Matching on None also
# selects documents with no `description` field at all, as the output shows.
for empty in collection.find({"description": None}):
    print(empty)

{'_id': ObjectId('5f8b8d9a510e95000873d4e7'), 'title': "Order Some Bubble Tea And We'll Guess Your Zodiac Sign With 97.3% Accuracy", 'slug': 'order-some-bubble-tea-and-well-guess-your-zodiac-sign-with-973-accuracy', 'category': 'Culture', 'author': 'Joel Kassulke III', 'date': '2020-10-18T00:34:33.350Z', 'image': 'https://picsum.photos/300/180'}
{'_id': ObjectId('5f8b8ed796a0da0007759956'), 'title': 'The Catholic Church is ‘disgraced’ by ‘selling out to China’', 'slug': 'the-catholic-church-is-disgraced-by-selling-out-to-china', 'category': 'World', 'author': 'Jackie Kris', 'date': '2020-10-18T00:39:50.050Z', 'image': 'https://picsum.photos/300/180'}
{'_id': ObjectId('5f8b8f0175e6fa0008203097'), 'title': 'Invest in Amazon from $250. You could earn up to $2000!', 'slug': 'invest-in-amazon-from-250-you-could-earn-up-to-2000', 'category': 'Business', 'author': 'Johnathan Ebert', 'date': '2020-10-18T00:40:32.232Z', 'image': 'https://picsum.photos/300/180'}


In [None]:
# Fetch every matching document up front: generation is slow, and a server-side
# cursor left idle between batches can expire mid-run. A list also lets tqdm
# report a total instead of an open-ended counter.
empties = list(collection.find({"description": None}))
for empty in tqdm(empties):
    print(empty)
    to_fill = Post(**empty)
    to_fill.fill_nones()
    # _id is immutable and already identifies the document in the filter, so
    # keep it out of the $set payload.
    changes = {key: value for key, value in to_fill.to_dict().items() if key != "_id"}
    collection.update_one({"_id": to_fill._id}, {'$set': changes})

0it [00:00, ?it/s]Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


{'_id': ObjectId('5f8b8d9a510e95000873d4e7'), 'title': "Order Some Bubble Tea And We'll Guess Your Zodiac Sign With 97.3% Accuracy", 'slug': 'order-some-bubble-tea-and-well-guess-your-zodiac-sign-with-973-accuracy', 'category': 'Culture', 'author': 'Joel Kassulke III', 'date': '2020-10-18T00:34:33.350Z', 'image': 'https://picsum.photos/300/180'}
