/
main.py
106 lines (89 loc) · 5.89 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
from datetime import datetime
from typing import List, Optional

from date_guesser import guess_date, Accuracy
from fastapi import FastAPI, HTTPException, Query
from langdetect import detect, detect_langs
from newspaper import Article
from pydantic import BaseModel, Field

# Application instance; title, description and version are passed straight to
# the FastAPI constructor as project metadata.
app = FastAPI(
    title="Article Scraper FastAPI Example Project",
    description=(
        "This is an example project using FastAPI. "
        "This project re-implements another project built with the Flask framework. "
        "The original Flask implementation can be found here: "
        "https://github.com/kotartemiy/extract-news-api"
    ),
    version="0.0.1",
)


class ArticleOut(BaseModel):
    """Response model returned by the ``/v0/article`` endpoint.

    ``source_url`` is the only required field. Every other field defaults to
    ``None`` and is therefore annotated ``Optional``: the request handler
    assigns ``None`` explicitly when a value cannot be determined (for
    example when language detection fails), and a non-optional annotation
    would reject that value under strict validation.
    """

    # --- Published date (best guess) and how it was obtained ---
    published_date: Optional[datetime] = Field(
        None,
        title="The published date of the article",
        description="The published date of the article returned as a datetime. The published date is a 'best guess', the accuracy and method via which the published date has been determine are also returned as part of the response.",
    )
    published_date_method: Optional[str] = Field(
        None,
        title="The method used to determine the published date",
        description="The method with which the published date of the article has been determined, returned as a string. The string will be a human readable explanation of how the date was determined.",
    )
    published_date_accuracy: Optional[str] = Field(
        None,
        title="The level of accuracy for the published date",
        description="The level of accuracy with which the published date of the article has been determined. The default value is None, if the date could not be determined. If the date has been determined there are 3 levels of accuracy, partial, date, datetime.",
    )

    # --- Core article content ---
    source_url: str = Field(
        ...,
        title="The url of the article",
        description="The url of the source article. The url is set by the client when the request is made.",
    )
    title: Optional[str] = Field(None, title="The title of the article")
    title_lang: Optional[str] = Field(
        None,
        title="The language used for the title of the article",
        description="The language used in the title of the source article. The language is auto-detected, accuracy is not guaranteed.",
    )
    text: Optional[str] = Field(
        None,
        title="The text of the article",
        description="The text of the article, extracted from the article source url",
    )
    text_lang: Optional[str] = Field(
        None,
        title="The language used for the text of the article",
        description="The language used in the text of the source article. The language is auto-detected, accuracy is not guaranteed.",
    )
    authors: Optional[list] = Field(
        None,
        title="The author(s) of the article",
        description="The author(s) of the article.",
    )

    # --- Metadata taken from the page ---
    tags: Optional[list] = Field(
        None,
        title="The tag(s) associated with the article",
        description="The tags(s) associated with the article.",
    )
    meta_keywords: Optional[list] = Field(
        None,
        title="The meta keywords associated with the article",
        description="The meta keywords associated with the article, as derived from the meta keywords tag in the head section of the article.",
    )
    meta_description: Optional[str] = Field(
        None,
        title="The meta description of the article",
        description="The meta desription of the article, as set in the meta_description head tag.",
    )
    meta_lang: Optional[str] = Field(
        None,
        title="The language of the article",
        description="The language as set in the meta_lang field present in the head tags of the article.",
    )

    # --- Images (URLs only) ---
    top_image: Optional[str] = Field(
        None,
        title="The url for the top image from the article",
        description="The image in the top (mast) position from the article. The URL of the image is returned. The image itself is not returned.",
    )
    meta_image: Optional[str] = Field(
        None,
        title="The url for the meta image from the article",
        description="The image defined in the meta fields from the article. The URL of the image is returned. The image itself is not returned. The meta image will be found in the og:image or og:image_url meta properties.",
    )
    images: Optional[list] = Field(
        None,
        title="The url of each image found in the article",
        description="A list of the images found in the article. The image is represented by it's URL.",
    )

    class Config:
        # Name given to this model's schema.
        title = "Article"


def _detect_language(text):
    """Return what ``langdetect.detect`` reports for *text*, or None on failure."""
    try:
        return detect(text)
    except Exception:
        # Best-effort: a detection failure must not fail the whole request.
        return None


def _guess_published_date(url, article):
    """Return ``(date, method, accuracy_label)`` for a parsed article.

    Uses ``guess_date`` on the article's HTML. If that call fails, falls back
    to the date newspaper extracted, with no method or accuracy.
    """
    try:
        guess = guess_date(url=url, html=article.html)
    except Exception:
        # newspaper exposes the parsed date as ``publish_date``.
        return article.publish_date, None, None

    # Accuracy.NONE (or any unlisted value) maps to None via .get().
    accuracy_labels = {
        Accuracy.PARTIAL: 'partial',
        Accuracy.DATE: 'date',
        Accuracy.DATETIME: 'datetime',
    }
    return guess.date, guess.method, accuracy_labels.get(guess.accuracy)


@app.get("/v0/article", response_model=ArticleOut)
def get_article(
    url: str = Query(
        ...,
        title="Article URL",
        description="The URL of the requested article.",
    )
):
    """Download, parse and describe the article found at *url*.

    Args:
        url: The article URL, supplied as a required query parameter.

    Returns:
        An ``ArticleOut`` populated with the article's content, best-guess
        published date, detected languages, metadata and image URLs.

    Raises:
        HTTPException: With status 404 when the download does not end in
            state 2.
    """
    article = Article(url)
    article.download()

    # 2 is presumably newspaper's ArticleDownloadState.SUCCESS -- confirm
    # against the installed newspaper version.
    if article.download_state != 2:
        raise HTTPException(status_code=404, detail="Article was not found")

    article.parse()

    published_date, date_method, date_accuracy = _guess_published_date(url, article)

    article_response = {
        # Article
        'source_url': article.url,
        'title': article.title,
        'text': article.text,
        'authors': list(article.authors),
        # Date detection
        'published_date': published_date,
        'published_date_method': date_method,
        'published_date_accuracy': date_accuracy,
        # Language detection
        'title_lang': _detect_language(article.title),
        'text_lang': _detect_language(article.text),
        # Meta / other
        'meta_description': article.meta_description,
        'meta_lang': article.meta_lang,
        # NOTE(review): ArticleOut declares no ``meta_favicon`` field, so this
        # value is presumably dropped from the response -- either add the
        # field to the model or remove this entry.
        'meta_favicon': article.meta_favicon,
        'meta_keywords': list(article.meta_keywords),
        'tags': list(article.tags),
        # Images
        'images': list(article.images),
        'meta_image': article.meta_img,
        'top_image': article.top_image,
    }
    return ArticleOut(**article_response)