In [2]:
import re


def load_books_by_year(filepath):
    """Parse a plain-text listing of books grouped under year headings.

    The file is read line by line. A line consisting of exactly four digits
    starts a new year section; each following line shaped like
    ``<title> by <author>`` (the ``by`` separator is matched
    case-insensitively) becomes one record tagged with that year. Blank
    lines, lines that appear before the first year heading, and lines that do
    not match the pattern are skipped.

    Args:
        filepath: Path of the UTF-8 text file to read.

    Returns:
        list[dict]: One ``{'title', 'author', 'year'}`` dict per parsed line,
        in file order. If the file is missing or another error occurs while
        reading, a message is printed and the records collected so far
        (possibly none) are returned.
    """
    books = []
    current_year = None
    # Lazy first group: the line is split at the FIRST whitespace-delimited
    # "by", so everything after it (including any later "by") is the author.
    pattern = re.compile(r'^(.*?)\s+by\s+(.*)$', re.IGNORECASE)

    try:
        with open(filepath, 'r', encoding='utf-8') as file:
            for line in file:
                line = line.strip()

                if not line:
                    continue

                if line.isdigit() and len(line) == 4:
                    current_year = int(line)
                    continue

                # Compare against None explicitly: a heading such as "0000"
                # parses to 0, which is falsy but still a valid section.
                if current_year is None:
                    continue

                match = pattern.match(line)
                if match:
                    books.append({
                        'title': match.group(1).strip(),
                        'author': match.group(2).strip(),
                        'year': current_year
                    })

    except FileNotFoundError:
        print(f"File '{filepath}' not found.")
    except Exception as e:
        # Best-effort loader: report the problem and return what was parsed.
        print(f"File processing error: {e}")

    return books


# Parse 'source_data.txt' into a list of book dicts, then print the first
# three entries and the total number of parsed books as a quick sanity check.
all_books = load_books_by_year(filepath = 'source_data.txt')
print(all_books[:3])

print(len(all_books))

[{'title': 'Love Story', 'author': 'Erich Segal', 'year': 1970}, {'title': "The French Lieutenant's Woman", 'author': 'John Fowles', 'year': 1970}, {'title': 'Islands in the Stream', 'author': 'Ernest Hemingway', 'year': 1970}]
550


In [3]:
import pandas as pd


def load_book_info(filepath):
    """Read a semicolon-delimited CSV of book metadata into a lookup dict.

    Args:
        filepath: Path of the UTF-8 CSV file; it must provide ``title`` and
            ``author`` columns.

    Returns:
        dict: Maps ``(title, author)`` tuples to the full row as a dict. When
        several rows share a key, the last row wins. An empty dict is returned
        (after printing a message) if the file does not exist or a required
        column is missing.
    """
    try:
        frame = pd.read_csv(filepath, delimiter=';', encoding='utf-8')
        records = frame.to_dict(orient='records')
        # Key each row by its (title, author) pair.
        return {(row['title'], row['author']): row for row in records}
    except FileNotFoundError:
        print(f"File '{filepath}' not found.")
    except KeyError as e:
        print(f"File processing error: {e}")

    return {}


# def try_parse_float(value):
#     try:
#         return float(value)
#     except (ValueError, TypeError):
#         return None

# def try_parse_int(value):
#     try:
#         return int(float(value))
#     except (ValueError, TypeError):
#         return None


# Build the (title, author) -> row lookup from the metadata CSV.
book_info = load_book_info("book_info.csv")


# Spot-check three entries (positions 10 through 12) of the lookup.
print(list(book_info.items())[10:13])


[(('A Man in Full', 'Tom Wolfe'), {'title': 'A Man in Full', 'author': 'Tom Wolfe', 'categories': 'Fiction', 'pageCount': 756.0, 'averageRating': nan, 'ratingsCount': nan, 'publishedDate': '2010-04-01', 'description': "Tom Wolfe's THE BONFIRE OF THE VANITIES defined an era and established Wolfe as our prime fictional chronicler of America at its most outrageous and alive. In his #1 New York Times bestseller and National Book Award finalist, A MAN IN FULL, the setting shifts to Atlanta, Georgia—a racially mixed lat"}), (('A Painted House', 'John Grisham'), {'title': 'A Painted House', 'author': 'John Grisham', 'categories': 'Fiction', 'pageCount': 394.0, 'averageRating': nan, 'ratingsCount': nan, 'publishedDate': '2001', 'description': "It is the 1950s in Arkansas, the cotton season and to the Chandler family it means hard work, temporary workers and a host of complications. A story inspired by Grisham's own childhood in rural Arkansas. The narrator is a seven year old farm boy, who liv

In [3]:
def enrich_books(book_list, info_dict):
    """Merge metadata into each book, matching on title and author.

    Matching ignores letter case and surrounding whitespace. Books are updated
    in place: on a match every key of the metadata record is copied onto the
    book (overwriting same-named keys such as ``title``/``author``) and
    ``found`` is set to ``True``; otherwise ``found`` is set to ``False``.

    Args:
        book_list: List of dicts, each with string ``'title'`` and
            ``'author'`` values.
        info_dict: Mapping of ``(title, author)`` tuples to metadata dicts.
            Entries whose title or author is not a string (for example a NaN
            produced by a missing CSV cell) cannot be matched and are ignored.

    Returns:
        list[dict]: The same ``book_list`` object that was passed in, after
        enrichment.
    """
    # Re-key the metadata by a normalised (lower-cased, stripped) pair.
    info_by_key = {
        (title.strip().lower(), author.strip().lower()): data
        for (title, author), data in info_dict.items()
        if isinstance(title, str) and isinstance(author, str)
    }

    for book in book_list:
        key = (book['title'].strip().lower(), book['author'].strip().lower())
        match = info_by_key.get(key)

        if match is not None:
            book.update(match)
        # Set the flag after the merge so the metadata cannot override it.
        book['found'] = match is not None

    # Return the argument itself, not a notebook-level global.
    return book_list


# Attach CSV metadata to the parsed books. enrich_books updates the dicts
# inside all_books in place, so all_books itself carries the new keys too.
enriched_books = enrich_books(all_books, book_info)

# Preview the entries at index 1 and 2, then the total count.
print(enriched_books[1:3])
print(len(enriched_books))

[{'title': "The French Lieutenant's Woman", 'author': 'John Fowles', 'year': 1970, 'categories': nan, 'pageCount': 518.0, 'averageRating': nan, 'ratingsCount': nan, 'publishedDate': '1969', 'description': nan, 'found': True}, {'title': 'Islands in the Stream', 'author': 'Ernest Hemingway', 'year': 1970, 'categories': 'Fiction', 'pageCount': 496.0, 'averageRating': nan, 'ratingsCount': nan, 'publishedDate': '2014-05-22', 'description': "First published in 1970, nine years after Hemingway's death, this is the story of an artist and adventurer—a man much like Hemingway himself. Beginning in the 1930s, Islands in the Stream follows the fortunes of Thomas Hudson, from his experiences as a painter on the Gulf Stream island of Bimini thr", 'found': True}]
550


In [6]:
import json


def save_books_to_json(books, filepath):
    """Write ``books`` to ``filepath`` as pretty-printed UTF-8 JSON.

    Args:
        books: JSON-serialisable object to dump (here: a list of book dicts).
        filepath: Destination path; an existing file is overwritten.

    Note:
        Float NaN values are written by ``json.dump`` as the bare token
        ``NaN``, which Python's ``json`` module reads back but strict JSON
        parsers may reject.
    """
    with open(filepath, 'w', encoding='utf-8') as f:
        # Dump the argument that was passed in, not a notebook-level global.
        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        json.dump(books, f, ensure_ascii=False, indent=4)

    
# Persist the enriched book list to 'enriched_books.json'.
save_books_to_json(enriched_books, filepath = 'enriched_books.json')

In [None]:
from pprint import pprint


def top_longest_books(books, top_n=3):
    """Return the ``top_n`` books with the highest ``pageCount``.

    Args:
        books: Where the book data comes from. One of:
            * a path or file-like object for a semicolon-delimited UTF-8 CSV
              (the original behaviour),
            * a list/tuple of book dicts,
            * a pandas DataFrame.
            The data must provide a ``pageCount`` column/key.
        top_n: Maximum number of books to return.

    Returns:
        list[dict]: Up to ``top_n`` rows as dicts, sorted by ``pageCount`` in
        descending order. Rows whose ``pageCount`` is missing or not numeric
        are excluded. An empty list/tuple input yields an empty list.

    Raises:
        KeyError: If the data has no ``pageCount`` column.
    """
    if isinstance(books, pd.DataFrame):
        df = books
    elif isinstance(books, (list, tuple)):
        if not books:
            return []
        df = pd.DataFrame(list(books))
    else:
        df = pd.read_csv(books, delimiter=';', encoding='utf-8')

    # Coerce unparsable page counts to NaN so they can be filtered out below;
    # assign() builds a new frame, leaving a caller-supplied DataFrame intact.
    df = df.assign(pageCount=pd.to_numeric(df['pageCount'], errors='coerce'))

    df_valid = df[df['pageCount'].notna()]

    df_sort = df_valid.sort_values(by='pageCount', ascending=False).head(top_n)

    return df_sort.to_dict(orient='records')

# Pretty-print the longest books found in 'book_info.csv' (default top_n=3).
pprint(top_longest_books('book_info.csv'))

[{'author': 'Sarah J. Maas',
  'averageRating': nan,
  'categories': 'Fiction',
  'description': 'Lose yourself in the seductive world of the Court of Thorns '
                 'and Roses series by internationally bestselling author Sarah '
                 'J. Maas with this five-ebook bundle. "Passionate, violent, '
                 'sexy and daring." -USA TODAY on A Court of Thorns and Roses '
                 'Feyre is a huntress. The skin of a wolf would bring enough g',
  'pageCount': 2964.0,
  'publishedDate': '2022-05-05',
  'ratingsCount': nan,
  'title': 'A Court of Thorns and Roses'},
 {'author': 'Patricia Cornwell',
  'averageRating': 5.0,
  'categories': 'Fiction',
  'description': 'Five action-packed thrillers in the #1 New York Times '
                 'bestselling Scarpetta series: Cause of Death, Unnatural '
                 'Exposure, Point of Origin, and Trace. “Cornwell remains the '
                 'master of incorporating real-life science into '
                

In [14]:
from collections import Counter


def most_frequent_authors(books, top_n=3):
    """Rank authors by how many entries they have in ``books``.

    Args:
        books: Iterable of dicts, each carrying an ``'author'`` key.
        top_n: Number of leading ``(author, count)`` pairs to keep.

    Returns:
        list[tuple]: ``(author, count)`` pairs ordered by count, highest
        first; authors with equal counts keep first-seen order.
    """
    tally = Counter(entry['author'] for entry in books)
    # most_common() with no argument returns the full descending ranking;
    # slicing afterwards applies the top_n cut-off.
    ranking = tally.most_common()
    return ranking[:top_n]

# Show the authors with the most entries in the enriched list (default top_n=3).
most_frequent_authors(enriched_books)

[('Stephen King', 32), ('Danielle Steel', 32), ('John Grisham', 31)]