This notebook contains code that cleans and extracts information from the pre-existing movies dataset. It also contains code to scrape similar data from the goodreads Top Books of the Twentieth Century list. These two dataframes are merged into one and unnecessary information is dropped. The final output is combined dataframe and this is output to a csv file called 'books_and_movies.csv'. 

In [230]:
import pandas as pd
filename = 'MovieSummaries/plot_summaries.txt'
movie_summaries_df = pd.read_csv(filename, delimiter="\t", header=None, names = ['Wikipedia Movie ID', 'Synopsis']).set_index('Wikipedia Movie ID')
movie_summaries_df.sort_index(inplace=True)
print (movie_summaries_df)

                                                             Synopsis
Wikipedia Movie ID                                                   
330                 In order to prepare the role of an important o...
3217                After being pulled through a time portal, Ash ...
3333                 The film follows two juxtaposed families: the...
3746                {{Hatnote}} In Los Angeles, November 2019, ret...
3837                In the American Old West of 1874, construction...
...                                                               ...
37373877            According to Horrorfest Online, six people wit...
37473592            Thoppul kodihttp://qualitymp3.blogspot.in/2011...
37478048            Anand Verma, a widower and father of a child, ...
37492363            When Clover's ' childhood friend, Cherries ', ...
37501922            Two adolescent children of wealthy parents dea...

[42303 rows x 1 columns]


In [231]:
# Read other movie data from 'movie.metadata.tsv' into another data frame and drop unnecessary columns such as Freebase Movie ID, Revenue, Release Date, Runtime, Languages and Countries

metadata_file = 'MovieSummaries/movie.metadata.tsv'

movie_metadata_df = pd.read_csv(metadata_file, delimiter="\t", header=None, names = ['Wikipedia Movie ID', 'Freebase Movie ID', 'Name', 'Release Date', 'Revenue', 'Runtime', 'Languages', 'Countries', 'Genres']).set_index('Wikipedia Movie ID')

movie_metadata_df.drop(columns = ['Freebase Movie ID', 'Release Date', 'Revenue', 'Runtime', 'Languages', 'Countries'], inplace=True)
movie_metadata_df.sort_index(inplace=True)
print(movie_metadata_df)


                                     Name  \
Wikipedia Movie ID                          
330                               Actrius   
3217                     Army of Darkness   
3333                The Birth of a Nation   
3746                         Blade Runner   
3837                      Blazing Saddles   
...                                   ...   
37473592                     Thoppul Kodi   
37476824                  I Love New Year   
37478048                      Mr. Bechara   
37492363              Cherries and Clover   
37501922                   Terminal Bliss   

                                                               Genres  
Wikipedia Movie ID                                                     
330                 {"/m/07s9rl0": "Drama", "/m/01t_vv": "Comedy-d...  
3217                {"/m/01q03": "Cult", "/m/03npn": "Horror", "/m...  
3333                {"/m/06ppq": "Silent film", "/m/0219x_": "Indi...  
3746                {"/m/01jfsb": "Thriller", "/m/01qp

In [517]:
import re

# join the two dataframes 
movies_df = movie_summaries_df.join(movie_metadata_df)

movies_df['Genres'] = movies_df['Genres'].apply(lambda x: re.findall(r'".*?":\s+"(.*?)"[,|}]',str(x)))

movies_df

Unnamed: 0_level_0,Synopsis,Name,Genres
Wikipedia Movie ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
330,In order to prepare the role of an important o...,Actrius,"[Drama, Comedy-drama]"
3217,"After being pulled through a time portal, Ash ...",Army of Darkness,"[Cult, Horror, Stop motion, Costume drama, Act..."
3333,The film follows two juxtaposed families: the...,The Birth of a Nation,"[Silent film, Indie, Costume drama, Epic, Blac..."
3746,"{{Hatnote}} In Los Angeles, November 2019, ret...",Blade Runner,"[Thriller, Cyberpunk, Science Fiction, Future ..."
3837,"In the American Old West of 1874, construction...",Blazing Saddles,"[Western, Satire, Comedy]"
...,...,...,...
37373877,"According to Horrorfest Online, six people wit...",Crazy Eights,"[Cult, Horror]"
37473592,Thoppul kodihttp://qualitymp3.blogspot.in/2011...,Thoppul Kodi,[Drama]
37478048,"Anand Verma, a widower and father of a child, ...",Mr. Bechara,[Comedy film]
37492363,"When Clover's ' childhood friend, Cherries ', ...",Cherries and Clover,"[Comedy film, Drama, Romance Film]"


In [161]:
# get html content of Goodreads Best Books of the 20th Century list 

from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

page_url = 'https://www.goodreads.com/list/show/6.Best_Books_of_the_20th_Century'

def get_page_html(url):
    content = requests.get(url)
    return content.text
    
top_books_html = get_page_html(page_url)

In [171]:
#build web scraper to get title, author, and link to details page for books and put info into dataframe

def get_all_books(books_df):
    for i in range(2,79):
        url = 'https://www.goodreads.com/list/show/6.Best_Books_of_the_20th_Century?page={}'.format(i)
        html = get_page_html(url)
        books_df = books_df.append(get_books(html), ignore_index=True)
    return books_df
        

def get_books(top_books_html):
    parser = BeautifulSoup(top_books_html, 'html.parser')
    books = parser.find('table', class_='tableList js-dataTooltip').find_all('tr', itemscope ='')
    book_data = [get_info(book) for book in books]
    return pd.DataFrame(data=book_data, columns = ['Name','Author','Detail_URL'])

 
def get_info(book):
    title = book.find('a', class_='bookTitle').get_text().strip()
    author = book.find('span', itemprop='author').get_text().strip()
    link = book.find('a').get('href')
    link = str('https://www.goodreads.com' + link)
    return title, author, link 

books_df = get_books(top_books_html)
books_df = get_all_books(books_df)
books_df

Unnamed: 0,Name,Author,Detail_URL
0,"To Kill a Mockingbird (To Kill a Mockingbird, #1)",Harper Lee,https://www.goodreads.com/book/show/2657.To_Ki...
1,1984,George Orwell,https://www.goodreads.com/book/show/5470.1984
2,The Great Gatsby,F. Scott Fitzgerald,https://www.goodreads.com/book/show/4671.The_G...
3,Harry Potter and the Sorcerer's Stone (Harry P...,J.K. Rowling,https://www.goodreads.com/book/show/3.Harry_Po...
4,Animal Farm,George Orwell,https://www.goodreads.com/book/show/7613.Anima...
...,...,...,...
7795,The Proteus Operation,James P. Hogan,https://www.goodreads.com/book/show/849493.The...
7796,"Kingdom of Shadows (Night Soldiers, #6)",Alan Furst,https://www.goodreads.com/book/show/253556.Kin...
7797,"Na klar, Lotta kann Rad fahren",Astrid Lindgren,https://www.goodreads.com/book/show/990527.Na_...
7798,Hanu Ancuței. Baltagul,Mihail Sadoveanu,https://www.goodreads.com/book/show/28231145-h...


In [251]:
mockingbird_url = 'https://www.goodreads.com/book/show/73687.The_Clan_of_the_Cave_Bear_the_Valley_of_Horses_the_Mammoth_Hunters_the_Plains_of_Passage_The_Shelters_of_Stone'
mockingbird_html = get_page_html(mockingbird_url) 

In [252]:
# Now we make a function that gets the book synopsis and genres from the book details url 
def extract_book_details(html):
    parser = BeautifulSoup(html, 'html.parser')
    
    if parser.find('div', id='description') == None:
        synopsis = parser.find('div', id='description')
    elif len(parser.find('div', id='description').find_all('span')) > 1:
        synopsis = parser.find('div', id='description').find_all('span')[1].get_text()
    else:
        synopsis = parser.find('div', id='descriptionContainer').find_all('span')[0].get_text()
    genres = [a.get_text().strip().split()[0] for a in parser.find('div', class_='rightContainer').find_all('div', class_='stacked')[1].find_all('div', class_='elementList')]
    
    book_details = dict()
    book_details['Synopsis']= synopsis
    book_details['Genres'] = genres 
    return book_details 
extract_book_details(mockingbird_html)
    

{'Synopsis': "Contains Five Audiobooks by Jean M. Auel: The Clan of the Cave Bear, The Valley of Horses, The Mammoth Hunters, The Plains of Passage, and The Shelters of Stone, and One Soul? MP3-CD Audiobook Player, Model DMP-206b The Clan of the Cave Bear: A remarkable epic of one woman's odyssey - filled with mystery and magic. The Valley of Horses: A timeless epic of the dawn of civilization. Ayla sets out on her own odyssey of discovery away from the nurturing adoptive family and friends of the Clan. The Mammoth Hunters: An epic novel of love, knowledge, jealousy, and hard choices. Ayla sets out from the valley on Whinney, the horse she tamed. The Plains of Passage: In The Plains of Passage, orphaned Ayla and wandering Jondalar search for a place on Earth they can call home. The Shelters of Stone: Jondalar and Ayla journey to the home of his people, and find people wary of the beautiful young woman he has brought back, with her strange accent and her tame wolf and horses.",
 'Genres

In [510]:
# get the url of the books and set the index as the Name of the respective book
urls = books_df.set_index('Name')['Detail_URL']
urls

Name
To Kill a Mockingbird (To Kill a Mockingbird, #1)                                             https://www.goodreads.com/book/show/2657.To_Ki...
1984                                                                                              https://www.goodreads.com/book/show/5470.1984
The Great Gatsby                                                                              https://www.goodreads.com/book/show/4671.The_G...
Harry Potter and the Sorcerer's Stone (Harry Potter, #1)                                      https://www.goodreads.com/book/show/3.Harry_Po...
Animal Farm                                                                                   https://www.goodreads.com/book/show/7613.Anima...
                                                                                                                    ...                        
The Glass Palace                                                                              https://www.goodreads.com/book/show/7

In [225]:
# cycle through the urls for all the books and extract all the book details and append to a list
ls = []
for url in urls:
    html = get_page_html(url)
    ls+=[extract_book_details(html)]


AttributeError: 'NoneType' object has no attribute 'find_all'

In [557]:
print(len(ls))

2326


In [518]:
# Take first 2326 books from books_df since the list of book details is only 2326 books long 
books = books_df.drop(books_df.index[2326:])
books

Unnamed: 0,Name,Author,Detail_URL
0,"To Kill a Mockingbird (To Kill a Mockingbird, #1)",Harper Lee,https://www.goodreads.com/book/show/2657.To_Ki...
1,1984,George Orwell,https://www.goodreads.com/book/show/5470.1984
2,The Great Gatsby,F. Scott Fitzgerald,https://www.goodreads.com/book/show/4671.The_G...
3,Harry Potter and the Sorcerer's Stone (Harry P...,J.K. Rowling,https://www.goodreads.com/book/show/3.Harry_Po...
4,Animal Farm,George Orwell,https://www.goodreads.com/book/show/7613.Anima...
...,...,...,...
2321,The Glass Palace,Amitav Ghosh,https://www.goodreads.com/book/show/77103.The_...
2322,Company K,William March,https://www.goodreads.com/book/show/632164.Com...
2323,The October Country,Ray Bradbury,https://www.goodreads.com/book/show/93251.The_...
2324,"The Redneck Manifesto: How Hillbillies, Hicks,...",Jim Goad,https://www.goodreads.com/book/show/81201.The_...


In [533]:
# convert list of books details to a dataframe and add the names of the books as indices 
book_details = pd.DataFrame(ls)
book_details.insert(0,'Name', urls.index) 
book_details


Unnamed: 0,Name,Synopsis,Genres
0,"To Kill a Mockingbird (To Kill a Mockingbird, #1)",The unforgettable novel of a childhood in a sl...,"[Classics, Fiction, Historical, Academic]"
1,1984,"The year 1984 has come and gone, but George Or...","[Classics, Fiction, Science, Science]"
2,The Great Gatsby,Alternate Cover Edition ISBN: 0743273567 (ISBN...,"[Classics, Fiction, Academic, Literature, Hist..."
3,Harry Potter and the Sorcerer's Stone (Harry P...,Harry Potter's life is miserable. His parents ...,"[Fantasy, Young, Fiction]"
4,Animal Farm,George Orwell's timeless and timely allegorica...,"[Classics, Fiction, Science, Fantasy, Literature]"
...,...,...,...
2321,The Glass Palace,Set in Burma during the British invasion of 18...,"[Fiction, Historical, Cultural, Cultural, Hist..."
2322,Company K,With an Introduction by Philip D. Beidler This...,"[Fiction, War, Historical, War, War]"
2323,The October Country,Ray Bradbury's second short story collection i...,"[Horror, Short, Fiction, Fantasy, Science, Cla..."
2324,"The Redneck Manifesto: How Hillbillies, Hicks,...",Culture maverick Jim Goad presents a thoroughl...,"[Nonfiction, Politics, Sociology, History, Race]"


In [537]:
# merge the original books dataframe and the book details dataframe into one 
merged = pd.merge(books, book_details, right_index=True, left_index=True, on='Name')
merged

Unnamed: 0,Name,Author,Detail_URL,Synopsis,Genres
0,"To Kill a Mockingbird (To Kill a Mockingbird, #1)",Harper Lee,https://www.goodreads.com/book/show/2657.To_Ki...,The unforgettable novel of a childhood in a sl...,"[Classics, Fiction, Historical, Academic]"
1,1984,George Orwell,https://www.goodreads.com/book/show/5470.1984,"The year 1984 has come and gone, but George Or...","[Classics, Fiction, Science, Science]"
2,The Great Gatsby,F. Scott Fitzgerald,https://www.goodreads.com/book/show/4671.The_G...,Alternate Cover Edition ISBN: 0743273567 (ISBN...,"[Classics, Fiction, Academic, Literature, Hist..."
3,Harry Potter and the Sorcerer's Stone (Harry P...,J.K. Rowling,https://www.goodreads.com/book/show/3.Harry_Po...,Harry Potter's life is miserable. His parents ...,"[Fantasy, Young, Fiction]"
4,Animal Farm,George Orwell,https://www.goodreads.com/book/show/7613.Anima...,George Orwell's timeless and timely allegorica...,"[Classics, Fiction, Science, Fantasy, Literature]"
...,...,...,...,...,...
2321,The Glass Palace,Amitav Ghosh,https://www.goodreads.com/book/show/77103.The_...,Set in Burma during the British invasion of 18...,"[Fiction, Historical, Cultural, Cultural, Hist..."
2322,Company K,William March,https://www.goodreads.com/book/show/632164.Com...,With an Introduction by Philip D. Beidler This...,"[Fiction, War, Historical, War, War]"
2323,The October Country,Ray Bradbury,https://www.goodreads.com/book/show/93251.The_...,Ray Bradbury's second short story collection i...,"[Horror, Short, Fiction, Fantasy, Science, Cla..."
2324,"The Redneck Manifesto: How Hillbillies, Hicks,...",Jim Goad,https://www.goodreads.com/book/show/81201.The_...,Culture maverick Jim Goad presents a thoroughl...,"[Nonfiction, Politics, Sociology, History, Race]"


In [538]:
# drop the url and author columns in teh final dataframe as this information is no longer needed 
final_books_df = merged.drop(columns = ['Author', 'Detail_URL'])
final_books_df

Unnamed: 0,Name,Synopsis,Genres
0,"To Kill a Mockingbird (To Kill a Mockingbird, #1)",The unforgettable novel of a childhood in a sl...,"[Classics, Fiction, Historical, Academic]"
1,1984,"The year 1984 has come and gone, but George Or...","[Classics, Fiction, Science, Science]"
2,The Great Gatsby,Alternate Cover Edition ISBN: 0743273567 (ISBN...,"[Classics, Fiction, Academic, Literature, Hist..."
3,Harry Potter and the Sorcerer's Stone (Harry P...,Harry Potter's life is miserable. His parents ...,"[Fantasy, Young, Fiction]"
4,Animal Farm,George Orwell's timeless and timely allegorica...,"[Classics, Fiction, Science, Fantasy, Literature]"
...,...,...,...
2321,The Glass Palace,Set in Burma during the British invasion of 18...,"[Fiction, Historical, Cultural, Cultural, Hist..."
2322,Company K,With an Introduction by Philip D. Beidler This...,"[Fiction, War, Historical, War, War]"
2323,The October Country,Ray Bradbury's second short story collection i...,"[Horror, Short, Fiction, Fantasy, Science, Cla..."
2324,"The Redneck Manifesto: How Hillbillies, Hicks,...",Culture maverick Jim Goad presents a thoroughl...,"[Nonfiction, Politics, Sociology, History, Race]"


In [550]:
# Clean up movies dataframe by dropping the wikipedia id and reordering the columns to match the books dataframe 
final_movies_df = movies_df.reset_index()
final_movies_df.drop(columns=['Wikipedia Movie ID'], inplace=True)
final_movies_df = final_movies_df[['Name', 'Synopsis', 'Genres']]
final_movies_df

Unnamed: 0,Name,Synopsis,Genres
0,Actrius,In order to prepare the role of an important o...,"[Drama, Comedy-drama]"
1,Army of Darkness,"After being pulled through a time portal, Ash ...","[Cult, Horror, Stop motion, Costume drama, Act..."
2,The Birth of a Nation,The film follows two juxtaposed families: the...,"[Silent film, Indie, Costume drama, Epic, Blac..."
3,Blade Runner,"{{Hatnote}} In Los Angeles, November 2019, ret...","[Thriller, Cyberpunk, Science Fiction, Future ..."
4,Blazing Saddles,"In the American Old West of 1874, construction...","[Western, Satire, Comedy]"
...,...,...,...
42298,Crazy Eights,"According to Horrorfest Online, six people wit...","[Cult, Horror]"
42299,Thoppul Kodi,Thoppul kodihttp://qualitymp3.blogspot.in/2011...,[Drama]
42300,Mr. Bechara,"Anand Verma, a widower and father of a child, ...",[Comedy film]
42301,Cherries and Clover,"When Clover's ' childhood friend, Cherries ', ...","[Comedy film, Drama, Romance Film]"


In [553]:
# append the movies dataframe to the books dataframe and reset the index so the indices are ordered correctly 
books_and_movies_df = final_books_df.append(final_movies_df).reset_index()
books_and_movies_df.drop(columns=['index'], inplace=True)
books_and_movies_df

Unnamed: 0,Name,Synopsis,Genres
0,"To Kill a Mockingbird (To Kill a Mockingbird, #1)",The unforgettable novel of a childhood in a sl...,"[Classics, Fiction, Historical, Academic]"
1,1984,"The year 1984 has come and gone, but George Or...","[Classics, Fiction, Science, Science]"
2,The Great Gatsby,Alternate Cover Edition ISBN: 0743273567 (ISBN...,"[Classics, Fiction, Academic, Literature, Hist..."
3,Harry Potter and the Sorcerer's Stone (Harry P...,Harry Potter's life is miserable. His parents ...,"[Fantasy, Young, Fiction]"
4,Animal Farm,George Orwell's timeless and timely allegorica...,"[Classics, Fiction, Science, Fantasy, Literature]"
...,...,...,...
44624,Crazy Eights,"According to Horrorfest Online, six people wit...","[Cult, Horror]"
44625,Thoppul Kodi,Thoppul kodihttp://qualitymp3.blogspot.in/2011...,[Drama]
44626,Mr. Bechara,"Anand Verma, a widower and father of a child, ...",[Comedy film]
44627,Cherries and Clover,"When Clover's ' childhood friend, Cherries ', ...","[Comedy film, Drama, Romance Film]"


In [556]:
# write the final books and movies dataframe to a csv file 
filename='books_and_movies.csv'
books_and_movies_df.to_csv(filename, index=False)