# #readMoreCanlit | Notebook 3: Data cleaning

### Imports

In [14]:
# pandas and numpy
import pandas as pd
import numpy as np

# nltk imports
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer

# scikit-learn imports
from sklearn.compose import make_column_transformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel 
from sklearn.pipeline import Pipeline, make_pipeline

# Presentation and visuals
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

# This magic line will allow you to generate plots
# within the Jupyter notebook.
%matplotlib inline
from pprint import pprint
# Raise pandas' display limits: sequences of up to 2000 items and frames of
# up to 4000 rows are rendered without truncation.
pd.set_option("display.max_seq_items", 2000)
pd.set_option("display.max_rows", 4000)

# pd.set_option('display.max_columns', None)

# other imports
import json
import lxml
from lxml import html
import random
import regex as re
import requests
import time
import urllib.request
from datetime import datetime



## Data cleaning

### Canadian

In [47]:
# Stopwords list of meta-critical commentary vocabulary (prize names, review
# outlets, publishing jargon) to be removed from the Canadian dataset as part
# of preprocessing. Entries are lower-case, unique, and kept in alphabetical
# order; the duplicated 'finalist' and 'giller prize' entries were removed.
# NOTE(review): several entries are multi-word phrases, which only match if
# removal runs on raw text (substring/regex) rather than on single tokens —
# confirm against the preprocessing step that consumes this list.
literary_stopwords = [
    'amazon best book of the year',
    'arthur ellis',
    'astonishing',
    'author',
    'award',
    'award-winning',
    'best',
    'best book of the year',
    'bestseller',
    'bestselling',
    'book',
    'book award',
    'boston globe',
    'canada reads',
    'category',
    'character',
    'classic',
    'critically acclaimed',
    'debut',
    'entertainment weekly',
    'epic',
    'finalist',
    'foremost',
    'giller prize',
    'globe and mail',
    'governor generals award',
    'governor generals literary award',
    'harpercollins',
    'heralded',
    'highly anticipated',
    'kobo',
    'literary',
    'literature',
    'longlisted',
    'national bestseller',
    'nationally',
    'new york times',
    'novel',
    'prize',
    'prize-winning',
    'publish',
    'publishers weekly',
    'rogers writers trust',
    'scotiabank',
    'shortlisted',
    'story',
    'ubc',
    'widely anticipated',
    'winner',
    'writers trust fiction prize',
    'writers trust of canada',
]

In [78]:
# Read in the Canadian data and cast every cell to text.
#
# astype(str) replaces the original applymap(str): DataFrame.applymap is
# deprecated (pandas >= 2.1), while astype(str) is a single vectorised call
# that is available across pandas versions.
# NOTE(review): casting to str can turn missing values into the literal
# string 'nan', which a later dropna() will not catch — confirm the source
# file has no gaps in the columns that matter.
canadian = pd.read_csv('../data/processed/canadian_books.csv').astype(str)
canadian.shape

(6775, 6)

In [79]:
# Create a wordcloud from the Canadian descriptions

# text = canadian.description.values
# wordcloud = WordCloud(
#     width = 3000,
#     height = 2000,
#     background_color = 'white',
#     stopwords = STOPWORDS).generate(str(text))
# fig = plt.figure(
#     figsize = (40, 30),
#     facecolor = 'k',
#     edgecolor = 'k')
# plt.imshow(wordcloud, interpolation = 'bilinear')
# plt.axis('off')
# plt.tight_layout(pad=0)
# plt.show()

In [80]:
# Drop the image column. Rebinding the result (rather than inplace=True)
# together with errors='ignore' makes this cell safe to re-run: a second
# execution no longer raises KeyError once the column is already gone.
canadian = canadian.drop(columns=['image'], errors='ignore')

In [81]:
# Display the Canadian frame after the image column has been removed.
canadian

Unnamed: 0,id,origin,title,author,description
0,0,Canadian,01 Nathaniel Mcdaniel and Bigbeards Hook,Evan Solomon,"Meet Nathaniel McDaniel, the mischievous hero ..."
1,1,Canadian,02 Standard of Honor Book Two of the Templar T...,Jack Whyte,Jack Whyte’s thrilling Templar Trilogy continu...
2,2,Canadian,03 Knights Templar Order in Chaos,Jack Whyte,"In the final novel in the Templar Trilogy, Wil..."
3,3,Canadian,100 Easy-to-Grow Native Plants for Canadian Ga...,Lorraine Johnson,The key to a carefree garden is to know which ...
4,4,Canadian,10 Women,George Bowering,Ten Women is a new collection of short fiction...
...,...,...,...,...,...
6770,6770,Canadian,Zero Day,Ezekiel Boone,"The wildly entertaining, deeply satisfying fin..."
6771,6771,Canadian,Zip's File,Shannon Maguire,Zip's File: A Romance of Silence explores the ...
6772,6772,Canadian,Zolitude,Paige Cooper,WINNER OF THE 2018 QUEBEC WRITERS' FEDERATION ...
6773,6773,Canadian,Zoo and Crowbar,David Zieroth,The Wind has mysteriously caused the death of ...


### International

In [82]:
# Load the international titles; the file is decoded as ISO-8859-1 (Latin-1)
# instead of pandas' default UTF-8.
international = pd.read_csv(
    '../data/processed/international_books.csv',
    encoding="ISO-8859-1",
)

In [83]:
# Peek at the first five rows of the international data.
international.head()

Unnamed: 0,id,origin,title,author,description
0,6775,international,3:59,Gretchen Mcneil,<p>From <b>Possess</b> and <b>Ten</b> author G...
1,6776,international,125,000 copies sold in its first edition<br>,
2,6777,international,1776,David Mccullough,"In this masterful book, David McCullough tells..."
3,6778,international,1984,George Orwell,&lt;b&gt;Renowned urban artist Shepard Fairey'...
4,6779,international,2312,Kim Stanley Robinson,The Hugo and Nebula nominated and <i>New York ...


In [84]:
# Dimensions before rows with missing values are dropped.
international.shape

(10806, 5)

In [85]:
# Keep only the rows in which every column is populated
# (dropna defaults: axis=0, how='any').
international = international.dropna()

In [86]:
# Dimensions after dropping rows that contain any missing value.
international.shape

(10704, 5)

In [87]:
# Inspect the first 50 rows that remain after the dropna step.
international.head(50)

Unnamed: 0,id,origin,title,author,description
0,6775.0,international,3:59,Gretchen Mcneil,<p>From <b>Possess</b> and <b>Ten</b> author G...
2,6777.0,international,1776,David Mccullough,"In this masterful book, David McCullough tells..."
3,6778.0,international,1984,George Orwell,&lt;b&gt;Renowned urban artist Shepard Fairey'...
4,6779.0,international,2312,Kim Stanley Robinson,The Hugo and Nebula nominated and <i>New York ...
5,6780.0,international,2666,Roberto Bolaño,&lt;p&gt;&lt;b&gt;A NATIONAL BOOK CRITICS CIRC...
6,6791.0,international,$20 Per Gallon: How The Inevitable Rise In The...,Christopher Steiner,Imagine an everyday world in which the price o...
7,6792.0,international,"$3 Meals: Feed Your Family Delicious, Healthy ...",Ellen Brown,"<p class=""null1"">250+ cost-busting, simple, he..."
8,,international,. . . And His Lovely Wife: A Campaign Memoir F...,Connie Schultz,"<p>Writing with warmth and humor, Connie Schul..."
9,6802.0,international,. . . If You Were There When They Signed The C...,Elizabeth Levy,<p>If you were there when they signed the Cons...
10,6803.0,international,...and A Hard Rain Fell: A Gi's True Story Of ...,John Ketwig,"""A magnetic, bloody, moving, and worm's-eye vi..."


In [88]:
# Stack the Canadian and international frames into a single dataset.
# ignore_index=True renumbers the rows 0..n-1; without it each source keeps
# its own row labels (both start at 0), so the combined index would contain
# duplicates and label-based lookups such as books.loc[0] would return
# several rows.
books = pd.concat([canadian, international], ignore_index=True)

In [90]:
# Preview the combined frame (Canadian rows come first in the concat order).
books.head()

Unnamed: 0,id,origin,title,author,description
0,0,Canadian,01 Nathaniel Mcdaniel and Bigbeards Hook,Evan Solomon,"Meet Nathaniel McDaniel, the mischievous hero ..."
1,1,Canadian,02 Standard of Honor Book Two of the Templar T...,Jack Whyte,Jack Whyte’s thrilling Templar Trilogy continu...
2,2,Canadian,03 Knights Templar Order in Chaos,Jack Whyte,"In the final novel in the Templar Trilogy, Wil..."
3,3,Canadian,100 Easy-to-Grow Native Plants for Canadian Ga...,Lorraine Johnson,The key to a carefree garden is to know which ...
4,4,Canadian,10 Women,George Bowering,Ten Women is a new collection of short fiction...


In [92]:
# Persist the combined dataset. index=False is not passed, so the DataFrame
# index is written as the first, unnamed column of the CSV.
books.to_csv('../data/processed/books.csv')