# #readMoreCanlit | Notebook 3: Data cleaning

### Imports

In [1]:
# pandas and numpy
import pandas as pd
import numpy as np

# nltk imports
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer

# scikit-learn imports
from sklearn.compose import make_column_transformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel 
from sklearn.pipeline import Pipeline, make_pipeline

# Presentation and visuals
import seaborn as sns
import matplotlib.pyplot as plt

# This magic line will allow you to generate plots
# within the Jupyter notebook.
%matplotlib inline
from pprint import pprint
# Raise pandas' display limits: up to 2000 items of a sequence and up to
# 4000 rows of a frame are printed before the output is truncated.
pd.set_option("display.max_seq_items", 2000)
pd.set_option("display.max_rows", 4000)
# Optional, currently disabled: show every column as well.
# pd.set_option("display.max_columns", None)

# other imports
import json
import lxml
from lxml import html
import random
import regex as re
import requests
import time
import urllib.request
from datetime import datetime



# Data cleaning

In [2]:
# Read in the Canadian data and cast every column to str so the frame holds
# text only.
# `astype(str)` replaces the element-wise `applymap(str)`, which pandas has
# deprecated since 2.1; for this conversion the result is the same.
# NOTE: with object-dtype columns the cast turns missing values into the
# literal string 'nan' rather than keeping them as NaN.
canadian = pd.read_csv('../data/processed/canadian_books.csv').astype(str)

# Last expression of the cell: shown as (rows, columns).
canadian.shape

(8004, 5)

In [3]:
# Show the freshly loaded frame as the cell's output (pandas truncates the
# middle rows when the frame is longer than the display limit).
canadian

Unnamed: 0,id,title,author,description,image
0,0,01 Nathaniel Mcdaniel and Bigbeards Hook,Evan Solomon,"Meet Nathaniel McDaniel, the mischievous hero ...",https://images.49thshelf.com/var/ezflow_site/s...
1,1,02 Standard of Honor Book Two of the Templar T...,Jack Whyte,Jack Whyte’s thrilling Templar Trilogy continu...,https://images.49thshelf.com/var/ezflow_site/s...
2,2,03 Knights Templar Order in Chaos,Jack Whyte,"In the final novel in the Templar Trilogy, Wil...",https://images.49thshelf.com/var/ezflow_site/s...
3,3,100 Easy-to-Grow Native Plants for Canadian Ga...,Lorraine Johnson,The key to a carefree garden is to know which ...,https://images.49thshelf.com/var/ezflow_site/s...
4,4,10 Women,George Bowering,Ten Women is a new collection of short fiction...,https://images.49thshelf.com/var/ezflow_site/s...
...,...,...,...,...,...
7999,7999,Zero Day,Ezekiel Boone,"The wildly entertaining, deeply satisfying fin...",https://images.49thshelf.com/var/ezflow_site/s...
8000,8000,Zip's File,Shannon Maguire,Zip's File: A Romance of Silence explores the ...,https://images.49thshelf.com/var/ezflow_site/s...
8001,8001,Zolitude,Paige Cooper,WINNER OF THE 2018 QUEBEC WRITERS' FEDERATION ...,https://images.49thshelf.com/var/ezflow_site/s...
8002,8002,Zoo and Crowbar,David Zieroth,The Wind has mysteriously caused the death of ...,https://images.49thshelf.com/var/ezflow_site/s...


In [4]:
# Remove repeated book records, keeping the first occurrence of each.
# `id` is excluded from the comparison: in the rows displayed above it simply
# mirrors the row index, so including it makes every row unique and the call
# can never drop anything.
# NOTE(review): confirm that `id` is only a row counter and not a catalogue key.
# The result is reassigned instead of using inplace=True, so re-running the
# cell is harmless and the expression stays chainable.
canadian = canadian.drop_duplicates(
    subset=canadian.columns.difference(['id']).tolist()
)

In [5]:
# Display the frame again after the drop_duplicates step to inspect the result.
canadian

Unnamed: 0,id,title,author,description,image
0,0,01 Nathaniel Mcdaniel and Bigbeards Hook,Evan Solomon,"Meet Nathaniel McDaniel, the mischievous hero ...",https://images.49thshelf.com/var/ezflow_site/s...
1,1,02 Standard of Honor Book Two of the Templar T...,Jack Whyte,Jack Whyte’s thrilling Templar Trilogy continu...,https://images.49thshelf.com/var/ezflow_site/s...
2,2,03 Knights Templar Order in Chaos,Jack Whyte,"In the final novel in the Templar Trilogy, Wil...",https://images.49thshelf.com/var/ezflow_site/s...
3,3,100 Easy-to-Grow Native Plants for Canadian Ga...,Lorraine Johnson,The key to a carefree garden is to know which ...,https://images.49thshelf.com/var/ezflow_site/s...
4,4,10 Women,George Bowering,Ten Women is a new collection of short fiction...,https://images.49thshelf.com/var/ezflow_site/s...
...,...,...,...,...,...
7999,7999,Zero Day,Ezekiel Boone,"The wildly entertaining, deeply satisfying fin...",https://images.49thshelf.com/var/ezflow_site/s...
8000,8000,Zip's File,Shannon Maguire,Zip's File: A Romance of Silence explores the ...,https://images.49thshelf.com/var/ezflow_site/s...
8001,8001,Zolitude,Paige Cooper,WINNER OF THE 2018 QUEBEC WRITERS' FEDERATION ...,https://images.49thshelf.com/var/ezflow_site/s...
8002,8002,Zoo and Crowbar,David Zieroth,The Wind has mysteriously caused the death of ...,https://images.49thshelf.com/var/ezflow_site/s...


In [10]:
# List the dtype of each column (the recorded output shows `object` for all five).
canadian.dtypes

id             object
title          object
author         object
description    object
image          object
dtype: object

In [8]:
# Write the frame to the "ready for model" CSV.
# The `index` option is left at its default, so the row index is written as
# the first, unnamed column of the file.
canadian.to_csv(path_or_buf='../data/processed/canadian_ready_for_model.csv')