In [5]:
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
import numpy as np

In [6]:
URL = "https://www.passiton.com/inspirational-quotes?page="
# Bound the wait so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error instead of silently parsing an error page.
r = requests.get(URL, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.content, 'html5lib')

quotes = []  # one dict per quote with keys: theme, url, img, lines, author
table = soup.find('div', attrs={'id': 'all_quotes'})
if table is None:
    # Without this guard the loop below would die with an opaque
    # "'NoneType' object has no attribute ..." error.
    raise ValueError("Could not find the 'all_quotes' container in the page; "
                     "the site layout may have changed.")

# The card class uses col-lg-4 instead of col-lg-3 because the site changed its class.
for row in table.find_all('div', attrs={'class': 'col-6 col-lg-4 text-center margin-30px-bottom sm-margin-30px-top'}):
    # The image alt text is split on " #": the first piece is stored as the
    # quote text and the second piece as the author.
    alt_parts = row.img['alt'].split(" #")
    quotes.append({
        'theme': row.h5.text,
        'url': row.a['href'],
        'img': row.img['src'],
        'lines': alt_parts[0],
        # Tolerate alt text without a " #" marker instead of raising IndexError.
        'author': alt_parts[1] if len(alt_parts) > 1 else '',
    })


In [7]:
filename = 'raw_data.csv'
# Write explicitly as UTF-8 rather than the platform default encoding, so the
# file matches the UTF-8 encoding used when it is read back with pandas.
# newline='' lets the csv module control line endings itself.
with open(filename, 'w', newline='', encoding='utf-8') as f:
    w = csv.DictWriter(f, ['theme', 'url', 'img', 'lines', 'author'])
    w.writeheader()
    # One CSV row per scraped quote dict.
    w.writerows(quotes)

In [8]:
# Load the scraped quotes back from disk; row 0 of the file supplies the column names.
df = pd.read_csv(
    'raw_data.csv',
    sep=',',
    header=0,
    encoding='UTF-8',
)
# Preview the first ten rows.
df.head(n=10)

Unnamed: 0,theme,url,img,lines,author
0,CARING,/inspirational-quotes/8393-i-cannot-do-all-the...,https://assets.passiton.com/quotes/quote_artwo...,I cannot do all the good that the world needs....,<Author:0x00007f170f166f70>
1,CARING,/inspirational-quotes/7557-caring-about-the-ha...,https://assets.passiton.com/quotes/quote_artwo...,"Caring about the happiness of others, we find ...",<Author:0x00007f170f1c6998>
2,CARING,/inspirational-quotes/3278-without-a-sense-of-...,https://assets.passiton.com/quotes/quote_artwo...,"Without a sense of caring, there can be no sen...",<Author:0x00007f170f1b4950>
3,CARING,/inspirational-quotes/3505-unless-someone-like...,https://assets.passiton.com/quotes/quote_artwo...,Unless someone like you cares a whole awful lo...,<Author:0x00007f170ee2ead8>
4,CARING,/inspirational-quotes/3203-the-capacity-to-car...,https://assets.passiton.com/quotes/quote_artwo...,The capacity to care is the thing which gives ...,<Author:0x00007f170f1ec2d8>
5,CURIOSITY,/inspirational-quotes/6799-above-all-watch-wit...,https://assets.passiton.com/quotes/quote_artwo...,"Above all, watch with glittering eyes the whol...",<Author:0x00007f170f1d5df8>
6,CURIOSITY,/inspirational-quotes/4543-be-curious-not-judg...,https://assets.passiton.com/quotes/quote_artwo...,"Be curious, not judgmental.",<Author:0x00007f170f21f700>
7,CURIOSITY,/inspirational-quotes/8329-curiosity-is-more-i...,https://assets.passiton.com/quotes/quote_artwo...,Curiosity is more important than knowledge.,<Author:0x00007f170f20d708>
8,CURIOSITY,/inspirational-quotes/8328-i-think-at-a-childs...,https://assets.passiton.com/quotes/quote_artwo...,"I think, at a child's birth, if a mother could...",<Author:0x00007f170f257240>
9,CURIOSITY,/inspirational-quotes/4014-somewhere-something...,https://assets.passiton.com/quotes/quote_artwo...,"Somewhere, something incredible is waiting to ...",<Author:0x00007f170f244fa0>


In [9]:
# Columns that receive the per-column cleaning steps below.
text_columns = ['theme', 'url', 'img', 'lines', 'author']

# Handle empty data:
# drop rows in which every field is missing.
df = df.dropna(how='all')

# Handle duplicated data:
# drop rows that are exact duplicates of an earlier row.
df = df.drop_duplicates()

# Handle missing data: fill each column's gaps with that column's most
# frequent value. This must happen BEFORE the cast to str below, because
# astype(str) turns NaN into the literal string 'nan', after which fillna
# has nothing left to fill.
fill_values = {}
for column in text_columns:
    most_common = df[column].mode()
    # mode() is empty when the column holds no non-missing values at all.
    if not most_common.empty:
        fill_values[column] = most_common.iloc[0]
if fill_values:
    # Frame-level fillna returns a new frame, avoiding chained in-place
    # assignment on a column (deprecated and a no-op under Copy-on-Write).
    df = df.fillna(value=fill_values)

# Handle wrongly-typed data: treat every column as text.
df = df.astype(str)

# Handle Unicode errors: drop any characters that cannot be encoded as UTF-8.
for column in text_columns:
    df[column] = df[column].apply(lambda x: x.encode('utf-8', 'ignore').decode('utf-8'))

# Handle data holding several pieces of information that need to be split apart
# (no step is implemented here).

In [10]:
# Preview the first five rows of the cleaned frame.
df.head(n=5)

Unnamed: 0,theme,url,img,lines,author
0,CARING,/inspirational-quotes/8393-i-cannot-do-all-the...,https://assets.passiton.com/quotes/quote_artwo...,I cannot do all the good that the world needs....,<Author:0x00007f170f166f70>
1,CARING,/inspirational-quotes/7557-caring-about-the-ha...,https://assets.passiton.com/quotes/quote_artwo...,"Caring about the happiness of others, we find ...",<Author:0x00007f170f1c6998>
2,CARING,/inspirational-quotes/3278-without-a-sense-of-...,https://assets.passiton.com/quotes/quote_artwo...,"Without a sense of caring, there can be no sen...",<Author:0x00007f170f1b4950>
3,CARING,/inspirational-quotes/3505-unless-someone-like...,https://assets.passiton.com/quotes/quote_artwo...,Unless someone like you cares a whole awful lo...,<Author:0x00007f170ee2ead8>
4,CARING,/inspirational-quotes/3203-the-capacity-to-car...,https://assets.passiton.com/quotes/quote_artwo...,The capacity to care is the thing which gives ...,<Author:0x00007f170f1ec2d8>


In [11]:
# Persist the cleaned quotes. The DataFrame index is written as the first,
# unnamed column. NOTE(review): pass index=False if that column is not wanted
# by whatever reads clean_data.csv — confirm with consumers first.
df.to_csv(path_or_buf='clean_data.csv', index=True)

In [None]:
#DONE