# Scraping with Pandas

In [1]:
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.dialects.postgresql import insert
from sqlalchemy import table, column

## Scraping from wiki sites

In [2]:
#We can use the `read_html` function in Pandas to automatically scrape any tabular data from a page.
#listing the urls for needed data
url1 = 'https://en.wikipedia.org/wiki/Animal_Crossing'
url2 = 'https://en.wikipedia.org/wiki/Characters_in_the_Animal_Crossing_series'
url3=  'https://animalcrossing.fandom.com/wiki/Category:Special_characters'


In [3]:
#scraping the data from wiki/Animal_Crossing
tables = pd.read_html(url1)
tables

# What we get in return is a list of dataframes for any tabular data that Pandas found stored in tables dataframe
# pulling required data from the second index in the , 'tables'list
Sales_summary = tables[2]
Sales_summary.columns = ['Name', 'Year.', 'Units_Sold', 'Critic_Score']

#We can slice off any of those dataframes that we want using normal indexing.
Sales_summary["Units_Sold"] = Sales_summary["Units_Sold"].str.split("[", n = 1, expand = True)[0]
Sales_summary["Critic_Score"] = Sales_summary["Critic_Score"].str.split("/", n = 1, expand = True)[0]
Sales_summary.head()

Unnamed: 0,Name,Year.,Units_Sold,Critic_Score
0,Animal Crossing,2001,2.32,87
1,Animal Crossing: Wild World,2005,11.75,86
2,Animal Crossing: City Folk,2008,3.38,73
3,Animal Crossing: New Leaf,2012,12.45,88
4,Animal Crossing: New Horizons,2020,13.41,90


In [4]:
#scraping the data from 'https://en.wikipedia.org/wiki/Characters_in_the_Animal_Crossing_series'
tables1 = pd.read_html(url2)
tables1

[        Character                                Original Name  \
 0         Alfonso                               アルベルト (Albert)   
 1          Apollo                                  アポロ (Aporo)   
 2          Blanca               あやしいねこ (lit. "Suspicious Cat")   
 3        Blathers                                 フータ (Hooter)   
 4          Booker  おまわりさんB・もんばんさんA (Policeman B; Gatekeeper A)   
 5           Cesar                                   アラン (Alan)   
 6            Chip                                うおまさ (Uomasa)   
 7          Copper  おまわりさんA・もんばんさんB (Policeman A; Gatekeeper B)   
 8          Cyrano                           さくらじま (Sakurajima)   
 9        Blathers                                 フータ (Hooter)   
 10    Don Resetti                         ラケットさん (Raketto-san)   
 11         Farley                                 ファーリー (Fārī)   
 12       Franklin                          フランクリン (Furankurin)   
 13         Gracie                                 グレース (Grace

In [5]:
#Extracting and storing the required data froms tables1
Animal_Forest = tables1[0]
Wild_World = tables1[1]
City_Folk = tables1[2]
New_Leaf = tables1[3]
New_Horizons = tables1[4]

#Naming the columns for each dataframe
Animal_Forest.columns = ['Character', 'Original_Name', 'Species', 'Description']
Wild_World.columns = ['Character', 'Orginal_Name.', 'Species', 'Description']
City_Folk.columns = ['Character', 'Orginal_Name.', 'Species', 'Description']
New_Leaf.columns = ['Character', 'Orginal_Name.', 'Species', 'Description']
New_Horizons.columns = ['Character', 'Orginal_Name.', 'Species', 'Description']

#extracting required columns from each dataframe
Animal_Forest['Name']='Animal_Forest'
Wild_World['Name']='Wild_World'
City_Folk['Name']='City_Folk'
New_Leaf['Name']='New_Leaf'
New_Horizons['Name']='New_Horizons'

#creating a new dataframe characters from the extracted columns
characters=pd.concat([Animal_Forest, Wild_World,City_Folk,New_Leaf,New_Horizons]).drop_duplicates()
characters= characters[['Character', 'Species', 'Description']]
characters

Unnamed: 0,Character,Species,Description
0,Alfonso,Alligator,A lazy villager. Also appears in the 2006 film...
1,Apollo,Eagle,A grumpy villager. Also appears in the 2006 fi...
2,Blanca,Cat,A faceless cat who the player to draw their fa...
3,Blathers,Owl,Operates the museum. Brother to Celeste.
4,Booker,Dog (bulldog),Works as a police officer in Animal Crossing a...
...,...,...,...
1,Daisy Mae,Boar,Sells turnips. Granddaughter of Joan.
2,Flick,Chameleon,Hosts the Bug-Off. Roommate and business partn...
3,Orville,Dodo,Works as the receptionist for Dodo Airlines. B...
4,Raymond,Cat,A smug villager. Noted by outlets as being esp...


In [6]:
#scraping the data from https://animalcrossing.fandom.com/wiki/Category:Special_characters
tables = pd.read_html(url3)
Special_Character = tables[0]

Special_Character.columns = ['Name', 'Image.', 'Species', 'Birthday','service']
Special_Character=Special_Character[['Name',  'Species', 'Birthday','service']]
Special_Character

Unnamed: 0,Name,Species,Birthday,service
0,Blanca,Cat,February 8th,"April Fools Day Host (NL) Town Visitor (AF, AC..."
1,Blathers,Owl,September 24th,"Museum Curator (AF, AC, WW, CF, NL, NH)"
2,Booker,Dog (Bulldog),April 23rd,"Police Station Officer (AC, NL) Gate Guard (WW..."
3,Brewster,Pigeon,October 15th,"The Roost Shopkeeper (WW, CF, NL, NH) Gyroid ..."
4,C.J.,Beaver,March 7th,Fishing Tourney Host (NH)
...,...,...,...,...
60,Tortimer,Tortoise,May 30th,"Mayor (AF, AC, WW, CF) Animal Island Games Dir..."
61,Wendell,Walrus,May 30th,"Town Visitor (AF, AC, WW, CF) Dream Visitor (NL)"
62,Wilbur,Dodo,,Dodo Airlines Pilot (NH)
63,Wisp,Spirit,May 30th,"Town Visitor (AF, AC, NH) Lamp Genie (CF, NL)"


### Sourcing the data from kaggle.com

In [7]:
#loading critic_review
csv_file = "Resources/critic.csv"
critic_df = pd.read_csv(csv_file)
critic_df.count()

grade          107
publication    107
text           107
date           107
dtype: int64

In [9]:
#checking the dataframe headers
critic_df

Unnamed: 0,grade,publication,text,date
0,100,Pocket Gamer UK,"Animal Crossing; New Horizons, much like its p...",3/16/2020
1,100,Forbes,Know that if you’re overwhelmed with the world...,3/16/2020
2,100,Telegraph,"With a game this broad and lengthy, there’s mo...",3/16/2020
3,100,VG247,Animal Crossing: New Horizons is everything I ...,3/16/2020
4,100,Nintendo Insider,"Above all else, Animal Crossing: New Horizons ...",3/16/2020
...,...,...,...,...
102,90,Impulsegamer,Animal Crossing New Horizons is pure fun and p...,4/16/2020
103,90,PLAY! Zine,Animal Crossing: New Horizons is a definitive ...,4/17/2020
104,95,GameGrin,New Horizons is simply the best Animal Crossin...,4/22/2020
105,90,NF Magazine,I can't wait to see what the future will hold....,5/1/2020


In [10]:
#creating new dataframe with required columns
new_critic_df = critic_df[['grade', 'publication', 'date']].copy()
new_crtic_df = new_critic_df.drop_duplicates()
new_critic_df

Unnamed: 0,grade,publication,date
0,100,Pocket Gamer UK,3/16/2020
1,100,Forbes,3/16/2020
2,100,Telegraph,3/16/2020
3,100,VG247,3/16/2020
4,100,Nintendo Insider,3/16/2020
...,...,...,...
102,90,Impulsegamer,4/16/2020
103,90,PLAY! Zine,4/17/2020
104,95,GameGrin,4/22/2020
105,90,NF Magazine,5/1/2020


In [11]:
#loading items
csv_file = "Resources/items.csv"
items_df = pd.read_csv(csv_file)
items_df.head()
len(items_df)

new_items_df = items_df[['num_id', 'name', 'category', 'sell_value', 'sell_currency', 'buy_value', 'buy_currency']].copy()
new_items_df.drop_duplicates(inplace=True)

new_items_df = new_items_df.dropna()
new_items_df

Unnamed: 0,num_id,name,category,sell_value,sell_currency,buy_value,buy_currency
0,12,3D Glasses,Accessories,122.0,bells,490.0,bells
1,14,A Tee,Tops,140.0,bells,560.0,bells
2,17,Abstract Wall,Wallpaper,390.0,bells,1560.0,bells
3,19,Academy Uniform,Dresses,520.0,bells,2080.0,bells
5,21,Accessories Stand,Furniture,375.0,bells,1500.0,bells
...,...,...,...,...,...,...,...
4560,7425,Zigzag Shirt,Tops,240.0,bells,960.0,bells
4561,7428,Zipper's Poster,Photos,250.0,bells,1000.0,bells
4562,7441,Zori,Shoes,1075.0,bells,4300.0,bells
4563,7442,Zucker's Photo,Photos,10.0,bells,40.0,bells


In [12]:
#read user review file and load it into datframe
file = "Resources/user_reviews.csv"
user_df = pd.read_csv(file)
user_df

#identifying incomplete rows
user_df.count()

##output for df count shows that there are no rows with null values

#Clean the user_review dataframe and keeping required columns
#drop the user_name column and any duplicate records
user_review_df=user_df[['grade', 'text', 'date']].copy()
user_review_df.drop_duplicates(inplace=True)
user_review_df


Unnamed: 0,grade,text,date
0,4,My gf started playing before me. No option to ...,3/20/2020
1,5,"While the game itself is great, really relaxin...",3/20/2020
2,0,My wife and I were looking forward to playing ...,3/20/2020
3,0,We need equal values and opportunities for all...,3/20/2020
4,0,BEWARE! If you have multiple people in your h...,3/20/2020
...,...,...,...
2994,1,1 Island for console limitation.I cannot play ...,5/3/2020
2995,1,"Per giocare con figli o fidanzate, mogli o per...",5/3/2020
2996,0,One island per console is a pathetic limitatio...,5/3/2020
2997,2,Even though it seems like a great game with ma...,5/3/2020


In [13]:
#read villagers file and load it into datframe
file = "Resources/villagers.csv"
villagers_df = pd.read_csv(file)
villagers_df

#identifying incomplete rows
villagers_df.count()

##above result shows that song and id columns have null values
#drop the rows with null values
villagers_df.dropna(inplace=True)
villagers_df.count()

#drop duplicates
villagers_df.drop_duplicates(inplace = True)
villagers_df.count()

villagers_df.head(10)

#Clean dataframe
villagers_data_df=villagers_df[['name','gender','species', 'birthday', 'personality', 'song', 'phrase']].copy()
villagers_data_df

Unnamed: 0,name,gender,species,birthday,personality,song,phrase
0,Admiral,male,bird,27-Jan,cranky,Steep Hill,aye aye
1,Agent S,female,squirrel,2-Jul,peppy,DJ K.K.,sidekick
2,Agnes,female,pig,21-Apr,uchi,K.K. House,snuffle
3,Al,male,gorilla,18-Oct,lazy,Steep Hill,Ayyeeee
4,Alfonso,male,alligator,9-Jun,lazy,Forest Life,it'sa me
...,...,...,...,...,...,...,...
386,Winnie,female,horse,31-Jan,peppy,My Place,hay-OK
387,Wolfgang,male,wolf,25-Nov,cranky,K.K. Song,snarrrl
388,Yuka,female,koala,20-Jul,snooty,Soulful K.K.,tsk tsk
389,Zell,male,deer,7-Jun,smug,K.K. D&B,pronk


In [14]:
#establish connecting with postgres database
rds_connection_string = "postgres:postgres@localhost:5432/animal_crossing_db"

engine = create_engine(f'postgresql://{rds_connection_string}')

In [15]:
#creating or appending tables as the case may be
Sales_summary.to_sql(name='sales_summary', con=engine, if_exists='append', index=False)

Special_Character.to_sql(name='special_character', con=engine, if_exists='append', index=False)

characters.to_sql(name='characters', con=engine, if_exists='append', index=False)

villagers_data_df.to_sql(name='villagers', con=engine, if_exists='append', index=False)

In [16]:
new_critic_df.to_sql(name='critic_review', con=engine, if_exists='append', index=False)

IntegrityError: (psycopg2.errors.UniqueViolation) duplicate key value violates unique constraint "critic_review_pkey"
DETAIL:  Key (publication)=(Pocket Gamer UK) already exists.

[SQL: INSERT INTO critic_review (grade, publication, date) VALUES (%(grade)s, %(publication)s, %(date)s)]
[parameters: ({'grade': 100, 'publication': 'Pocket Gamer UK', 'date': '3/16/2020'}, {'grade': 100, 'publication': 'Forbes', 'date': '3/16/2020'}, {'grade': 100, 'publication': 'Telegraph', 'date': '3/16/2020'}, {'grade': 100, 'publication': 'VG247', 'date': '3/16/2020'}, {'grade': 100, 'publication': 'Nintendo Insider', 'date': '3/16/2020'}, {'grade': 100, 'publication': 'Trusted Reviews', 'date': '3/16/2020'}, {'grade': 100, 'publication': 'VGC', 'date': '3/16/2020'}, {'grade': 100, 'publication': 'God is a Geek', 'date': '3/16/2020'}  ... displaying 10 of 107 total bound parameter sets ...  {'grade': 90, 'publication': 'NF Magazine', 'date': '5/1/2020'}, {'grade': 80, 'publication': 'PCMag', 'date': '5/1/2020'})]
(Background on this error at: http://sqlalche.me/e/gkpj)

In [17]:
user_review_df.to_sql(name='user_reviews', con=engine, if_exists='append', index=False)

In [18]:
new_items_df.to_sql(name='items', con=engine, if_exists='append', index=False)

In [19]:
engine.table_names()

['critic_review',
 'sales_summary',
 'special_character',
 'villagers',
 'characters',
 'user_reviews',
 'items']

In [None]:
pd.read_sql_query('select * from sales_summary', con=engine).head(10)

pd.read_sql_query('select * from characters', con=engine).head()

pd.read_sql_query('select * from special_character', con=engine).head()
pd.read_sql_query('select * from villagers', con=engine).head()

In [21]:
pd.read_sql_query('select * from critic_review', con=engine)

Unnamed: 0,grade,publication,date
0,100,Pocket Gamer UK,3/16/2020
1,100,Forbes,3/16/2020
2,100,Telegraph,3/16/2020
3,100,VG247,3/16/2020
4,100,Nintendo Insider,3/16/2020
...,...,...,...
102,90,Impulsegamer,4/16/2020
103,90,PLAY! Zine,4/17/2020
104,95,GameGrin,4/22/2020
105,90,NF Magazine,5/1/2020
