# Scraping with Pandas

In [1]:
import os
from urllib.parse import quote

import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy import inspect
from sqlalchemy.dialects.postgresql import insert
from sqlalchemy import table, column

We can use the `read_html` function in Pandas to automatically scrape any tabular data from a page.

In [2]:
# Pages to scrape, in the order they are used below.
url1 = 'https://en.wikipedia.org/wiki/Animal_Crossing'  # series overview page
url2 = 'https://en.wikipedia.org/wiki/Characters_in_the_Animal_Crossing_series'  # characters page
url3 = 'https://animalcrossing.fandom.com/wiki/Category:Special_characters'  # special characters category

In [3]:
# Download url1 and let pandas turn the tabular data it finds into DataFrames;
# the bare name on the last line displays the resulting list.
tables = pd.read_html(io=url1)
tables

[               Animal Crossing  \
 0  Animal Crossing series logo   
 1                     Genre(s)   
 2                 Developer(s)   
 3                 Publisher(s)   
 4                   Creator(s)   
 5                  Platform(s)   
 6                First release   
 7               Latest release   
 8                    Spin-offs   
 
                                    Animal Crossing.1  
 0                        Animal Crossing series logo  
 1                                  Social simulation  
 2                     Nintendo EADNintendo EPDNDcube  
 3                                           Nintendo  
 4                 Katsuya EguchiHisashi Nogami[1][2]  
 5  Nintendo 64iQue PlayerGameCubeWiiWii UNintendo...  
 6                      Animal CrossingApril 14, 2001  
 7        Animal Crossing: New HorizonsMarch 20, 2020  
 8      Happy Home DesignerAmiibo FestivalPocket Camp  ,
        0                          1
 0   2001            Animal Crossing
 1   2002    

What we get in return is a list of dataframes for any tabular data that Pandas found.

We can slice off any of those dataframes that we want using normal indexing.

In [5]:
# Take the third scraped table (index 2) as the sales / review summary.
# NOTE(review): the position depends on the live page layout - confirm index 2
# still holds this table if the column assignment below starts failing.
# Copy first so that renaming columns does not silently mutate the frame that
# is still stored inside `tables`.
Sales_summary = tables[2].copy()

# Assigning the labels raises ValueError if the table does not have exactly
# four columns, which guards against picking up the wrong table.
Sales_summary.columns = ['Name', 'Year.', 'Units_Sold', 'Critic_Score']
Sales_summary.head()

Unnamed: 0,Name,Year.,Units_Sold,Critic_Score
0,Animal Crossing,2001,2.32[36][37],87/100[38]
1,Animal Crossing: Wild World,2005,11.75[39],86/100[40]
2,Animal Crossing: City Folk,2008,3.38[41],73/100[42]
3,Animal Crossing: New Leaf,2012,12.45[43],88/100[44]
4,Animal Crossing: New Horizons,2020,13.41 [45],90/100[46]


Next, scrape the characters page; it holds one table of characters per game.

In [None]:
# Download url2 and collect the tabular data pandas finds as a list of
# DataFrames; the bare name on the last line displays that list.
tables1 = pd.read_html(io=url2)
tables1

In [None]:
# Column labels shared by every per-game character table.
CHARACTER_COLUMNS = ['Character', 'Orginal_Name.', 'Species', 'Description']


def _label_game_table(raw_table, game_name):
    """Return a copy of one scraped character table, relabelled and tagged.

    Parameters
    ----------
    raw_table : pandas.DataFrame
        One table from `tables1`; it must have exactly four columns,
        otherwise assigning the labels raises ValueError.
    game_name : str
        Value written to the added `Name` column for every row.

    Returns
    -------
    pandas.DataFrame
        A new frame; `raw_table` itself is left untouched, so re-running the
        cell always starts from the scraped data.
    """
    labelled = raw_table.copy()
    labelled.columns = CHARACTER_COLUMNS
    labelled['Name'] = game_name
    return labelled


# The first five tables are taken positionally, one per game.
# NOTE(review): the table order depends on the live page layout - confirm.
Animal_Forest = _label_game_table(tables1[0], 'Animal_Forest')
Wild_World = _label_game_table(tables1[1], 'Wild_World')
City_Folk = _label_game_table(tables1[2], 'City_Folk')
New_Leaf = _label_game_table(tables1[3], 'New_Leaf')
New_Horizons = _label_game_table(tables1[4], 'New_Horizons')

# Stack the per-game frames; ignore_index gives the combined frame a single
# 0..n-1 index instead of repeating each table's own row labels.
characters = pd.concat(
    [Animal_Forest, Wild_World, City_Folk, New_Leaf, New_Horizons],
    ignore_index=True,
)
characters


Scrape the special characters table and keep only the name, species, birthday, and service columns.

In [None]:
# Scrape url3 into its own list instead of rebinding `tables`: the earlier
# Sales_summary cell reads `tables[2]`, so overwriting `tables` here would make
# a re-run of that cell pick a table from the wrong page.
special_tables = pd.read_html(url3)

# Work on a copy of the first table so the scraped frame is not mutated.
special_raw = special_tables[0].copy()
special_raw.columns = ['Name', 'Image.', 'Species', 'Birthday', 'service']

# Keep every column except 'Image.'.
Special_Character = special_raw[['Name', 'Species', 'Birthday', 'service']]
Special_Character

In [None]:
# Credentials are read from the environment so the password is never stored in
# the notebook or echoed into a cell output.
# NOTE(review): the default user keeps the original spelling "postgress" -
# confirm this is the real role name and not a typo for "postgres".
db_user = os.environ.get("ANIMAL_CROSSING_DB_USER", "postgress")
db_password = os.environ.get("ANIMAL_CROSSING_DB_PASSWORD")
if db_password is None:
    raise RuntimeError(
        "Set the ANIMAL_CROSSING_DB_PASSWORD environment variable before running this cell."
    )

# Percent-encode user and password: characters such as '@', ':' or '/' inside
# them would otherwise be read as URL delimiters and the host would be parsed
# incorrectly.
rds_connection_string = (
    f"{quote(db_user, safe='')}:{quote(db_password, safe='')}"
    "@localhost:5432/animal_crossing_db"
)
engine = create_engine(f'postgresql://{rds_connection_string}')

In [None]:
# List the tables currently in the database. The Inspector API is used because
# Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0.
inspect(engine).get_table_names()

In [None]:
# Append the sales summary rows to the "Sales_summary" table, without writing
# the DataFrame index as a column.
# NOTE(review): with if_exists='append', re-running this cell inserts the same
# rows again - confirm that is intended.
Sales_summary.to_sql(
    name='Sales_summary',
    con=engine,
    if_exists='append',
    index=False,
)

In [None]:
# Append the special-character rows to the "special_character" table, without
# writing the DataFrame index as a column.
Special_Character.to_sql(
    name='special_character',
    con=engine,
    if_exists='append',
    index=False,
)

In [None]:
# Append the combined per-game character rows to the "character" table.
# The frame built earlier is named `characters`; the singular `character`
# was never defined and raised NameError.
characters.to_sql(name='character', con=engine, if_exists='append', index=False)