In [1]:
import pandas as pd
pd.__version__

'2.1.4'

In [2]:
# Load the taxi zone lookup table from the CSV file (path is relative to the working directory).
df = pd.read_csv("taxi+_zone_lookup.csv")

In [3]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 265 entries, 0 to 264
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   LocationID    265 non-null    int64 
 1   Borough       265 non-null    object
 2   Zone          264 non-null    object
 3   service_zone  263 non-null    object
dtypes: int64(1), object(3)
memory usage: 8.4+ KB


In [4]:
# Preview the first rows of the lookup table.
df.head()

Unnamed: 0,LocationID,Borough,Zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone


Next we create the schema for our database table with the help of pandas. Pandas can output the ***DDL (Data Definition Language)*** statements necessary to create the _schema_.

In [5]:
# `name` sets the table name used in the generated CREATE TABLE statement.
# No connection is passed here, so pandas falls back to its default SQL dialect.
zones_ddl = pd.io.sql.get_schema(df, name='taxi_zones')
print(zones_ddl)

CREATE TABLE "taxi_zones" (
"LocationID" INTEGER,
  "Borough" TEXT,
  "Zone" TEXT,
  "service_zone" TEXT
)


Now we have the schema, but to perform any task on the database we first need to connect to it.

To create the connection we'll use the `sqlalchemy` module. Make sure `sqlalchemy` and `psycopg2` have already been installed.

In [6]:
from sqlalchemy import create_engine

An **engine** specifies the details used for connecting to the database. The general form of the connection URL is:

`engine = create_engine('dialect+driver://user:password@host:port/database_name')`

In [9]:
# Make sure the database is up and accepting connections before running this cell.
# NOTE(review): the credentials are hardcoded for a local tutorial database;
# load them from environment variables or a secrets store for anything shared.
engine = create_engine('postgresql://root:root@localhost:5432/ny_taxi')

# Smoke-test the engine: successfully opening a connection is enough to show the
# database is reachable, so the connection object is not bound to a name.
# The `with` block releases the connection again on exit.
try:
    with engine.connect():
        print('Successfully connected to the PostgreSQL database')
except Exception as ex:
    # Report the failure in the cell output instead of raising.
    print(f'Sorry failed to connect: {ex}')

Successfully connected to the PostgreSQL database


#### Now we can use our engine to get the schema in the PostgreSQL dialect.

In [13]:
# Passing the engine as `con` makes pandas generate the DDL for the database
# behind that engine (PostgreSQL here) instead of the default dialect.
postgres_ddl = pd.io.sql.get_schema(df, name='taxi_zones', con=engine)
print(postgres_ddl)


CREATE TABLE taxi_zones (
	"LocationID" BIGINT, 
	"Borough" TEXT, 
	"Zone" TEXT, 
	service_zone TEXT
)




### Creating table in database.

In [21]:
# Create the table structure only: `df.head(0)` keeps the column names and dtypes
# but contains zero rows, so no data is written by this cell.
# `if_exists='replace'` overwrites the table when it already exists;
# `index=False` keeps the DataFrame index out of the table.
empty_frame = df.head(0)
empty_frame.to_sql(name='taxi_zones', con=engine, if_exists='replace', index=False)

0

#### Now, if we check our database, we can find the (still empty) table.

### Now writing the data to the database table.

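`%%time` (a cell magic, placed on the first line of a cell) reports the time taken to execute the whole cell. The line magic `%time` only times the statement written on the same line as the magic.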

In [22]:
%time
df.to_sql(name='taxi_zones', con=engine, if_exists='append', index=False)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 7.63 µs


265

#### Executing a simple query

In [23]:
# Fetch every column of every row from the table we just populated.
query = """
SELECT *
FROM taxi_zones;
"""

# Run the statement through the engine; the result comes back as a DataFrame.
pd.read_sql_query(sql=query, con=engine)

Unnamed: 0,LocationID,Borough,Zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone
...,...,...,...,...
260,261,Manhattan,World Trade Center,Yellow Zone
261,262,Manhattan,Yorkville East,Yellow Zone
262,263,Manhattan,Yorkville West,Yellow Zone
263,264,Unknown,NV,
