In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
def _read_commented_csv(url):
    """Load the CSV at `url` into a DataFrame, ignoring `#` comments."""
    return pd.read_csv(url, comment="#")


# Standalone datasets.
iris = _read_commented_csv("http://gagolewski.com/resources/data/iris.csv")
tips = _read_commented_csv("http://gagolewski.com/resources/data/tips.csv")
vehicles = _read_commented_csv("http://gagolewski.com/resources/data/fueleconomy_vehicles.csv.gz")
birth_dates = _read_commented_csv("http://www.gagolewski.com/resources/data/birth_dates.csv")

# nycflights13 tables.
weather = _read_commented_csv("http://www.gagolewski.com/resources/data/nycflights13_weather.csv.gz")
flights = _read_commented_csv("http://www.gagolewski.com/resources/data/nycflights13_flights.csv.gz")
airports = _read_commented_csv("http://www.gagolewski.com/resources/data/nycflights13_airports.csv.gz")
airlines = _read_commented_csv("http://www.gagolewski.com/resources/data/nycflights13_airlines.csv.gz")
planes = _read_commented_csv("http://www.gagolewski.com/resources/data/nycflights13_planes.csv.gz")

## New York City Flights examples
Here we work through exercises that translate back and forth between SQL queries and Python (pandas) code that retrieve the same data.

In [9]:
# NYC flights 2013: (number of rows, number of columns) of the flights table.
flights.shape

(336776, 19)

In [10]:
# A single row is enough to see which columns the flights table has.
flights.head(1)

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
0,2013,1,1,517.0,515,2.0,830.0,819,11.0,UA,1545,N14228,EWR,IAH,227.0,1400,5,15,2013-01-01 05:00:00


In [11]:
# Last five rows of the airlines table (carrier code and airline name).
airlines.tail(5)

Unnamed: 0,carrier,name
11,UA,United Air Lines Inc.
12,US,US Airways Inc.
13,VX,Virgin America
14,WN,Southwest Airlines Co.
15,YV,Mesa Airlines Inc.


In [12]:
# A single row is enough to see which columns the planes table has.
planes.head(1)

Unnamed: 0,tailnum,year,type,manufacturer,model,engines,seats,speed,engine
0,N10156,2004.0,Fixed wing multi engine,EMBRAER,EMB-145XR,2,55,,Turbo-fan


In [13]:
# First rows of the airports table (head() shows five rows by default).
airports.head()

Unnamed: 0,faa,name,lat,lon,alt,tz,dst,tzone
0,04G,Lansdowne Airport,41.130472,-80.619583,1044,-5,A,America/New_York
1,06A,Moton Field Municipal Airport,32.460572,-85.680028,264,-6,A,America/Chicago
2,06C,Schaumburg Regional,41.989341,-88.101243,801,-6,A,America/Chicago
3,06N,Randall Airport,41.431912,-74.391561,523,-5,A,America/New_York
4,09J,Jekyll Island Airport,31.074472,-81.427778,11,-5,A,America/New_York


In [14]:
# First rows of the weather table (head() shows five rows by default).
weather.head()

Unnamed: 0,origin,year,month,day,hour,temp,dewp,humid,wind_dir,wind_speed,wind_gust,precip,pressure,visib,time_hour
0,EWR,2013,1,1,0,37.04,21.92,53.97,230.0,10.35702,11.918651,0.0,1013.9,10.0,2013-01-01 01:00:00
1,EWR,2013,1,1,1,37.04,21.92,53.97,230.0,13.80936,15.891535,0.0,1013.0,10.0,2013-01-01 02:00:00
2,EWR,2013,1,1,2,37.94,21.92,52.09,230.0,12.65858,14.567241,0.0,1012.6,10.0,2013-01-01 03:00:00
3,EWR,2013,1,1,3,37.94,23.0,54.51,230.0,13.80936,15.891535,0.0,1012.7,10.0,2013-01-01 04:00:00
4,EWR,2013,1,1,4,37.94,24.08,57.04,240.0,14.96014,17.21583,0.0,1012.8,10.0,2013-01-01 05:00:00


In [15]:
import sqlite3

In [16]:
# On-disk SQLite database file that the nycflights13 tables are written to.
db_path = "/tmp/nycflights.db"
conn = sqlite3.connect(db_path)

In [17]:
# Copy every nycflights13 DataFrame into a SQLite table of the same name.
#
# if_exists="replace" keeps this cell re-runnable: `conn` points at a database
# file, so the tables survive a kernel restart, and pandas' default
# if_exists="fail" would raise ValueError ("Table ... already exists") the
# second time the notebook is run.
#
# The DataFrame index is still exported (as an "index" column), exactly as
# with the default to_sql() arguments.
for table_name, frame in [
    ("flights", flights),
    ("weather", weather),
    ("planes", planes),
    ("airports", airports),
    ("airlines", airlines),
]:
    frame.to_sql(table_name, conn, if_exists="replace")

In [18]:
# SQLite's LIKE is case-insensitive for ASCII letters, so the upper-case
# pattern matches "American" as well as "America".
america_query = """
    SELECT * FROM airlines  WHERE name LIKE '%AMERICA%'
"""
pd.read_sql_query(america_query, conn)

Unnamed: 0,index,carrier,name
0,1,AA,American Airlines Inc.
1,13,VX,Virgin America


   1. `SELECT DISTINCT engine FROM planes`
   

In [19]:
# Unique values of the engine column in the planes table.
pd.read_sql_query("SELECT DISTINCT engine FROM planes", conn)

Unnamed: 0,engine
0,Turbo-fan
1,Turbo-jet
2,Reciprocating
3,4 Cycle
4,Turbo-shaft
5,Turbo-prop


Python: 

   2. `SELECT DISTINCT type, engine FROM planes`
   

Python:

   3. `SELECT COUNT(*), engine FROM planes GROUP BY engine`
   

Python:

   4. `SELECT COUNT(*), engine, type FROM planes GROUP BY engine, type`
   

Python:

   5. `SELECT MIN(year), AVG(year), MAX(year), engine, manufacturer FROM planes GROUP BY engine, manufacturer`

Python:

   6. `SELECT * FROM planes WHERE speed IS NOT NULL`
   

Python:

   7. `SELECT tailnum FROM planes WHERE seats BETWEEN 150 AND 190 AND year >= 2012`
   

Python:

   8. `SELECT * FROM planes WHERE manufacturer IN ('BOEING', 'AIRBUS', 'EMBRAER') AND seats>390`

Python:

   9. `SELECT DISTINCT year, seats  FROM planes WHERE year >= 2012 ORDER BY year ASC, seats DESC`
   10. `SELECT DISTINCT year, seats  FROM planes WHERE year >= 2012 ORDER BY seats DESC, year ASC`
   

   11. `SELECT manufacturer, COUNT(*) FROM planes WHERE seats > 200 GROUP BY manufacturer`


SQL:

   12. Python:
   ```
   manufacturers_of_large_planes = (
    planes.loc[planes.seats>200,:].
    groupby('manufacturer').      # DataFrameGroupBy [→GroupBy]
    size().                       # Series [values=counts, index=manufacturer]
    rename('count').              
    reset_index()
)
manufacturers_of_large_planes.loc[manufacturers_of_large_planes["count"]>10,:].\
    reset_index(drop=True)
   ```

SQL:

   13. Python:
   ```
   planes.groupby("manufacturer").size().rename('howmany').reset_index().\
    sort_values('howmany', ascending=False).reset_index(drop=True).head(5)
planes.groupby("manufacturer").size().rename('howmany').\
    sort_values(ascending=False).reset_index().head(5)
planes.groupby("manufacturer").size().rename('howmany').\
    nlargest(5).reset_index()
   ```

SQL:

   14. Python:
   ```
   pd.merge(flights, planes, how='left', on='tailnum')
   flights.merge(planes, how='left', on='tailnum')
   ```
   
   

SQL:

   15. Python:

      ```
      flights.loc[:,['carrier', 'tailnum']].drop_duplicates().\
    merge(planes, on='tailnum').merge(airlines, on='carrier')
      ```
      

SQL: