# Lab05 Python Data Science with Pandas

## Please submit your finished lab05 jupyter notebook via pull request in GitHub

In [3]:
# magic command to display matplotlib plots inline within the ipython notebook webpage
%matplotlib inline

# import necessary modules
import pandas as pd, numpy as np, matplotlib.pyplot as plt

## Part 1 <br /> Basics of Selecting and Slicing Data

In [4]:
# create a pandas dataframe from the location data set
df = pd.read_csv('data/summer-travel-gps-full.csv')
df.head()

Unnamed: 0,lat,lon,date,city,country
0,51.481292,-0.451011,05/14/2014 09:07,West Drayton,United Kingdom
1,51.474005,-0.450999,05/14/2014 09:22,Hounslow,United Kingdom
2,51.478199,-0.446081,05/14/2014 10:51,Hounslow,United Kingdom
3,51.478199,-0.446081,05/14/2014 11:24,Hounslow,United Kingdom
4,51.474146,-0.451562,05/14/2014 11:38,Hounslow,United Kingdom


In [22]:
df

Unnamed: 0,lat,lon,date,city,country
0,51.481292,-0.451011,05/14/2014 09:07,West Drayton,United Kingdom
1,51.474005,-0.450999,05/14/2014 09:22,Hounslow,United Kingdom
2,51.478199,-0.446081,05/14/2014 10:51,Hounslow,United Kingdom
3,51.478199,-0.446081,05/14/2014 11:24,Hounslow,United Kingdom
4,51.474146,-0.451562,05/14/2014 11:38,Hounslow,United Kingdom
...,...,...,...,...,...
1754,48.356013,11.791710,07/09/2014 13:13,Munich,Germany
1755,48.356529,11.792183,07/09/2014 13:14,Munich,Germany
1756,48.356285,11.791710,07/09/2014 13:17,Munich,Germany
1757,48.355328,11.791710,07/09/2014 13:18,Munich,Germany


In [41]:
# Q1: how to get 2 columns from the dataframe (city and country)?

To get a single "cell's" value out of a dataframe, pass a column name, then a row label. This is equivalent to slicing the dataframe down to a single series, then slicing a single value out of that series using [ ] indexing.

In [5]:
df [["city", "country"]]

Unnamed: 0,city,country
0,West Drayton,United Kingdom
1,Hounslow,United Kingdom
2,Hounslow,United Kingdom
3,Hounslow,United Kingdom
4,Hounslow,United Kingdom
...,...,...
1754,Munich,Germany
1755,Munich,Germany
1756,Munich,Germany
1757,Munich,Germany


In [52]:
# Q2: how to get the first 5 rows of the "city" column?

In [17]:
df.loc [[0,1,2,3,4], ["city"]]

Unnamed: 0,city
0,West Drayton
1,Hounslow
2,Hounslow
3,Hounslow
4,Hounslow


### Using .loc[ ]

In [53]:
# Q3: how to use .loc to select the third row of the dataframe?

In [13]:
df.loc [[3]]

Unnamed: 0,lat,lon,date,city,country
3,51.478199,-0.446081,05/14/2014 11:24,Hounslow,United Kingdom


In [54]:
# Q4: how to use .loc to select the first row in "country" column?

In [18]:
df.loc [[1], ["country"]]

Unnamed: 0,country
1,United Kingdom


In [55]:
# Q5: how to select the first 4 rows of ['city', 'date'] columns?

In [20]:
df.loc [[0,1,2,3],["city", "date"]]

Unnamed: 0,city,date
0,West Drayton,05/14/2014 09:07
1,Hounslow,05/14/2014 09:22
2,Hounslow,05/14/2014 10:51
3,Hounslow,05/14/2014 11:24


### Using .iloc[ ]

In [56]:
# use .iloc for integer position based indexing
# Q6: how to get the value from the row in position 3 and the column in position 2

In [21]:
df.iloc [[3],[2]]

Unnamed: 0,date
3,05/14/2014 11:24


In [57]:
# Q7: how to use iloc to select every 300th row from a data set

In [23]:
df.iloc [[300]]

Unnamed: 0,lat,lon,date,city,country
300,41.377091,2.151175,05/20/2014 03:18,Barcelona,Spain


## Part 2 <br /> How to select rows by some value(s)

In [58]:
# load a reduced set of gps data
df = pd.read_csv('data/summer-travel-gps-simplified.csv')
df.tail()

Unnamed: 0,lat,lon,date,city,country
173,41.044556,28.983286,07/08/2014 16:44,Istanbul,Turkey
174,41.008992,28.968268,07/08/2014 20:03,Istanbul,Turkey
175,41.043487,28.985488,07/08/2014 22:18,Istanbul,Turkey
176,40.977637,28.823879,07/09/2014 09:03,Istanbul,Turkey
177,48.35711,11.791346,07/09/2014 13:20,Munich,Germany


In [59]:
# Q9: create a Series of true/false, indicating if each "city" row in the column is equal to "Munich"

In [60]:
# pandas logical operators are: | for or, & for and, ~ for not
# these must be grouped by using parentheses
# Q10: what cities were visited in spain that were not barcelona? Create a dataframe for it. 

In [61]:
# Q11: select rows where either the city is munich, or the country is serbia

In [62]:
# Q12: how many observations are west of the prime meridian?

In [63]:
# Q13: get all rows that contain a city that starts with the letter G

In [64]:
# Q14: how many unique cities and countries in the dataset? 
# Also can you check missing values for the dataframe

In [65]:
# Q15: group by country name and show the city names in each of the country

## Part 3 <br /> How to select based on a date-time values

In [67]:
# load the location data set, indexed by the date field
# and, parse the dates so they're no longer strings but now rather Python datetime objects
# this lets us do date and time based operations on the data set
dt = pd.read_csv('data/summer-travel-gps-full.csv', index_col='date', parse_dates=True)
dt.head()

Unnamed: 0_level_0,lat,lon,city,country
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014-05-14 09:07:00,51.481292,-0.451011,West Drayton,United Kingdom
2014-05-14 09:22:00,51.474005,-0.450999,Hounslow,United Kingdom
2014-05-14 10:51:00,51.478199,-0.446081,Hounslow,United Kingdom
2014-05-14 11:24:00,51.478199,-0.446081,Hounslow,United Kingdom
2014-05-14 11:38:00,51.474146,-0.451562,Hounslow,United Kingdom


In [70]:
# Q16: is the timestamp index unique? How can you use code to find it? 

In [71]:
# Q17: drop duplicate index 

In [72]:
# Q18: create a weekday and a weekend dataframe

In [36]:
# Q19: calculate and plot the number of observations each day of the week has