# Lab05 Python Data Science with Pandas

## Please submit your finished lab05 jupyter notebook via pull request in GitHub

In [1]:
# magic command to display matplotlib plots inline within the ipython notebook webpage
%matplotlib inline

# import necessary modules
import pandas as pd, numpy as np, matplotlib.pyplot as plt

## Part 1 <br /> Basics of Selecting and Slicing Data

In [3]:
# create a pandas dataframe from the location data set
df = pd.read_csv('data/summer-travel-gps-full.csv')
df.head()

Unnamed: 0,lat,lon,date,city,country
0,51.481292,-0.451011,05/14/2014 09:07,West Drayton,United Kingdom
1,51.474005,-0.450999,05/14/2014 09:22,Hounslow,United Kingdom
2,51.478199,-0.446081,05/14/2014 10:51,Hounslow,United Kingdom
3,51.478199,-0.446081,05/14/2014 11:24,Hounslow,United Kingdom
4,51.474146,-0.451562,05/14/2014 11:38,Hounslow,United Kingdom


In [6]:
# Q1: how to get 2 columns from the dataframe (city and country)?
df[['city', 'country']]

Unnamed: 0,city,country
0,West Drayton,United Kingdom
1,Hounslow,United Kingdom
2,Hounslow,United Kingdom
3,Hounslow,United Kingdom
4,Hounslow,United Kingdom
...,...,...
1754,Munich,Germany
1755,Munich,Germany
1756,Munich,Germany
1757,Munich,Germany


To get a single "cell's" value out of a dataframe, pass a column name, then a row label. This is equivalent to slicing the dataframe down to a single series, then slicing a single value out of that series using [ ] indexing.

In [7]:
# Q2: how to get the first 5 rows of the "city" column?
df['city'][:5]

0    West Drayton
1        Hounslow
2        Hounslow
3        Hounslow
4        Hounslow
Name: city, dtype: object

### Using .loc[ ]

In [10]:
# Q3: how to use .loc to select the third row of the dataframe?
df.loc[2]

lat               51.478199
lon               -0.446081
date       05/14/2014 11:24
city               Hounslow
country      United Kingdom
Name: 3, dtype: object

In [11]:
# Q4: how to use .loc to select the first row in "country" column?
df.loc[0, 'country']

'United Kingdom'

In [12]:
# Q5: how to select the first 4 rows of ['city', 'date'] columns?
df.loc[:4, ['city', 'date']]

Unnamed: 0,city,date
0,West Drayton,05/14/2014 09:07
1,Hounslow,05/14/2014 09:22
2,Hounslow,05/14/2014 10:51
3,Hounslow,05/14/2014 11:24
4,Hounslow,05/14/2014 11:38


### Using .iloc[ ]

In [13]:
# use .iloc for integer position based indexing
# Q6: how to get the value from the row in position 3 and the column in position 2
df.iloc[3][2]

'05/14/2014 11:24'

In [14]:
# Q7: how to use iloc to select every 300th row from a data set
df.iloc[::300]

Unnamed: 0,lat,lon,date,city,country
0,51.481292,-0.451011,05/14/2014 09:07,West Drayton,United Kingdom
300,41.377091,2.151175,05/20/2014 03:18,Barcelona,Spain
600,50.052338,19.94622,05/31/2014 21:10,Krakow,Poland
900,48.561181,9.059672,06/09/2014 15:12,Tübingen,Germany
1200,41.378301,2.187443,06/17/2014 16:37,Barcelona,Spain
1500,42.208201,20.735993,06/30/2014 08:27,Prizren,Kosovo


## Part 2 <br /> How to select rows by some value(s)

In [15]:
# load a reduced set of gps data
df = pd.read_csv('data/summer-travel-gps-simplified.csv')
df.tail()

Unnamed: 0,lat,lon,date,city,country
173,41.044556,28.983286,07/08/2014 16:44,Istanbul,Turkey
174,41.008992,28.968268,07/08/2014 20:03,Istanbul,Turkey
175,41.043487,28.985488,07/08/2014 22:18,Istanbul,Turkey
176,40.977637,28.823879,07/09/2014 09:03,Istanbul,Turkey
177,48.35711,11.791346,07/09/2014 13:20,Munich,Germany


In [21]:
# Q9: create a Series of true/false, indicating if each "city" row in the column is equal to "Munich"
df['is_munich'] = [city == "Munich" for city in df['city']]
df

Unnamed: 0,lat,lon,date,city,country,is_munich
0,51.481292,-0.451011,05/14/2014 09:07,West Drayton,United Kingdom,False
1,38.781775,-9.137544,05/14/2014 15:11,Lisbon,Portugal,False
2,38.711050,-9.139739,05/14/2014 16:40,Lisbon,Portugal,False
3,38.715637,-9.120558,05/14/2014 18:25,Lisbon,Portugal,False
4,38.711977,-9.141788,05/14/2014 19:26,Lisbon,Portugal,False
...,...,...,...,...,...,...
173,41.044556,28.983286,07/08/2014 16:44,Istanbul,Turkey,False
174,41.008992,28.968268,07/08/2014 20:03,Istanbul,Turkey,False
175,41.043487,28.985488,07/08/2014 22:18,Istanbul,Turkey,False
176,40.977637,28.823879,07/09/2014 09:03,Istanbul,Turkey,False


In [None]:
# pandas logical operators are: | for or, & for and, ~ for not
# these must be grouped by using parentheses
# Q10: what cities were visited in spain that were not barcelona? Create a dataframe for it. 
df_not_barcelona

In [None]:
# Q11: select rows where either the city is munich, or the country is serbia

In [None]:
# Q12: how many observations are west of the prime meridian?

In [None]:
# Q13: get all rows that contain a city that starts with the letter G

In [None]:
# Q14: how many unique cities and countries in the dataset? 
# Also can you check missing values for the dataframe

In [None]:
# Q15: group by country name and show the city names in each of the country

## Part 3 <br /> How to select based on a date-time values

In [None]:
# load the location data set, indexed by the date field
# and, parse the dates so they're no longer strings but now rather Python datetime objects
# this lets us do date and time based operations on the data set
dt = pd.read_csv('data/summer-travel-gps-full.csv', index_col='date', parse_dates=True)
dt.head()

Unnamed: 0_level_0,lat,lon,city,country
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014-05-14 09:07:00,51.481292,-0.451011,West Drayton,United Kingdom
2014-05-14 09:22:00,51.474005,-0.450999,Hounslow,United Kingdom
2014-05-14 10:51:00,51.478199,-0.446081,Hounslow,United Kingdom
2014-05-14 11:24:00,51.478199,-0.446081,Hounslow,United Kingdom
2014-05-14 11:38:00,51.474146,-0.451562,Hounslow,United Kingdom


In [None]:
# Q16: is the timestamp index unique? How can you use code to find it? 

In [None]:
# Q17: drop duplicate index 

In [None]:
# Q18: create a weekday and a weekend dataframe

In [None]:
# Q19: calculate and plot the number of observations each day of the week has