# Lab05 Python Data Science with Pandas

## Please submit your finished lab05 jupyter notebook via pull request in GitHub

In [286]:
# magic command to display matplotlib plots inline within the ipython notebook webpage
%matplotlib inline

# import necessary modules
import pandas as pd, numpy as np, matplotlib.pyplot as plt

## Part 1 <br /> Basics of Selecting and Slicing Data

In [287]:
# create a pandas dataframe from the location data set
df = pd.read_csv('summer-travel-gps-full.csv')
df.head()


Unnamed: 0,lat,lon,date,city,country
0,51.481292,-0.451011,05/14/2014 09:07,West Drayton,United Kingdom
1,51.474005,-0.450999,05/14/2014 09:22,Hounslow,United Kingdom
2,51.478199,-0.446081,05/14/2014 10:51,Hounslow,United Kingdom
3,51.478199,-0.446081,05/14/2014 11:24,Hounslow,United Kingdom
4,51.474146,-0.451562,05/14/2014 11:38,Hounslow,United Kingdom


In [288]:
# Q1: how to get 2 columns from the dataframe (city and country)?

To get a single "cell's" value out of a dataframe, pass a column name, then a row label. This is equivalent to slicing the dataframe down to a single series, then slicing a single value out of that series using [ ] indexing.

To get 2 columns from the dataframe, we just use names within [[]].

In [289]:
df[['city','country']]

Unnamed: 0,city,country
0,West Drayton,United Kingdom
1,Hounslow,United Kingdom
2,Hounslow,United Kingdom
3,Hounslow,United Kingdom
4,Hounslow,United Kingdom
...,...,...
1754,Munich,Germany
1755,Munich,Germany
1756,Munich,Germany
1757,Munich,Germany


In [290]:
# Q2: how to get the first 5 rows of the "city" column?

# A2: We slice!

df[0:5]

Unnamed: 0,lat,lon,date,city,country
0,51.481292,-0.451011,05/14/2014 09:07,West Drayton,United Kingdom
1,51.474005,-0.450999,05/14/2014 09:22,Hounslow,United Kingdom
2,51.478199,-0.446081,05/14/2014 10:51,Hounslow,United Kingdom
3,51.478199,-0.446081,05/14/2014 11:24,Hounslow,United Kingdom
4,51.474146,-0.451562,05/14/2014 11:38,Hounslow,United Kingdom


### Using .loc[ ]

In [291]:
# Q3: how to use .loc to select the third row of the dataframe?

# A3: .loc can be used by calling the row number, making sure to remember that the names start at 0!

df.loc[[2]]



Unnamed: 0,lat,lon,date,city,country
2,51.478199,-0.446081,05/14/2014 10:51,Hounslow,United Kingdom


In [292]:
# Q4: how to use .loc to select the first row in "country" column?

# A4: By calling both the row name and the column name.

df.loc[0, "country"]

'United Kingdom'

In [293]:
# Q5: how to select the first 4 rows of ['city', 'date'] columns?

# A5: By slicing rows 0 to 3 and selecting city and date columns

df.loc[0:3, ['city','date']]

Unnamed: 0,city,date
0,West Drayton,05/14/2014 09:07
1,Hounslow,05/14/2014 09:22
2,Hounslow,05/14/2014 10:51
3,Hounslow,05/14/2014 11:24


### Using .iloc[ ]

In [294]:
# use .iloc for integer position based indexing
# Q6: how to get the value from the row in position 3 and the column in position 2

# A6: By slicing the row in position 3 and then selecting the 2nd column.

df.iloc[3,2]

'05/14/2014 11:24'

In [295]:
# Q7: how to use iloc to select every 300th row from a data set

# A7: 

df.iloc[300:1758:300]

Unnamed: 0,lat,lon,date,city,country
300,41.377091,2.151175,05/20/2014 03:18,Barcelona,Spain
600,50.052338,19.94622,05/31/2014 21:10,Krakow,Poland
900,48.561181,9.059672,06/09/2014 15:12,Tübingen,Germany
1200,41.378301,2.187443,06/17/2014 16:37,Barcelona,Spain
1500,42.208201,20.735993,06/30/2014 08:27,Prizren,Kosovo


## Part 2 <br /> How to select rows by some value(s)

In [296]:
# load a reduced set of gps data
df = pd.read_csv('summer-travel-gps-simplified.csv')
df.tail()

Unnamed: 0,lat,lon,date,city,country
173,41.044556,28.983286,07/08/2014 16:44,Istanbul,Turkey
174,41.008992,28.968268,07/08/2014 20:03,Istanbul,Turkey
175,41.043487,28.985488,07/08/2014 22:18,Istanbul,Turkey
176,40.977637,28.823879,07/09/2014 09:03,Istanbul,Turkey
177,48.35711,11.791346,07/09/2014 13:20,Munich,Germany


In [297]:
# Q9: create a Series of true/false, indicating if each "city" row in the column is equal to "Munich"

# A9: 

df.loc[:,["city"]] == "Munich"

Unnamed: 0,city
0,False
1,False
2,False
3,False
4,False
...,...
173,False
174,False
175,False
176,False


In [298]:
# pandas logical operators are: | for or, & for and, ~ for not
# these must be grouped by using parentheses
# Q10: what cities were visited in spain that were not barcelona? Create a dataframe for it. 

# A10: 

#cities_visited = pd.DataFrame({(df.loc[:,["country"]] == "Spain") & (df.loc[:,["city"]]  != "Barcelona"})
#=df.loc[:,[["city"] != "Barcelona"]]
cities_visited = df[(df["country"] == "Spain") & (df["city"]  != "Barcelona")]

print(cities_visited)

           lat       lon              date                     city country
24   41.303911  2.105931  05/18/2014 22:35     El Prat de Llobregat   Spain
25   41.289946  2.064590  05/18/2014 23:04               Viladecans   Spain
126  41.306752  2.097624  06/12/2014 17:19     El Prat de Llobregat   Spain
127  41.304333  2.072728  06/12/2014 17:49     El Prat de Llobregat   Spain
131  41.358460  2.128701  06/13/2014 11:35  Hospitalet de Llobregat   Spain
138  41.294761  2.059722  06/20/2014 22:15               Viladecans   Spain


In [299]:
# Q11: select rows where either the city is munich, or the country is serbia

# A11:

df[(df["city"] == "Munich") | (df["country"] == "Serbia")]


Unnamed: 0,lat,lon,date,city,country
139,44.821164,20.289821,06/21/2014 01:59,Belgrade,Serbia
140,44.820414,20.463465,06/21/2014 18:44,Belgrade,Serbia
141,44.761583,19.577904,06/22/2014 07:58,Slepčević,Serbia
177,48.35711,11.791346,07/09/2014 13:20,Munich,Germany


In [300]:
# Q12: how many observations are west of the prime meridian?

# A12: 

west = df[df["lon"] < 0].shape[0]
print("There were", west, "observations west of the prime meridian.")

There were 24 observations west of the prime meridian.


In [453]:
# Q13: get all rows that contain a city that starts with the letter G

# A13: 
df[df["city"].str.startswith("G")]


Unnamed: 0,lat,lon,date,city,country
62,50.273632,18.729429,06/02/2014 06:39,Gliwice,Poland


In [302]:
# Q14: how many unique cities and countries in the dataset? 
# Also can you check missing values for the dataframe

print("There are", len(df.city.unique()), "unique cities.")
print("There are", len(df.country.unique()), "unique countries.")

df.isnull().sum()


There are 91 unique cities.
There are 15 unique countries.


lat        0
lon        0
date       0
city       0
country    0
dtype: int64

In [303]:
# Q15: group by country name and show the city names in each of the country

# A15:

df.groupby("country")["city"].unique()


country
Albania                                                [Berat, Gjirokastër]
Bosnia and Herzegovina                                   [Sarajevo, Mostar]
Croatia                                           [Ploče, Split, Dubrovnik]
Czech Republic            [Novy Bohumin, Hranice, Prerov, Ústí nad Orlic...
Germany                   [Kümmersbruck, Winkelhaid, Kammerstein, Ellhof...
Greece                    [Kakavia, Dytiki Ellada, Peloponnese, Athens, ...
Kosovo                                                            [Prizren]
Macedonia (FYROM)                                                   [Ohrid]
Montenegro                                                          [Kotor]
Poland                    [Zendek, Silesian Voivodeship, Dabrowa Gornicz...
Portugal                  [Lisbon, Algueirão-Mem Martins, Sintra, Azambu...
Serbia                                                [Belgrade, Slepčević]
Spain                     [El Prat de Llobregat, Viladecans, Barcelona, ...
Turk

## Part 3 <br /> How to select based on a date-time values

In [521]:
# load the location data set, indexed by the date field
# and, parse the dates so they're no longer strings but now rather Python datetime objects
# this lets us do date and time based operations on the data set
dt = pd.read_csv('summer-travel-gps-full.csv', index_col='date', parse_dates=True)
dt.head()

Unnamed: 0_level_0,lat,lon,city,country
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014-05-14 09:07:00,51.481292,-0.451011,West Drayton,United Kingdom
2014-05-14 09:22:00,51.474005,-0.450999,Hounslow,United Kingdom
2014-05-14 10:51:00,51.478199,-0.446081,Hounslow,United Kingdom
2014-05-14 11:24:00,51.478199,-0.446081,Hounslow,United Kingdom
2014-05-14 11:38:00,51.474146,-0.451562,Hounslow,United Kingdom


In [522]:
dt = pd.read_csv('summer-travel-gps-full.csv', index_col='date', parse_dates= True)

dt.index


DatetimeIndex(['2014-05-14 09:07:00', '2014-05-14 09:22:00',
               '2014-05-14 10:51:00', '2014-05-14 11:24:00',
               '2014-05-14 11:38:00', '2014-05-14 12:23:00',
               '2014-05-14 12:38:00', '2014-05-14 15:11:00',
               '2014-05-14 15:25:00', '2014-05-14 15:41:00',
               ...
               '2014-07-09 06:03:00', '2014-07-09 06:18:00',
               '2014-07-09 09:03:00', '2014-07-09 09:18:00',
               '2014-07-09 13:11:00', '2014-07-09 13:13:00',
               '2014-07-09 13:14:00', '2014-07-09 13:17:00',
               '2014-07-09 13:18:00', '2014-07-09 13:20:00'],
              dtype='datetime64[ns]', name='date', length=1759, freq=None)

In [523]:
# Q16: is the timestamp index unique? How can you use code to find it? 

duplicates= dt.index.duplicated()
dt[duplicates == True]

# A16: There is one duplicate timestamp entry.


Unnamed: 0_level_0,lat,lon,city,country
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014-07-05 00:52:00,38.541479,21.284091,Dytiki Ellada,Greece


In [532]:
# Q17: drop duplicate index 

print(dt.shape)

#duplicates= dt.index.duplicated()

dt_unique = dt.drop(index='2014-07-05 00:52:00')

dt_unique


(1759, 4)


Unnamed: 0_level_0,lat,lon,city,country
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014-05-14 09:07:00,51.481292,-0.451011,West Drayton,United Kingdom
2014-05-14 09:22:00,51.474005,-0.450999,Hounslow,United Kingdom
2014-05-14 10:51:00,51.478199,-0.446081,Hounslow,United Kingdom
2014-05-14 11:24:00,51.478199,-0.446081,Hounslow,United Kingdom
2014-05-14 11:38:00,51.474146,-0.451562,Hounslow,United Kingdom
...,...,...,...,...
2014-07-09 13:13:00,48.356013,11.791710,Munich,Germany
2014-07-09 13:14:00,48.356529,11.792183,Munich,Germany
2014-07-09 13:17:00,48.356285,11.791710,Munich,Germany
2014-07-09 13:18:00,48.355328,11.791710,Munich,Germany


In [543]:
# Q18: create a weekday and a weekend dataframe

dt['weekday'] = dt.index.isocalendar().day

# A18: Here are the dataframes:

work_day = dt[dt['weekday'] <= 5]  #weekday dataframe with 1245 rows × 6 columns

weekend = dt[dt['weekday'] > 5] #weekend dataframe with 514 rows × 6 columns


In [556]:
# Q19: calculate and plot the number of observations each day of the week has

print("A19 - Number of observations on Mondays:",dt['weekday'].value_counts()[1])
print("Number of observations on Tuesdays:",dt['weekday'].value_counts()[2])
print("Number of observations on Wednesdays:",dt['weekday'].value_counts()[3])
print("Number of observations on Thursdays:",dt['weekday'].value_counts()[4])
print("Number of observations on Fridays:",dt['weekday'].value_counts()[5])
print("Number of observations on Saturdays:",dt['weekday'].value_counts()[6])
print("Number of observations on Sundays:",dt['weekday'].value_counts()[7])

A19 - Number of observations on Mondays: 263
Number of observations on Tuesdays: 254
Number of observations on Wednesdays: 254
Number of observations on Thursdays: 256
Number of observations on Fridays: 218
Number of observations on Saturdays: 269
Number of observations on Sundays: 245
