In [1]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt

### Part 1 <br /> Basics of Selecting and Slicing Data

In [5]:
df = pd.read_csv('data/summer-travel-gps-full.csv')
df.head()

Unnamed: 0,lat,lon,date,city,country
0,51.481292,-0.451011,05/14/2014 09:07,West Drayton,United Kingdom
1,51.474005,-0.450999,05/14/2014 09:22,Hounslow,United Kingdom
2,51.478199,-0.446081,05/14/2014 10:51,Hounslow,United Kingdom
3,51.478199,-0.446081,05/14/2014 11:24,Hounslow,United Kingdom
4,51.474146,-0.451562,05/14/2014 11:38,Hounslow,United Kingdom


#### Q1: how to get 2 columns from the dataframe (city and country)?

In [11]:
df_city_country = df[['city','country']]
print(df_city_country.head())

           city         country
0  West Drayton  United Kingdom
1      Hounslow  United Kingdom
2      Hounslow  United Kingdom
3      Hounslow  United Kingdom
4      Hounslow  United Kingdom


#### Q2: how to get the first 5 rows of the "city" column?

In [21]:
df_city = df[['city']]
print(df_city.head())

           city
0  West Drayton
1      Hounslow
2      Hounslow
3      Hounslow
4      Hounslow


#### Q3: how to use .loc to select the third row of the dataframe?

In [22]:
df.loc[2:2]

Unnamed: 0,lat,lon,date,city,country
2,51.478199,-0.446081,05/14/2014 10:51,Hounslow,United Kingdom


#### Q4: how to use .loc to select the first row in "country" column?

In [24]:
df.loc[2,'country']

'United Kingdom'

#### Q5: how to select the first 4 rows of ['city', 'date'] columns?

In [27]:
df.loc[:4,'date':'city']

Unnamed: 0,date,city
0,05/14/2014 09:07,West Drayton
1,05/14/2014 09:22,Hounslow
2,05/14/2014 10:51,Hounslow
3,05/14/2014 11:24,Hounslow
4,05/14/2014 11:38,Hounslow


#### Q6: how to get the value from the row in position 3 and the column in position 2

In [28]:
df.iloc[2,1]

-0.446081

#### Q7: how to use iloc to select every 300th row from a data set

In [52]:
df.iloc[::300, :]

Unnamed: 0,lat,lon,date,city,country
0,51.481292,-0.451011,05/14/2014 09:07,West Drayton,United Kingdom
300,41.377091,2.151175,05/20/2014 03:18,Barcelona,Spain
600,50.052338,19.94622,05/31/2014 21:10,Krakow,Poland
900,48.561181,9.059672,06/09/2014 15:12,Tübingen,Germany
1200,41.378301,2.187443,06/17/2014 16:37,Barcelona,Spain
1500,42.208201,20.735993,06/30/2014 08:27,Prizren,Kosovo


### Part 2 <br /> How to select rows by some value(s)

In [31]:
df2 = pd.read_csv('data/summer-travel-gps-simplified.csv')
df2.tail()

Unnamed: 0,lat,lon,date,city,country
173,41.044556,28.983286,07/08/2014 16:44,Istanbul,Turkey
174,41.008992,28.968268,07/08/2014 20:03,Istanbul,Turkey
175,41.043487,28.985488,07/08/2014 22:18,Istanbul,Turkey
176,40.977637,28.823879,07/09/2014 09:03,Istanbul,Turkey
177,48.35711,11.791346,07/09/2014 13:20,Munich,Germany


#### Q9: create a Series of true/false, indicating if each "city" row in the column is equal to "Munich"

In [32]:
df2['city'] == 'Munich'

0      False
1      False
2      False
3      False
4      False
       ...  
173    False
174    False
175    False
176    False
177     True
Name: city, Length: 178, dtype: bool

#### Q10: what cities were visited in spain that were not barcelona? Create a dataframe for it. 

In [45]:
# pandas logical operators are: | for or, & for and, ~ for not
# these must be grouped by using parentheses

df_spain = df2[(df2['country'] == 'Spain') & (df2['city'] != 'Barcelona')]
print(df_spain)


           lat       lon              date                     city country
24   41.303911  2.105931  05/18/2014 22:35     El Prat de Llobregat   Spain
25   41.289946  2.064590  05/18/2014 23:04               Viladecans   Spain
126  41.306752  2.097624  06/12/2014 17:19     El Prat de Llobregat   Spain
127  41.304333  2.072728  06/12/2014 17:49     El Prat de Llobregat   Spain
131  41.358460  2.128701  06/13/2014 11:35  Hospitalet de Llobregat   Spain
138  41.294761  2.059722  06/20/2014 22:15               Viladecans   Spain


#### Q11: select rows where either the city is munich, or the country is serbia

In [46]:
df_munich_serbia = df2[(df2['city'] == 'Munich') | (df2['country'] == 'Serbia')]
print(df_munich_serbia)

           lat        lon              date       city  country
139  44.821164  20.289821  06/21/2014 01:59   Belgrade   Serbia
140  44.820414  20.463465  06/21/2014 18:44   Belgrade   Serbia
141  44.761583  19.577904  06/22/2014 07:58  Slepčević   Serbia
177  48.357110  11.791346  07/09/2014 13:20     Munich  Germany


#### Q12: how many observations are west of the prime meridian?

In [51]:
df_west = df2[df2['lon'] < 0]
print(df_west)
len(df_west)
#24observations are west of prime meridian. 

          lat       lon              date                   city  \
0   51.481292 -0.451011  05/14/2014 09:07           West Drayton   
1   38.781775 -9.137544  05/14/2014 15:11                 Lisbon   
2   38.711050 -9.139739  05/14/2014 16:40                 Lisbon   
3   38.715637 -9.120558  05/14/2014 18:25                 Lisbon   
4   38.711977 -9.141788  05/14/2014 19:26                 Lisbon   
5   38.723108 -9.113374  05/15/2014 11:34                 Lisbon   
6   38.691944 -9.215561  05/15/2014 14:50                 Lisbon   
7   38.708487 -9.136022  05/15/2014 16:49                 Lisbon   
8   38.797558 -9.341000  05/16/2014 10:58  Algueirão-Mem Martins   
9   38.801171 -9.425103  05/16/2014 13:43                 Sintra   
10  38.793071 -9.285825  05/16/2014 15:13                 Sintra   
11  38.816611 -9.408500  05/16/2014 16:13                 Sintra   
12  38.788205 -9.388018  05/16/2014 16:28                 Sintra   
13  38.711024 -9.137398  05/16/2014 23:24       

24

#### Q13: get all rows that contain a city that starts with the letter G

In [57]:
df_g = df2[df2['city'].str.startswith('G')]
df_g

Unnamed: 0,lat,lon,date,city,country
62,50.273632,18.729429,06/02/2014 06:39,Gliwice,Poland
114,48.28294,8.19963,06/10/2014 13:33,Gutach,Germany
115,48.389029,8.021342,06/10/2014 13:48,Gengenbach,Germany
152,40.187825,20.079303,07/04/2014 17:42,Gjirokastër,Albania


#### Q14: how many unique cities and countries in the dataset?  Also can you check missing values for the dataframe


In [85]:
len(df2['country'].unique())
#There are 15 countries

array(['United Kingdom', 'Portugal', 'Spain', 'Poland', 'Czech Republic',
       'Germany', 'Serbia', 'Bosnia and Herzegovina', 'Croatia',
       'Montenegro', 'Kosovo', 'Macedonia (FYROM)', 'Albania', 'Greece',
       'Turkey'], dtype=object)

In [64]:
len(df2['city'].unique())
#There are 91 cities

91

In [66]:
df2.isna().any()
#There is no missing values of the dataframe

lat        False
lon        False
date       False
city       False
country    False
dtype: bool

#### Q15: group by country name and show the city names in each of the country

In [83]:
dfgroup = df2.groupby('country')['city'].unique()

In [84]:
print(dfgroup)

country
Albania                                                [Berat, Gjirokastër]
Bosnia and Herzegovina                                   [Sarajevo, Mostar]
Croatia                                           [Ploče, Split, Dubrovnik]
Czech Republic            [Novy Bohumin, Hranice, Prerov, Ústí nad Orlic...
Germany                   [Kümmersbruck, Winkelhaid, Kammerstein, Ellhof...
Greece                    [Kakavia, Dytiki Ellada, Peloponnese, Athens, ...
Kosovo                                                            [Prizren]
Macedonia (FYROM)                                                   [Ohrid]
Montenegro                                                          [Kotor]
Poland                    [Zendek, Silesian Voivodeship, Dabrowa Gornicz...
Portugal                  [Lisbon, Algueirão-Mem Martins, Sintra, Azambu...
Serbia                                                [Belgrade, Slepčević]
Spain                     [El Prat de Llobregat, Viladecans, Barcelona, ...
Turk

### Part 3 <br /> How to select based on a date-time values

In [129]:
# load the location data set, indexed by the date field
# and, parse the dates so they're no longer strings but now rather Python datetime objects
# this lets us do date and time based operations on the data set
df3 = pd.read_csv('data/summer-travel-gps-full.csv', index_col='date', parse_dates=True)
df3.head()

Unnamed: 0_level_0,lat,lon,city,country
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014-05-14 09:07:00,51.481292,-0.451011,West Drayton,United Kingdom
2014-05-14 09:22:00,51.474005,-0.450999,Hounslow,United Kingdom
2014-05-14 10:51:00,51.478199,-0.446081,Hounslow,United Kingdom
2014-05-14 11:24:00,51.478199,-0.446081,Hounslow,United Kingdom
2014-05-14 11:38:00,51.474146,-0.451562,Hounslow,United Kingdom


In [130]:
df3

Unnamed: 0_level_0,lat,lon,city,country
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014-05-14 09:07:00,51.481292,-0.451011,West Drayton,United Kingdom
2014-05-14 09:22:00,51.474005,-0.450999,Hounslow,United Kingdom
2014-05-14 10:51:00,51.478199,-0.446081,Hounslow,United Kingdom
2014-05-14 11:24:00,51.478199,-0.446081,Hounslow,United Kingdom
2014-05-14 11:38:00,51.474146,-0.451562,Hounslow,United Kingdom
...,...,...,...,...
2014-07-09 13:13:00,48.356013,11.791710,Munich,Germany
2014-07-09 13:14:00,48.356529,11.792183,Munich,Germany
2014-07-09 13:17:00,48.356285,11.791710,Munich,Germany
2014-07-09 13:18:00,48.355328,11.791710,Munich,Germany


#### Q16: is the timestamp index unique? How can you use code to find it? 

In [140]:
df3.index.value_counts()
#timestamp index is not unique.  Data on 07/05/2014 00:52 in Greece appears 2 times.

2014-07-05 00:52:00    2
2014-06-17 00:52:00    1
2014-06-17 06:07:00    1
2014-06-17 05:22:00    1
2014-06-17 04:52:00    1
                      ..
2014-05-31 14:09:00    1
2014-05-31 13:55:00    1
2014-05-31 13:40:00    1
2014-05-31 13:25:00    1
2014-07-09 13:20:00    1
Name: date, Length: 1758, dtype: int64

#### Q17: drop duplicate index 

In [146]:
df3.index.duplicated()

array([False, False, False, ..., False, False, False])

In [153]:
df3 = df3[~df3.index.duplicated()]

In [154]:
df3

Unnamed: 0_level_0,lat,lon,city,country
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014-05-14 09:07:00,51.481292,-0.451011,West Drayton,United Kingdom
2014-05-14 09:22:00,51.474005,-0.450999,Hounslow,United Kingdom
2014-05-14 10:51:00,51.478199,-0.446081,Hounslow,United Kingdom
2014-05-14 11:24:00,51.478199,-0.446081,Hounslow,United Kingdom
2014-05-14 11:38:00,51.474146,-0.451562,Hounslow,United Kingdom
...,...,...,...,...
2014-07-09 13:13:00,48.356013,11.791710,Munich,Germany
2014-07-09 13:14:00,48.356529,11.792183,Munich,Germany
2014-07-09 13:17:00,48.356285,11.791710,Munich,Germany
2014-07-09 13:18:00,48.355328,11.791710,Munich,Germany


#### Q18: create a weekday and a weekend dataframe

In [169]:
df3_weekend = df3[df3.index.dayofweek > 4] #'date' already is DatetimeIndex which can directly call .dayofweek

In [170]:
df3_weekdays = df3[df3.index.dayofweek <= 4]

In [171]:
df3_weekend

Unnamed: 0_level_0,lat,lon,city,country,day_of_week
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014-05-17 00:09:00,38.711824,-9.137308,Lisbon,Portugal,2014-05-17 00:09:00
2014-05-17 00:39:00,38.711833,-9.137317,Lisbon,Portugal,2014-05-17 00:39:00
2014-05-17 01:09:00,38.711822,-9.137277,Lisbon,Portugal,2014-05-17 01:09:00
2014-05-17 01:39:00,38.711823,-9.137273,Lisbon,Portugal,2014-05-17 01:39:00
2014-05-17 01:54:00,38.711828,-9.137279,Lisbon,Portugal,2014-05-17 01:54:00
...,...,...,...,...,...
2014-07-06 22:12:00,41.040212,28.988639,Istanbul,Turkey,2014-07-06 22:12:00
2014-07-06 22:27:00,41.043491,28.985364,Istanbul,Turkey,2014-07-06 22:27:00
2014-07-06 23:12:00,41.043476,28.985419,Istanbul,Turkey,2014-07-06 23:12:00
2014-07-06 23:23:00,41.043494,28.985460,Istanbul,Turkey,2014-07-06 23:23:00


In [172]:
df3_weekdays

Unnamed: 0_level_0,lat,lon,city,country,day_of_week
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014-05-14 09:07:00,51.481292,-0.451011,West Drayton,United Kingdom,2014-05-14 09:07:00
2014-05-14 09:22:00,51.474005,-0.450999,Hounslow,United Kingdom,2014-05-14 09:22:00
2014-05-14 10:51:00,51.478199,-0.446081,Hounslow,United Kingdom,2014-05-14 10:51:00
2014-05-14 11:24:00,51.478199,-0.446081,Hounslow,United Kingdom,2014-05-14 11:24:00
2014-05-14 11:38:00,51.474146,-0.451562,Hounslow,United Kingdom,2014-05-14 11:38:00
...,...,...,...,...,...
2014-07-09 13:13:00,48.356013,11.791710,Munich,Germany,2014-07-09 13:13:00
2014-07-09 13:14:00,48.356529,11.792183,Munich,Germany,2014-07-09 13:14:00
2014-07-09 13:17:00,48.356285,11.791710,Munich,Germany,2014-07-09 13:17:00
2014-07-09 13:18:00,48.355328,11.791710,Munich,Germany,2014-07-09 13:18:00
