In [1]:
# Data Science Course Week 1 - Data Manipulation using Pandas

## Here we will be using real data sources to explore the features of Pandas


In [2]:
import pandas as pd

In [6]:
1==5

False

# Class Workshop

## Numpy Overview

* Why Python for Data? Numpy brings *decades* of C math into Python!
* Numpy provides a wrapper for extensive C/C++/Fortran codebases, used for data analysis functionality
* The ndarray allows easy vectorized math and broadcasting (i.e. elementwise operations between arrays of different but compatible shapes)

In [9]:
import numpy as np

### Creating ndarrays
An array object represents a multidimensional, homogeneous array of fixed-size items.

In [12]:
# Two 1-D integer arrays: one from an explicit list, one from a range.
a = np.array([20, 30, 40, 50])
b = np.arange(4)    # 0, 1, 2, 3 -- the stop value is excluded
(a, b)              # a tuple as the last expression displays both arrays

(array([20, 30, 40, 50]), array([0, 1, 2, 3]))

In [17]:
help(np.arange)

Help on built-in function arange in module numpy.core.multiarray:

arange(...)
    arange([start,] stop[, step,], dtype=None)
    
    Return evenly spaced values within a given interval.
    
    Values are generated within the half-open interval ``[start, stop)``
    (in other words, the interval including `start` but excluding `stop`).
    For integer arguments the function is equivalent to the Python built-in
    `range <http://docs.python.org/lib/built-in-funcs.html>`_ function,
    but returns an ndarray rather than a list.
    
    When using a non-integer step, such as 0.1, the results will often not
    be consistent.  It is better to use ``linspace`` for these cases.
    
    Parameters
    ----------
    start : number, optional
        Start of interval.  The interval includes this value.  The default
        start value is 0.
    stop : number
        End of interval.  The interval does not include this value, except
        in some cases where `step` is not an integer and

In [20]:

c = a - b       # arithmetic operators work elementwise on arrays
(a, b, c)       # show both operands next to the result

(array([20, 30, 40, 50]), array([0, 1, 2, 3]), array([20, 29, 38, 47]))

In [8]:
b**2

array([0, 1, 4, 9])

In [21]:
# Building arrays of a requested shape
a = np.zeros(3)                                         # (3) is just the int 3: a 1-D array of three zeros
b = np.ones(shape=(2, 3))                               # 2 rows x 3 columns of ones
c = np.random.randint(low=1, high=10, size=(2, 3, 4))   # random integers drawn from [1, 10)
help(np.random.randint)                                 # print the docstring for randint

Help on built-in function randint:

randint(...)
    randint(low, high=None, size=None)
    
    Return random integers from `low` (inclusive) to `high` (exclusive).
    
    Return random integers from the "discrete uniform" distribution in the
    "half-open" interval [`low`, `high`). If `high` is None (the default),
    then results are from [0, `low`).
    
    Parameters
    ----------
    low : int
        Lowest (signed) integer to be drawn from the distribution (unless
        ``high=None``, in which case this parameter is the *highest* such
        integer).
    high : int, optional
        If provided, one above the largest (signed) integer to be drawn
        from the distribution (see above for behavior if ``high=None``).
    size : int or tuple of ints, optional
        Output shape.  If the given shape is, e.g., ``(m, n, k)``, then
        ``m * n * k`` samples are drawn.  Default is None, in which case a
        single value is returned.
    
    Returns
    -------
    

In [10]:
a

array([ 0.,  0.,  0.])

In [11]:
b

array([[ 1.,  1.,  1.],
       [ 1.,  1.,  1.]])

In [12]:
c

array([[[1, 4, 2, 7],
        [3, 2, 8, 4],
        [4, 1, 9, 4]],

       [[8, 5, 7, 3],
        [7, 8, 3, 9],
        [7, 6, 2, 6]]])

## Indexing, Slicing and Iterating

In [22]:
# 1-D arrays can be indexed and sliced just like lists;
# start from the squares of 0..9
a = np.arange(10)
a = a * a

In [23]:
a

array([ 0,  1,  4,  9, 16, 25, 36, 49, 64, 81])

In [24]:
a[2:5]

array([ 4,  9, 16])

In [25]:
a[0:3]

array([0, 1, 4])

In [26]:
a[1:3]

array([1, 4])

### Reading Files, Selecting Columns, and Summarizing

MovieLens 100k movie rating data:
    main page: http://grouplens.org/datasets/movielens/
    data dictionary: http://files.grouplens.org/datasets/movielens/ml-100k-README.txt
    files: u.user, u.data, u.item

In [27]:
# can read a file from local computer or directly from a URL;
# no separator is given here, so each pipe-delimited line lands in a single column
pd.read_table(
    'u.user',
    header=None,
)

Unnamed: 0,0
0,1|24|M|technician|85711
1,2|53|F|other|94043
2,3|23|M|writer|32067
3,4|24|M|technician|43537
4,5|33|F|other|15213
5,6|42|M|executive|98101
6,7|57|M|administrator|91344
7,8|36|M|administrator|05201
8,9|29|M|student|01002
9,10|53|M|lawyer|90703


Now having seen the data, read it in again but now with the pipe separator/delimiter.

In [28]:
# read 'u.user' and assign to 'users' so that we can re-use the data
user_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
users = pd.read_table(
    'u.user',
    sep='|',                    # fields are pipe-delimited
    header=None,                # the file has no header row...
    names=user_cols,            # ...so supply the column names
    index_col='user_id',
    dtype={'zip_code': str},    # read zip codes as text so leading zeros (e.g. 05201) survive
)

Now try running the example code below, uncommenting one line at a time to see the result of each.

In [35]:
# examine the users data
# (uncomment one line at a time: a cell only displays its last expression)

#users                   # print the first 30 and last 30 rows
#type(users)             # DataFrame
#users.head()            # print the first 5 rows
#users.head(10)          # print the first 10 rows
#users.tail()            # print the last 5 rows
#users.index             # "the index" (aka "the labels")
#users.columns           # column names (which is "an index")
#users.dtypes            # data types of each column
#users.shape             # number of rows and columns
#users.values            # underlying numpy array
users.info()            # concise summary: index range, per-column non-null counts and dtypes, memory usage

<class 'pandas.core.frame.DataFrame'>
Int64Index: 943 entries, 1 to 943
Data columns (total 4 columns):
age           943 non-null int64
gender        943 non-null object
occupation    943 non-null object
zip_code      943 non-null object
dtypes: int64(1), object(3)
memory usage: 36.8+ KB


Now try some of the different methods for selecting a column from a dataframe by name.

In [23]:
# select a column

#users['gender']         # select one column
#type(users['gender'])   # Series
#users[['gender']]
#type(users[['gender']])   # DataFrame
#users.gender            # select one column using the DataFrame attribute


Now try some built in pandas methods for producing summary descriptive information from a dataframe.

In [43]:
# summarize (describe) the data

#users.describe()                    # describe all numeric columns
#users.describe(include=['object'])  # describe all object columns (can include multiple types)
users.describe(include='all')       # describe all columns
#users.age.describe()             # describe a single column
#users.age.mean()                    # only calculate the mean


Unnamed: 0,age,gender,occupation,zip_code
count,943.0,943,943,943.0
unique,,2,21,795.0
top,,M,student,55414.0
freq,,670,196,9.0
mean,34.051962,,,
std,12.19274,,,
min,7.0,,,
25%,25.0,,,
50%,31.0,,,
75%,43.0,,,


In [45]:
# count the number of occurrences of each value
users['occupation'].value_counts()      # most useful for categorical variables
#users['age'].value_counts()            # can also be used with numeric variables

student          196
other            105
educator          95
administrator     79
engineer          67
programmer        66
librarian         51
writer            45
executive         32
scientist         31
artist            28
technician        27
marketing         26
entertainment     18
healthcare        16
retired           14
salesman          12
lawyer            12
none               9
homemaker          7
doctor             7
Name: occupation, dtype: int64

# Student Exercises

## Exercise One

### Data

WHO alcohol consumption data:
    article: http://fivethirtyeight.com/datalab/dear-mona-followup-where-do-people-drink-the-most-beer-wine-and-spirits/    
    original data: https://github.com/fivethirtyeight/data/tree/master/alcohol-consumption
    file: drinks.csv (with additional 'continent' column)


In [48]:
# read drinks.csv into a DataFrame called 'drinks'
drinks = pd.read_csv('drinks.csv')                  # comma is the default separator
#drinks = pd.read_table('drinks.csv', sep=',')      # equivalent, with the separator spelled out
# NOTE(review): with default NA parsing the continent code 'NA' (presumably North
# America) appears to be read as a missing value -- confirm against the raw file.

In [52]:
# print the head and the tail, then summarize
#drinks.head()
#drinks.tail()
#drinks.describe()
drinks['country'].describe()    # count / unique / top / freq for the text column

count         193
unique        193
top       Lesotho
freq            1
Name: country, dtype: object

In [55]:
# examine the default index, data types, and shape
drinks.index
#drinks.dtypes
#drinks.shape

RangeIndex(start=0, stop=193, step=1)

In [57]:
# print the 'beer_servings' Series
drinks.beer_servings            # attribute access...
#drinks['beer_servings']        # ...and bracket access return the same Series


0        0
1       89
2       25
3      245
4      217
5      102
6      193
7       21
8      261
9      279
10      21
11     122
12      42
13       0
14     143
15     142
16     295
17     263
18      34
19      23
20     167
21      76
22     173
23     245
24      31
25     231
26      25
27      88
28      37
29     144
      ... 
163    128
164     90
165    152
166    185
167      5
168      2
169     99
170    106
171      1
172     36
173     36
174    197
175     51
176     51
177     19
178      6
179     45
180    206
181     16
182    219
183     36
184    249
185    115
186     25
187     21
188    333
189    111
190      6
191     32
192     64
Name: beer_servings, dtype: int64

In [60]:
# calculate the average 'beer_servings' for the entire dataset
drinks.describe()                   # summarize all numeric columns
#drinks.beer_servings.describe()     # summarize only the 'beer_servings' Series
#drinks.beer_servings.mean()         # only calculate the mean


Unnamed: 0,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol
count,193.0,193.0,193.0,193.0
mean,106.160622,80.994819,49.450777,4.717098
std,101.143103,88.284312,79.697598,3.773298
min,0.0,0.0,0.0,0.0
25%,20.0,4.0,1.0,1.3
50%,76.0,56.0,8.0,4.2
75%,188.0,128.0,59.0,7.2
max,376.0,438.0,370.0,14.4


In [63]:

# count the number of occurrences of each 'continent' value and see if it looks correct
drinks['continent'].value_counts()
#drinks['country'].value_counts()


AF    53
EU    45
AS    44
OC    16
SA    12
Name: continent, dtype: int64

#### Filtering and Sorting

Using the `users` dataset again.

In [69]:
# logical filtering: only show users with age < 20
#young_bool = users['age'] < 20                 # build a Series of booleans...
#users.loc[young_bool]                          # ...then use it to pick rows
#users.loc[users['age'] < 20]                   # the same thing in a single step
#users.loc[users['age'] < 20, 'occupation']     # keep one column of the filtered rows
users.loc[users['age'] < 20, 'occupation'].value_counts()   # tally occupations among the under-20s

student          64
other             4
none              3
writer            2
entertainment     2
salesman          1
artist            1
Name: occupation, dtype: int64

In [74]:
# logical filtering with multiple conditions (each condition needs its own parentheses)
users.loc[(users['age'] >= 20) & (users['gender'] == 'M')]      # ampersand for AND condition
#users.loc[(users['age'] < 20) | (users['age'] > 60)]           # pipe for OR condition
#users.loc[users['occupation'].isin(['doctor', 'lawyer'])]      # alternative to multiple OR conditions


Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
3,23,M,writer,32067
4,24,M,technician,43537
6,42,M,executive,98101
7,57,M,administrator,91344
8,36,M,administrator,05201
9,29,M,student,01002
10,53,M,lawyer,90703
13,47,M,educator,29206
14,45,M,scientist,55106


In [77]:
# sorting
users['age'].sort_values()                      # sort one column (a Series), ascending
#users.sort_values(by='age')                    # sort a DataFrame by a single column
#users.sort_values(by='age', ascending=False)   # use descending order instead
#users.sort_values(by=['occupation', 'age'])    # sort by multiple columns


user_id
30      7
471    10
289    11
880    13
609    13
142    13
674    13
628    13
813    14
206    14
887    14
849    15
281    15
461    15
618    15
179    15
101    15
57     16
580    16
550    16
451    16
434    16
621    17
619    17
761    17
375    17
904    17
646    17
582    17
257    17
       ..
90     60
308    60
931    60
752    60
469    60
464    60
234    60
694    60
934    61
351    61
106    61
520    62
266    62
858    63
777    63
364    63
845    64
423    64
318    65
651    65
564    65
211    66
349    68
573    68
559    69
585    69
767    70
803    70
860    70
481    73
Name: age, dtype: int64

## Exercise Two

Using the drinks dataset again and referring to the code demonstrated above, write code to answer these questions:

In [86]:
# filter DataFrame to only include European countries
#drinks.head()
drinks.loc[drinks['continent'] == 'EU']     # boolean mask keeps the rows tagged 'EU'

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
1,Albania,89,132,54,4.9,EU
3,Andorra,245,138,312,12.4,EU
7,Armenia,21,179,11,3.8,EU
9,Austria,279,75,191,9.7,EU
10,Azerbaijan,21,46,5,1.3,EU
15,Belarus,142,373,42,14.4,EU
16,Belgium,295,84,212,10.5,EU
21,Bosnia-Herzegovina,76,173,8,4.6,EU
25,Bulgaria,231,252,94,10.3,EU
42,Croatia,230,87,254,10.2,EU


In [89]:
# filter DataFrame to only include European countries with wine_servings > 300
drinks.loc[(drinks['continent'] == 'EU') & (drinks['wine_servings'] > 300)]

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
3,Andorra,245,138,312,12.4,EU
61,France,127,151,370,11.8,EU
136,Portugal,194,67,339,11.0,EU


In [102]:
# calculate the average 'beer_servings' for all of Europe
drinks.loc[drinks['continent'] == 'EU', 'beer_servings'].mean()    # select rows and column in one step

193.77777777777777

In [109]:
# determine which 10 countries have the highest total_litres_of_pure_alcohol
# (sort descending, then keep the first ten rows)
drinks.sort_values(
    by='total_litres_of_pure_alcohol',
    ascending=False,
).iloc[:10]

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
15,Belarus,142,373,42,14.4,EU
98,Lithuania,343,244,56,12.9,EU
3,Andorra,245,138,312,12.4,EU
68,Grenada,199,438,28,11.9,
45,Czech Republic,361,170,134,11.8,EU
61,France,127,151,370,11.8,EU
141,Russian Federation,247,326,73,11.5,AS
81,Ireland,313,118,165,11.4,EU
155,Slovakia,196,293,116,11.4,EU
99,Luxembourg,236,133,271,11.4,EU


# Optional Exercises

### Merging Data

In [113]:
#help(pd.merge)
# read 'u.item' into 'movies' (only the first two columns: id and title)
movie_cols = ['movie_id', 'title']
movies = pd.read_table(
    'u.item',
    sep='|',
    header=None,
    names=movie_cols,
    usecols=[0, 1],
    # The MovieLens 100k 'u.item' file is Latin-1 encoded (some titles contain
    # accented characters); the default UTF-8 decoding fails on it under Python 3.
    encoding='latin-1',
)


In [111]:
# read 'u.data' into 'ratings'
rating_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table(
    'u.data',
    sep='\t',           # tab-delimited
    header=None,        # no header row in the file...
    names=rating_cols,  # ...so name the columns here
)

In [114]:
# merge 'movies' and 'ratings' (inner join on 'movie_id')
# The key and join type are spelled out so the merge cannot silently pick up
# some other column the two frames happen to share.
movie_ratings = pd.merge(movies, ratings, on='movie_id', how='inner')
#movies.shape
#ratings.shape
movie_ratings.shape     # a cell only displays its last expression

(100000, 5)

### Grouping Data

In [115]:
movie_ratings.columns

Index([u'movie_id', u'title', u'user_id', u'rating', u'timestamp'], dtype='object')

In [116]:
# total of all ratings given to each title (group first, then pick the column to sum)
movie_ratings.groupby('title')[['rating']].sum()

Unnamed: 0_level_0,rating
title,Unnamed: 1_level_1
'Til There Was You (1997),21
1-900 (1994),13
101 Dalmatians (1996),317
12 Angry Men (1957),543
187 (1997),124
2 Days in the Valley (1996),300
"20,000 Leagues Under the Sea (1954)",252
2001: A Space Odyssey (1968),1028
3 Ninjas: High Noon At Mega Mountain (1998),5
"39 Steps, The (1935)",239


### Handling Missing Values

In [117]:
# missing values are usually excluded by default
#drinks['continent'].value_counts()                 # excludes missing values
drinks['continent'].value_counts(dropna=False)      # includes missing values (shown as NaN)

AF     53
EU     45
AS     44
NaN    23
OC     16
SA     12
Name: continent, dtype: int64

In [120]:
# find missing values in a Series
#drinks.continent.isnull()           # True if missing, False if not missing
#drinks.continent.isnull().sum()     # count the missing values
#drinks.continent.notnull()          # True if not missing, False if missing
drinks[drinks.continent.isnull()]  # only show rows where continent is missing

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
5,Antigua & Barbuda,102,128,45,4.9,
11,Bahamas,122,176,51,6.3,
14,Barbados,143,173,36,6.3,
17,Belize,263,114,8,6.8,
32,Canada,240,122,100,8.2,
41,Costa Rica,149,87,11,4.4,
43,Cuba,93,137,5,4.2,
50,Dominica,52,286,26,6.6,
51,Dominican Republic,193,147,9,6.2,
54,El Salvador,52,69,2,2.2,


In [121]:
# use 'tilde' ~ to negate the boolean values
~drinks.continent.isnull()  

0       True
1       True
2       True
3       True
4       True
5      False
6       True
7       True
8       True
9       True
10      True
11     False
12      True
13      True
14     False
15      True
16      True
17     False
18      True
19      True
20      True
21      True
22      True
23      True
24      True
25      True
26      True
27      True
28      True
29      True
       ...  
163     True
164     True
165     True
166     True
167     True
168     True
169     True
170     True
171     True
172     True
173     True
174    False
175     True
176     True
177     True
178     True
179     True
180     True
181     True
182     True
183     True
184    False
185     True
186     True
187     True
188     True
189     True
190     True
191     True
192     True
Name: continent, dtype: bool

In [None]:
# side note: understanding axes
# numeric_only=True restricts the sums to the numeric columns: 'country' and
# 'continent' hold strings, and adding strings to numbers along axis=1 raises
# a TypeError on current pandas.
drinks.sum(axis=0, numeric_only=True)   # sums "down" the 0 axis (rows): one total per column
drinks.sum(numeric_only=True)           # axis=0 is the default
drinks.sum(axis=1, numeric_only=True)   # sums "across" the 1 axis (columns): one total per row

In [129]:
# find missing values in a DataFrame
drinks.isnull()             # DataFrame of booleans (not displayed: only the last expression is shown)
drinks.isnull().sum()       # count the missing values in each column
# NOTE(review): the saved output reports 0 missing continents although 23 were counted
# earlier; this cell (In [129]) appears to have been run after the fillna cell (In [128]).

country                         0
beer_servings                   0
spirit_servings                 0
wine_servings                   0
total_litres_of_pure_alcohol    0
continent                       0
dtype: int64

In [128]:
# fill in missing values
#drinks.continent.fillna(value='NA')                        # returns a new Series; 'drinks' itself is unchanged
# Assign the filled column back instead of calling fillna(inplace=True) on
# drinks.continent: that form is chained assignment, which warns on pandas 2.x
# and leaves 'drinks' unmodified once Copy-on-Write is enabled.
drinks['continent'] = drinks.continent.fillna(value='NA')   # replace missing continents with 'NA'
drinks.head(100)

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,AS
1,Albania,89,132,54,4.9,EU
2,Algeria,25,0,14,0.7,AF
3,Andorra,245,138,312,12.4,EU
4,Angola,217,57,45,5.9,AF
5,Antigua & Barbuda,102,128,45,4.9,
6,Argentina,193,25,221,8.3,SA
7,Armenia,21,179,11,3.8,EU
8,Australia,261,72,212,10.4,OC
9,Austria,279,75,191,9.7,EU
