# Introduction to Data Science with Python 3

**The Zen of Python**

In [1]:
import this

The Zen of Python, by Tim Peters

Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
Complex is better than complicated.
Flat is better than nested.
Sparse is better than dense.
Readability counts.
Special cases aren't special enough to break the rules.
Although practicality beats purity.
Errors should never pass silently.
Unless explicitly silenced.
In the face of ambiguity, refuse the temptation to guess.
There should be one-- and preferably only one --obvious way to do it.
Although that way may not be obvious at first unless you're Dutch.
Now is better than never.
Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!


**Arithmetic operators**

In [2]:
# addition of two integers
3 + 2

5

In [3]:
# exponentiation: ** is the power operator
4 ** 2

16

In [4]:
# true division: in Python 3, / on two integers returns a float
15 / 4

3.75

**Variables**

In [5]:
# bind the name A to the integer 15 (an assignment displays no output)
A = 15

In [6]:
# print() writes the value currently bound to A to standard output
print(A)

15


## Python Libraries for Data Science

- NumPy
- SciPy
- Pandas
- Scikit-Learn

- matplotlib
- seaborn 
- plotly

**Import libraries**

In [7]:
import numpy as np 
import scipy as sp
import pandas as pd
import matplotlib as mpl

In [8]:
# Load the salaries CSV straight from its public URL into a DataFrame
# (pandas downloads the file, so this cell needs network access).
df = pd.read_csv(
    filepath_or_buffer='http://rcs.bu.edu/examples/python/data_analysis/Salaries.csv',
)

**Data Wrangling**

In [9]:
# Data wrangling starts with a look at the data: show the first five rows
df.head(n=5)

Unnamed: 0,rank,discipline,phd,service,sex,salary
0,Prof,B,56,49,Male,186960
1,Prof,A,12,6,Male,93000
2,Prof,A,23,20,Male,110515
3,Prof,A,40,31,Male,131205
4,Prof,B,20,18,Male,104800


In [10]:
# dtype of a single column
df['salary'].dtype

dtype('int64')

In [11]:
# dtype of every column; pandas reports the text columns
# (rank, discipline, sex) as object
df.dtypes

rank          object
discipline    object
phd            int64
service        int64
sex           object
salary         int64
dtype: object

In [12]:
# bracket notation: select one column by name; the result is a Series
df['sex']

0       Male
1       Male
2       Male
3       Male
4       Male
5       Male
6       Male
7       Male
8       Male
9       Male
10      Male
11      Male
12      Male
13      Male
14      Male
15      Male
16      Male
17      Male
18      Male
19      Male
20      Male
21      Male
22      Male
23      Male
24      Male
25      Male
26      Male
27      Male
28      Male
29      Male
       ...  
48    Female
49    Female
50    Female
51    Female
52    Female
53    Female
54    Female
55    Female
56    Female
57    Female
58    Female
59    Female
60    Female
61    Female
62    Female
63    Female
64    Female
65    Female
66    Female
67    Female
68    Female
69    Female
70    Female
71    Female
72    Female
73    Female
74    Female
75    Female
76    Female
77    Female
Name: sex, Length: 78, dtype: object

In [13]:
# attribute notation: returns the same column as df['sex'].
# It only works when the column name is a valid Python identifier that does
# not clash with an existing DataFrame attribute or method -- e.g. df.rank
# is the DataFrame.rank() method, not the 'rank' column.
df.sex

0       Male
1       Male
2       Male
3       Male
4       Male
5       Male
6       Male
7       Male
8       Male
9       Male
10      Male
11      Male
12      Male
13      Male
14      Male
15      Male
16      Male
17      Male
18      Male
19      Male
20      Male
21      Male
22      Male
23      Male
24      Male
25      Male
26      Male
27      Male
28      Male
29      Male
       ...  
48    Female
49    Female
50    Female
51    Female
52    Female
53    Female
54    Female
55    Female
56    Female
57    Female
58    Female
59    Female
60    Female
61    Female
62    Female
63    Female
64    Female
65    Female
66    Female
67    Female
68    Female
69    Female
70    Female
71    Female
72    Female
73    Female
74    Female
75    Female
76    Female
77    Female
Name: sex, Length: 78, dtype: object

In [15]:
# Read a CSV file from the local filesystem
# (the path is relative to the current working directory).
df_boston = pd.read_csv(filepath_or_buffer='./data/salaries.csv')

In [16]:
# first five rows of the locally loaded file
df_boston.head(n=5)

Unnamed: 0,rank,discipline,phd,service,sex,salary
0,Prof,B,56,49,Male,186960
1,Prof,A,12,6,Male,93000
2,Prof,A,23,20,Male,110515
3,Prof,A,40,31,Male,131205
4,Prof,B,20,18,Male,104800


In [17]:
# Build a GroupBy object keyed on the 'rank' column; no aggregation is
# computed until a method such as mean() is called on it.
df_rank = df.groupby(by=['rank'])

In [18]:
# Mean of every numeric column within each rank group.
# numeric_only=True is passed explicitly: since pandas 2.0, GroupBy.mean()
# no longer silently drops non-numeric columns (here: discipline, sex) and
# raises a TypeError instead. On older versions the result is unchanged.
df_rank.mean(numeric_only=True)

Unnamed: 0_level_0,phd,service,salary
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AssocProf,15.076923,11.307692,91786.230769
AsstProf,5.052632,2.210526,81362.789474
Prof,27.065217,21.413043,123624.804348


In [19]:
# group by sex
df_sex = df.groupby(['sex'])

# Mean of every numeric column within each group.
# numeric_only=True is passed explicitly: since pandas 2.0, GroupBy.mean()
# no longer silently drops non-numeric columns (here: rank, discipline) and
# raises a TypeError instead. On older versions the result is unchanged.
df_sex.mean(numeric_only=True)

Unnamed: 0_level_0,phd,service,salary
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,16.512821,11.564103,101002.410256
Male,22.897436,18.538462,115045.153846


In [20]:
# Mean salary per rank. The double brackets select a one-column DataFrame,
# so the aggregated result is displayed as a table.
(
    df
    .groupby('rank')[['salary']]
    .mean()
)

Unnamed: 0_level_0,salary
rank,Unnamed: 1_level_1
AssocProf,91786.230769
AsstProf,81362.789474
Prof,123624.804348


In [21]:
# Mean salary by professor rank; sort=False keeps the groups in order of
# first appearance in the data instead of sorting the group keys.
df.groupby(['rank'], sort=False)[['salary']].mean()

Unnamed: 0_level_0,salary
rank,Unnamed: 1_level_1
Prof,123624.804348
AssocProf,91786.230769
AsstProf,81362.789474


In [22]:
# Boolean-mask filtering: keep only the rows whose salary exceeds 120000
df_sub = df.loc[df['salary'] > 120000]

In [23]:
# display the filtered frame; the rows keep their original index labels
df_sub

Unnamed: 0,rank,discipline,phd,service,sex,salary
0,Prof,B,56,49,Male,186960
3,Prof,A,40,31,Male,131205
5,Prof,A,20,20,Male,122400
7,Prof,A,18,18,Male,126300
10,Prof,B,39,33,Male,128250
11,Prof,B,23,23,Male,134778
13,Prof,B,35,33,Male,162200
14,Prof,B,25,19,Male,153750
15,Prof,B,17,3,Male,150480
19,Prof,A,29,27,Male,150500


In [24]:
# Boolean-mask filtering on a text column: keep only the rows where sex is 'Female'
df_f = df.loc[df['sex'] == 'Female']

In [25]:
# display the filtered frame; the rows keep their original index labels
df_f

Unnamed: 0,rank,discipline,phd,service,sex,salary
39,Prof,B,18,18,Female,129000
40,Prof,A,39,36,Female,137000
41,AssocProf,A,13,8,Female,74830
42,AsstProf,B,4,2,Female,80225
43,AsstProf,B,5,0,Female,77000
44,Prof,B,23,19,Female,151768
45,Prof,B,25,25,Female,140096
46,AsstProf,B,11,3,Female,74692
47,AssocProf,B,11,11,Female,103613
48,Prof,B,17,17,Female,111512


In [26]:
# Select several columns at once by passing a list of names;
# the columns come back in the order listed.
df.loc[:, ['salary', 'rank']]

Unnamed: 0,salary,rank
0,186960,Prof
1,93000,Prof
2,110515,Prof
3,131205,Prof
4,104800,Prof
5,122400,Prof
6,81285,AssocProf
7,126300,Prof
8,94350,Prof
9,57800,Prof


In [27]:
# Positional row slice: rows at positions 10 through 19 (the end is exclusive)
df.iloc[10:20]

Unnamed: 0,rank,discipline,phd,service,sex,salary
10,Prof,B,39,33,Male,128250
11,Prof,B,23,23,Male,134778
12,AsstProf,B,1,0,Male,88000
13,Prof,B,35,33,Male,162200
14,Prof,B,25,19,Male,153750
15,Prof,B,17,3,Male,150480
16,AsstProf,B,8,3,Male,75044
17,AsstProf,B,4,0,Male,92000
18,Prof,A,19,7,Male,107300
19,Prof,A,29,27,Male,150500


In [28]:
# .loc slices by index *label* and includes both endpoints, so this returns
# the rows of df_sub whose labels lie between 10 and 20, restricted to the
# three listed columns.
df_sub.loc[
    10:20,
    ['salary', 'rank', 'sex'],
]

Unnamed: 0,salary,rank,sex
10,128250,Prof,Male
11,134778,Prof,Male
13,162200,Prof,Male
14,153750,Prof,Male
15,150480,Prof,Male
19,150500,Prof,Male


### Flights file

In [29]:
# Load the flights CSV from its public URL (requires network access)
flights = pd.read_csv(
    filepath_or_buffer='http://rcs.bu.edu/examples/python/data_analysis/flights.csv',
)

In [30]:
# first five rows of the flights data
flights.head(n=5)

Unnamed: 0,year,month,day,dep_time,dep_delay,arr_time,arr_delay,carrier,tailnum,flight,origin,dest,air_time,distance,hour,minute
0,2013,1,1,517.0,2.0,830.0,11.0,UA,N14228,1545,EWR,IAH,227.0,1400,5.0,17.0
1,2013,1,1,533.0,4.0,850.0,20.0,UA,N24211,1714,LGA,IAH,227.0,1416,5.0,33.0
2,2013,1,1,542.0,2.0,923.0,33.0,AA,N619AA,1141,JFK,MIA,160.0,1089,5.0,42.0
3,2013,1,1,554.0,-6.0,812.0,-25.0,DL,N668DN,461,LGA,ATL,116.0,762,5.0,54.0
4,2013,1,1,554.0,-4.0,740.0,12.0,UA,N39463,1696,EWR,ORD,150.0,719,5.0,54.0


In [31]:
# Rows containing at least one missing value: isna() flags each missing cell,
# any(axis='columns') collapses the flags to one boolean per row.
# Only the first five matching rows are shown.
flights[flights.isna().any(axis='columns')].head()

Unnamed: 0,year,month,day,dep_time,dep_delay,arr_time,arr_delay,carrier,tailnum,flight,origin,dest,air_time,distance,hour,minute
330,2013,1,1,1807.0,29.0,2251.0,,UA,N31412,1228,EWR,SAN,,2425,18.0,7.0
403,2013,1,1,,,,,AA,N3EHAA,791,LGA,DFW,,1389,,
404,2013,1,1,,,,,AA,N3EVAA,1925,LGA,MIA,,1096,,
855,2013,1,2,2145.0,16.0,,,UA,N12221,1299,EWR,RSW,,1068,21.0,45.0
858,2013,1,2,,,,,AA,,133,JFK,LAX,,2475,,


In [32]:
# Several summary statistics at once for the two delay columns
flights[['dep_delay', 'arr_delay']].aggregate(['min', 'mean', 'max'])

Unnamed: 0,dep_delay,arr_delay
min,-33.0,-75.0
mean,9.463773,2.094537
max,1014.0,1007.0


In [33]:
# Summary statistics for the numeric columns. The count row excludes missing
# values, which is why it differs between columns.
flights.describe()

Unnamed: 0,year,month,day,dep_time,dep_delay,arr_time,arr_delay,flight,air_time,distance,hour,minute
count,160754.0,160754.0,160754.0,158418.0,158418.0,158275.0,157927.0,160754.0,157927.0,160754.0,158418.0,158418.0
mean,2013.0,6.547395,15.716567,1316.146006,9.463773,1517.471161,2.094537,1156.344987,180.685158,1282.44542,12.837582,32.387847
std,0.0,3.410001,8.762794,470.823715,36.545109,510.695413,41.479349,695.884283,97.507866,765.895383,4.725552,18.687423
min,2013.0,1.0,1.0,1.0,-33.0,1.0,-75.0,1.0,21.0,17.0,0.0,0.0
25%,2013.0,4.0,8.0,855.0,-5.0,1112.0,-19.0,504.0,111.0,733.0,8.0,16.0
50%,2013.0,7.0,16.0,1345.0,-2.0,1541.0,-7.0,1157.0,153.0,1076.0,13.0,32.0
75%,2013.0,10.0,23.0,1725.0,7.0,1944.0,9.0,1715.0,258.0,1728.0,17.0,51.0
max,2013.0,12.0,31.0,2400.0,1014.0,2400.0,1007.0,2599.0,695.0,4963.0,24.0,59.0
