(CONFIDENTIAL) INTERNAL USE ONLY, NOT FOR EXTERNAL DISTRIBUTION

<h1 id="tocheading">Table of Contents</h1>
<div id="toc"></div>

In [None]:
%%javascript
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')

# Jupyter Notebook

Basic concept: 

* What is the environment (how it works)
* What is a cell (code/markdown)
* Basic notebook operations

In [2]:
# short-cut:
# insert below (B)
# insert above (A)
# delete current cell (dd)
# shortcut help (h)
# run current cell (Ctrl + Enter)
# run current cell, then move to the next cell (Shift + Enter)

In [None]:
# 0 
# calculator 2 
# define advanced structure type (list, set) + 2
# define function + 2
# define class, module + 2
# package + 2



# Python Basics

In [None]:
# https://www.toptal.com/python/python-3-is-it-worth-the-switch
# https://blog.appdynamics.com/engineering/the-key-differences-between-python-2-and-python-3/

In [3]:
# list, set, tuple, dict
# have better understanding on the data structure

10

# Numpy and some important packages

In [None]:
# Key data structure: numpy array

# packages like datetime, random are important

In [4]:
import numpy as np

In [5]:
[1, 'sdf', lambda x: x*2]

[1, 'sdf', <function __main__.<lambda>>]

In [15]:
a = np.array([1., 2, 3])

In [17]:
# a.

2.0

In [None]:
# np.random.

## Datetime module

In [None]:
# Datetime is usually needed for any time series related analysis
# Three objects: datetime, date, time

# key points: 
# 1. timezone-aware vs. naive datetimes
# 2. use pytz to change the time zone (astimezone)
# 3. time differences (timedelta)

In [18]:
import datetime
from pytz import timezone

In [19]:
t1 = datetime.datetime(2017,8,3)
t2 = datetime.datetime(2018,10,6)

In [22]:
type(t1)

datetime.datetime

In [23]:
type(t2-t1)

datetime.timedelta

In [21]:
(t2-t1).days

429

In [None]:
t1 = datetime.datetime(2016,12,1)
t2 = datetime.datetime(2016,10,1, tzinfo=timezone('UTC'))

In [None]:
# type of t1, t2
# t1 - t2
# astimezone
# t1 - t2
# time delta

## Regular Expression module

In [None]:
# regular expression is very important for text processing

# Two APIs: re.XXX(pattern, STRING) or pattern.XXX(STRING)
# re.match (start of the string)
# re.search (any place in the string, find first one)
# re.findall/finditer (any place in the string, all ones)
# re.split (Split string by the occurrences of pattern)

In [24]:
import re

In [27]:
# RE.MATCH
print(re.match('AA', 'AAbc'))
print(re.match('AA', 'bcAA'))
print(re.match('AA', 'AAAbcAA'))

<_sre.SRE_Match object; span=(0, 2), match='AA'>
None
<_sre.SRE_Match object; span=(0, 2), match='AA'>


In [28]:
# RE.COMPILE
pattern = re.compile('AA')
print(pattern.match('AAbc'))
print(pattern.match('bcAA'))

<_sre.SRE_Match object; span=(0, 2), match='AA'>
None


In [29]:
# RE.SEARCH
pattern = re.compile('AA')
print(pattern.search('AAbc'))
print(pattern.search('bcAA'))

<_sre.SRE_Match object; span=(0, 2), match='AA'>
<_sre.SRE_Match object; span=(2, 4), match='AA'>


In [32]:
# RE.FINDALL
pattern = re.compile('A+A')
print(pattern.findall('AAAAbc'))
print(pattern.findall('bcAA'))

['AAAA']
['AA']


In [33]:
# RE.FINDITER -- like findall, but yields a match object (with span) per occurrence
pattern = re.compile('AA')
print(list(pattern.finditer('AAbc')))
print(list(pattern.finditer('bcAA')))

[<_sre.SRE_Match object; span=(0, 2), match='AA'>]
[<_sre.SRE_Match object; span=(2, 4), match='AA'>]


In [34]:
# RE.SPLIT
pattern = re.compile('AA')
print(pattern.split('AAbc'))
print(pattern.split('bcAA'))

['', 'bc']
['bc', '']


In [35]:
# RE.SUB
pattern = re.compile('AA')
print(pattern.sub('BB', 'AAbc'))
print(pattern.sub('BB', 'bcAA'))

BBbc
bcBB


In [36]:
pattern = re.compile('AA')
the_str = 'AAbc'

In [37]:
%%timeit
for i in range(10000):
    if pattern.search(the_str):
        b = the_str.replace('AA', 'BB')

100 loops, best of 3: 5.6 ms per loop


In [38]:
%%timeit
for i in range(10000):
    b = pattern.sub('BB', the_str)

100 loops, best of 3: 4.88 ms per loop


In [None]:
# Match a simple e-mail pattern (teaching example, not a complete address validator).
# The pattern is written as a raw string so that \w and \. reach the regex engine
# unchanged; in a plain string literal they are invalid escape sequences and trigger
# a DeprecationWarning / SyntaxWarning on recent Python versions.
EMAIL_PATTERN = re.compile(r'^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$')
EMAIL_PATTERN.match('abc@gmail.com').group()

## MultiProcess module

In [None]:
# Multithreading in Python does NOT help with CPU-heavy jobs; it only helps with I/O-heavy jobs,
# due to the CPython implementation's Global Interpreter Lock (GIL)

In [None]:
# How to easily use multiProcessing? Pool

In [39]:
from multiprocessing import Pool
import time

In [40]:
# Define a function which is more CPU heavy
def is_prime(num):
    """Return True when ``num`` is a prime number, otherwise False.

    Values up to 3 are decided directly, multiples of 2 and 3 are rejected,
    and the remaining candidates are trial-divided by the pairs
    (5, 7), (11, 13), ... until the divisor squared exceeds ``num``.
    """
    if num <= 1:
        return False
    if num <= 3:
        return True
    if num % 2 == 0 or num % 3 == 0:
        return False

    divisor = 5
    while divisor * divisor <= num:
        # Test both members of the current pair: divisor and divisor + 2.
        if num % divisor == 0 or num % (divisor + 2) == 0:
            return False
        divisor += 6
    return True

def sum_prime(num):
    """Return the sum of all primes p with 2 <= p <= ``num`` (0 when there are none).

    Every candidate is checked with ``is_prime``, which makes this a
    deliberately CPU-heavy function for the multiprocessing demo.
    """
    total = 0
    candidate = 2
    while candidate <= num:
        if is_prime(candidate):
            total += candidate
        candidate += 1
    return total

In [41]:
# Baseline: evaluate the three sum_prime calls one after another in this process.
# The measured wall-clock time (in seconds) also covers printing the results.
start = time.time()
print(list(map(sum_prime, [300000, 600000, 900000])))
print("Time taken = {0:.5f}".format(time.time() - start))

[3709507114, 14071826345, 30689332265]
Time taken = 4.06532


In [45]:
# Same workload, now mapped over a pool of 2 worker processes.
# Changing the pool size from 1 to 2 makes the run faster
# (compare the "Time taken" outputs of this cell and the single-process cell).
# The timed region includes creating and shutting down the pool.
start = time.time()
with Pool(2) as p:
    print(p.map(sum_prime, [300000, 600000, 900000]))
print("Time taken = {0:.5f}".format(time.time() - start))

[3709507114, 14071826345, 30689332265]
Time taken = 2.89471


In [None]:
# Time series analysis

# Pandas: data analytics

## Series

In [46]:
import pandas as pd

In [53]:
a = pd.Series(data=[1, 2, 3], index=['A', 'B', 'C'], name="S1")

In [54]:
a

A    1
B    2
C    3
Name: S1, dtype: int64

In [61]:
a.index

Index(['A', 'B', 'C'], dtype='object')

In [59]:
type(a.values)

numpy.ndarray

In [58]:
a.name

'S1'

### Attribute

In [None]:
# type related attributes & operation
# dtype, access element (ix, iloc, loc), shape, size, index, values

In [63]:
a.shape

(3,)

In [64]:
a.size

3

In [66]:
a.loc['A']

1

In [67]:
a.iloc[0]

1

In [68]:
a.iloc[0:2]

A    1
B    2
Name: S1, dtype: int64

In [69]:
a.loc['A':'B']

A    1
B    2
Name: S1, dtype: int64

### Method

In [None]:
# operation related
# astype, describe, head, tail, isin, functions & apply, statistical functions
# nunique, unique, duplicated, drop_duplicates, repeat, sample
# basic plot
# sort_index, sort_values

In [70]:
a.describe()

count    3.0
mean     2.0
std      1.0
min      1.0
25%      1.5
50%      2.0
75%      2.5
max      3.0
Name: S1, dtype: float64

In [72]:
a.isin([1, 2])

A     True
B     True
C    False
Name: S1, dtype: bool

In [73]:
a.apply(lambda x: x**2)

A    1
B    4
C    9
Name: S1, dtype: int64

In [80]:
b = a.copy()

In [81]:
# Assigning through .loc to a label that is not in the index enlarges the Series:
# a new row labelled 2 is appended (see the output of the next cell).
# NOTE(review): the displayed outputs of `a` and `b` also contain a label `1` that no
# visible cell creates -- presumably left over from a removed cell; confirm by
# re-running from a fresh kernel.
a.loc[2] = 10

In [82]:
a

A     1
B     2
C     3
1    10
2    10
Name: S1, dtype: int64

In [83]:
b

A     1
B     2
C     3
1    10
Name: S1, dtype: int64

In [None]:
# visualization

## DataFrame

### Attribute

In [89]:
df = pd.DataFrame(data=[[1,2,3], [4,5,6]], index=['A', 'B'], columns=['a','b','c'])

In [92]:
df

Unnamed: 0,a,b,c
A,1,2,3
B,4,5,6


In [91]:
df.loc['A']

a    1
b    2
c    3
Name: A, dtype: int64

In [93]:
df['a']

A    1
B    4
Name: a, dtype: int64

In [94]:
df.loc['A', 'a']

1

In [95]:
df.shape

(2, 3)

In [96]:
df.size

6

### Method

In [None]:
# describe, operation (+-*/), apply, set_index/value
# where, mask (fillna)
# sort_index, sort_value
# query

In [98]:
df.apply(lambda x: x**2)

Unnamed: 0,a,b,c
A,1,4,9
B,16,25,36


In [99]:
df['d'] = df['a'] * 2

In [104]:
df.query("(a>2) and (b < 8)")[['a','c']]

Unnamed: 0,a,c
B,4,6


In [None]:
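# Why the double square brackets? I run into this situation a lot as well.
# Could the instructor explain the concept and use cases of list comprehensions?
# axis=0 vs. axis=1: which one is horizontal and which one is vertical? I always mix them up.
# Do join directly under pd and join inside query achieve the same effect?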

In [106]:
a = [1, 2, 3]

In [107]:
[x*2 for x in range(10)]

[0, 2, 4, 6, 8, 10, 12, 14, 16, 18]

In [108]:
a = []
for i in range(10):
    a.append(i*2)

In [110]:
df

Unnamed: 0,a,b,c,d
A,1,2,3,2
B,4,5,6,8


In [111]:
df.shape

(2, 4)

In [114]:
df.sum(axis=1)

A     8
B    23
dtype: int64

In [116]:
df.join?

In [117]:
pd.merge?

In [None]:
# select
# filter (where)   => subset
# join

# Pandas: Case Studies

## Titanic example
Data Resource: https://www.kaggle.com/c/titanic

In [118]:
df = pd.read_csv('data/titanic.csv')

In [None]:
# Exploration (basic statistics)
# Selection and indexing
# Missing data
# Gender vs. Survival (pivot_table)
# Age vs. survival (cut)
# Split-apply-combine
# Merge, join

In [120]:
df.shape

(891, 12)

In [121]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [122]:
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [125]:
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [126]:
df1 = df.drop('Cabin', axis=1)

In [127]:
df1.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Embarked         2
dtype: int64

In [133]:
# df1['Embarked'].isnull()

In [141]:
df2 = df1.loc[lambda x: x['Embarked'].notnull()]

In [142]:
df2.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Embarked         0
dtype: int64

In [137]:
# Fill the remaining missing ages with 30 (close to the mean age of ~29.7 shown by describe()).
# Naming the column keeps the placeholder from being written into any other column.
df3 = df2.fillna({'Age': 30})

In [139]:
# A bare `df3.isna().sum()` in the middle of a cell is evaluated and thrown away
# (only the last expression of a cell is displayed), so check the result explicitly:
# after fillna there must be no missing value left.
assert not df3.isna().any().any(), "df3 still contains missing values"
print(df3.shape)

(889, 11)


In [147]:
# The same cleaning steps as above, written as one method chain:
#   1. drop the Cabin column (687 of 891 values are missing),
#   2. keep only the rows with a known port of embarkation,
#   3. fill the remaining missing ages with 30 (close to the mean age of ~29.7).
# fillna is restricted to Age so the placeholder cannot leak into another column.
df1 = (
    df.drop('Cabin', axis=1)
    .loc[lambda x: x['Embarked'].notnull()]
    .fillna({'Age': 30})
)

In [154]:
df1.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Embarked'],
      dtype='object')

In [151]:
# Number of survivors per sex. Selecting the column before aggregating sums only
# `Survived` instead of every column of the frame (including text columns such as
# Name and Ticket); the resulting Series is the same.
df1.groupby('Sex')['Survived'].sum()

Sex
female    231
male      109
Name: Survived, dtype: int64

In [156]:
df1.groupby('Sex').agg({'Survived': ['sum', 'count'], 'PassengerId': 'count'})

Unnamed: 0_level_0,Survived,Survived,PassengerId
Unnamed: 0_level_1,sum,count,count
Sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
female,231,312,312
male,109,577,577


## Tweets Example
Twitter data was scraped from February of 2015 and contributors were asked to first classify positive, negative, and neutral tweets, followed by categorizing negative reasons (such as "late flight" or "rude service").

Data Resource: https://www.kaggle.com/crowdflower/twitter-airline-sentiment

In [None]:
import pandas as pd
tw = pd.read_csv('data/tweets.csv')

In [None]:
tw.dtypes

In [None]:
# Step 1. data quality check
# check whether every tweet ID consists of exactly 18 digits

# tw.tweet_id.apply(lambda x: True if re.match(r'^\d{18}$',str(x)) else False)
# second way to check that the tweet IDs have 18 digits
# tw['tweet_id'] = tw['tweet_id'].astype(str)
# match = [True if re.match(r'^\d{18}$', eachitem) else False for eachitem in tw['tweet_id'] ]

# Convert the column to str once so every element can be passed to the re functions.
# NOTE(review): presumably missing reasons become the string 'nan' here -- confirm
# for the installed pandas version.
tw['negativereason'] = tw['negativereason'].astype(str)

# re.match anchors at the start of the string:
# the negative reason begins with "Flight Booking Problems"
match_reason1 = [bool(re.match(r'Flight Booking Problems', eachitem)) for eachitem in tw['negativereason']]

# re.match: the negative reason begins with "Flight"
match_reason2 = [bool(re.match(r'Flight', eachitem)) for eachitem in tw['negativereason']]

# re.search: "Flight" occurs anywhere in the negative reason
search_reason = [bool(re.search(r'Flight', eachitem)) for eachitem in tw['negativereason']]

# re.search: "Service" occurs anywhere in the negative reason
search_reason2 = [bool(re.search(r'Service', eachitem)) for eachitem in tw['negativereason']]

# search the tweet text for flight cancellations, spelling out both letter cases by hand
tw['text'] = tw['text'].astype(str)
search_text = [bool(re.search(r'[Cc][Aa][Nn][Cc][Ee][Ll][Ll][Ee][Dd]|[Cc][Aa][Nn][Cc][Ee][Ll]', eachitem)) for eachitem in tw['text']]

In [None]:
# re.I / re.IGNORECASE: perform case-insensitive matching.
# (The "cancelled" alternative is redundant for a yes/no search: "cancel" already matches it.)
search_text2 = [bool(re.search(r'cancel|cancelled', eachitem, re.I)) for eachitem in tw['text']]

# Split the tweet texts on runs of whitespace.
# str(tw['text']) would be the printed representation of the Series (index labels,
# "..." truncation, Name/dtype footer), not the tweets themselves, so the texts are
# joined explicitly. The pattern is a raw string because '\s' is an invalid escape
# sequence in a plain string literal.
split = re.split(r'\s+', ' '.join(tw['text'].astype(str)))

# Remove every character that is not in the allowed set (e.g. emoji).
# Inside [...] the characters `|`, `?` and every `^` after the first one are literals,
# so the class keeps letters, digits, `_`, whitespace, `.`, `:`, `/` and also `|`, `^`, `?`.
# NOTE(review): confirm that keeping `|`, `^` and `?` is intended.
p = re.compile(r'[^A-Za-z0-9_|^\s?|^\.|^\:|\/]')
tw['text'] = [p.sub("", eachitem) for eachitem in tw['text']]

In [None]:
# dtypes conversion
# operation on single columns
# operation on multiple columns
# RE over text (extract identity, etc)

## Time Series Example
This dataset contains complete records of daily rainfall patterns from January 1st, 1948 to December 12, 2017. Data Source: https://www.kaggle.com/rtatman/did-it-rain-in-seattle-19482017

This data was collected at the Seattle-Tacoma International Airport. The dataset contains five columns:

DATE = the date of the observation
PRCP = the amount of precipitation, in inches
TMAX = the maximum temperature for that day, in degrees Fahrenheit
TMIN = the minimum temperature for that day, in degrees Fahrenheit
RAIN = TRUE if rain was observed on that day, FALSE if it was not

In [None]:
df = pd.read_csv('data/rain.csv')

In [None]:
# Step 1. data quality check
# is DATE unique? is it complete? 
# are PRCP and RAIN consistent?
# are TMAX/TMIN reasonable?
# is PRCP reasonable?
# are the RAIN values correct?

# df.DATE.map(lambda x: x[0:4]).value_counts().sort_index().plot()
# value_count, plot, to_datetime, set_index, etc.

In [None]:
# Step 2. understand the trend
# rolling window
# aggregation
# UTC time zone awareness 

In [None]:
from datetime import datetime
from pytz import timezone

# NOTE(review): the `from datetime import datetime` at the top of this cell rebinds the
# name `datetime`, which the Datetime-module section bound to the module itself via
# `import datetime`; re-running those earlier cells afterwards would fail on
# `datetime.datetime(...)`.
fmt = "%Y-%m-%d %H:%M:%S %Z%z"

# One instant expressed in three time zones: UTC -> US/Pacific -> Europe/Berlin.
# astimezone() changes only the representation, not the moment in time.
now_utc = datetime.now(timezone('UTC'))
now_pacific = now_utc.astimezone(timezone('US/Pacific'))
now_berlin = now_pacific.astimezone(timezone('Europe/Berlin'))

# Print the three representations in the order UTC, Pacific, Berlin.
print("\n".join([now_utc.strftime(fmt), now_pacific.strftime(fmt), now_berlin.strftime(fmt)]))