## Week 2: Pandas Series class, Time Series objects, Data Indexing, and Selection of elements in Series objects.

# Pandas, Numpy

- pandas is an open-source library that provides easy-to-use data structures and data analysis tools for the Python programming language.
- Open-source free software licenses impose minimal restrictions on the use and redistribution of the covered software.

In [23]:
# Import only the names this cell uses instead of `import *`, so the notebook
# namespace is not flooded with everything IPython.display exports.
from IPython.display import HTML, display
import webbrowser as wb

# Stretch the notebook container to the full width of the browser window.
display(HTML("<style>.container { width:100% !important; }</style>"))

# Open the Anaconda download page in the default browser; the value returned
# by wb.open() is what the cell displays as its result.
wb.open('https://www.anaconda.com/download/#macos')
#wb.open('https://www.cnn.com/')

True

In [24]:
import pandas as pd
import numpy as np

print(pd.__version__)
print(np.__version__)


1.1.3
1.20.2


In [4]:
! pip install --upgrade pip
! pip install --upgrade numpy



## Series
- A Series is a one-dimensional labeled array.
- https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.html

In [6]:
s = pd.Series([1, 5.5, 9])
print(s)

0    1.0
1    5.5
2    9.0
dtype: float64


In [7]:
s.describe()

count    3.000000
mean     5.166667
std      4.010403
min      1.000000
25%      3.250000
50%      5.500000
75%      7.250000
max      9.000000
dtype: float64

In [8]:
# If we like to see the values, then:
s.values

array([1. , 5.5, 9. ])

In [9]:
list(s.index)

[0, 1, 2]

In [10]:
s

0    1.0
1    5.5
2    9.0
dtype: float64

## Series with non-default index

In [11]:
s = pd.Series([75,82,88], index=['2020-08-01', '2020-08-12', '2020-08-16'])
s

2020-08-01    75
2020-08-12    82
2020-08-16    88
dtype: int64

In [12]:
s['2020-08-12']

82

In [13]:
s>80

2020-08-01    False
2020-08-12     True
2020-08-16     True
dtype: bool

In [14]:
s[s>80]

2020-08-12    82
2020-08-16    88
dtype: int64

In [15]:
'2020-08-12' in s

True

In [16]:
'2020-08-15' in s

False

## Insert new data

In [17]:
s['2020-10-15'] = 68
s

2020-08-01    75
2020-08-12    82
2020-08-16    88
2020-10-15    68
dtype: int64

In [18]:
s['2020-11-01'] = np.nan
s

2020-08-01    75.0
2020-08-12    82.0
2020-08-16    88.0
2020-10-15    68.0
2020-11-01     NaN
dtype: float64

In [19]:
pd.isnull(s)

2020-08-01    False
2020-08-12    False
2020-08-16    False
2020-10-15    False
2020-11-01     True
dtype: bool

In [20]:
pd.notnull(s)

2020-08-01     True
2020-08-12     True
2020-08-16     True
2020-10-15     True
2020-11-01    False
dtype: bool

## Multi-index

In [21]:
# Labels for the outer ("One") and inner ("Two") levels of the index.
L1 = ['a', 'b']
L2 = [0, 1]

# MultiIndex.from_product pairs every label in L1 with every label in L2
# (L1 varies slowest), replacing the hand-written list of tuples.
index = pd.MultiIndex.from_product([L1, L2], names=['One', 'Two'])

# One value per (One, Two) combination, in the order the index enumerates them.
s = pd.Series([1, 4, 8, 3], index=index)
print(s)



One  Two
a    0      1
     1      4
b    0      8
     1      3
dtype: int64


## Series Type

In [None]:
s = pd.Series([1,5,7,8], dtype='object')
s

## Series Name

In [None]:
pd.Series([6,8,3,5], name='California')

## Deep vs Shallow Copy 

In [None]:
s = pd.Series([6,8,3,5])
print(s)
# deep=False makes a shallow copy: a new Series object is created, but (per the
# pandas documentation) it references the original's data instead of copying it.
s1 = s.copy(deep=False)
# Modify the original AFTER taking the copy ...
s[0] = 1000
# ... and display the copy to see whether the change shows through.
# NOTE(review): without Copy-on-Write (e.g. the pandas 1.1.3 printed earlier in
# this notebook) s1[0] is expected to show 1000; with Copy-on-Write enabled the
# copy keeps 6 — confirm on the pandas version in use.
s1

## Purely INTEGER-location based indexing for selection by position.

In [None]:
s = pd.Series([8,6,9,3], index = [100, 101, 102, 103])
print(s.iloc[1]) # iloc selects by integer position: position 1 is the second element
print(s.loc[101]) # loc selects by index label: label 101 is that same second element

## Is monotonic increasing / is monotonic decreasing

In [None]:
s = pd.Series([4,9,10,26])
s.is_monotonic_increasing

In [None]:
s = pd.Series([100, 25, 10, 7])
s.is_monotonic_decreasing

## Size

In [None]:
s = pd.Series([1,8,5])
s.size

## Absolute values

In [None]:
s = pd.Series([1,-5,7, -4])
s.abs()

## Add two series objects

In [None]:
s.add(s)

## Aggregation

In [None]:
# Reduce the Series (s comes from the preceding cells) to single values.
# The string names dispatch to pandas' own sum/min/max. Passing the NumPy
# callables (np.sum, np.min, np.max) yields the same results but raises a
# FutureWarning on pandas 2.x, which recommends the string form.
print(s)
print(s.aggregate(func='sum'))
print(s.aggregate(func='min'))
print(s.aggregate(func='max'))

## Any and all

In [None]:
s = pd.Series([1, 5, 7, 4])
# Reduce each boolean mask with the Series' own vectorised all()/any()
# instead of iterating over it element by element with the Python builtins.
print((s > 0).all())     # True: every element is positive
print((s == 7).any())    # True: at least one element equals 7
print((s == 10).any())   # False: no element equals 10

## Cumsum

In [None]:
s = pd.Series([1,5,7, 4])
print(s)
s.cumsum()

## argmin argmax

In [22]:
s = pd.Series([1,5,7, 4])

print(s.argmax())
print(s.argmin())

2
0


## Divide two series objects

In [None]:
s1 = pd.Series([5,10,25])
s2 = pd.Series([20, 70, 50])
s2.divide(s1)

## Dot product

In [None]:
s1 = pd.Series([3,0,2])
s2 = pd.Series([1, 7, 4])
s1.dot(s2)

## Find duplicates

In [27]:
s = pd.Series([3,7,3,5,7])
s.duplicated()

0    False
1    False
2     True
3    False
4     True
dtype: bool

## Drop duplicates

In [None]:
s = pd.Series([3,7,3,5,7])
s.drop_duplicates()

## Head and Tail

In [None]:
s = pd.Series([3,7,3,5,7])
print(s.head(2))
print(s.tail(1))

## isin

In [25]:
s = pd.Series([3,7,3,5,7])
s.isin([10,7])

0    False
1     True
2    False
3    False
4     True
dtype: bool

## isna

In [26]:
s = pd.Series([3,np.nan, 5, np.nan])
print(s.isna())
print(s.notnull())

0    False
1     True
2    False
3     True
dtype: bool
0     True
1    False
2     True
3    False
dtype: bool


## quantile

In [None]:
s = pd.Series([3,np.nan, 5, np.nan])
s.quantile(q=[0.25, 0.75])

## Shift

In [None]:
s = pd.Series([3,8, 5, 4])
s.shift(2)

## Sampling

In [None]:
s = pd.Series([3,8, 5, 4])
s.sample(n=1)

## Mean & std

In [None]:
s = pd.Series([3,8, 5, 4])
print(s.mean())
print(s.std())

## To clipboard / CSV / XLSX / JSON

In [None]:
# Export the same named Series through several pandas writers.
s = pd.Series([3,8, 5, 4], name='XXX')
s.to_clipboard()  # copies the Series to the system clipboard
s.to_csv('test.csv')  # relative path, so the file lands in the current working directory
s.to_excel('test.xlsx')  # NOTE(review): presumably needs an Excel writer engine (e.g. openpyxl) installed — confirm
s.to_json()  # no path given, so the JSON is returned (and shown as the cell result) rather than written to disk

## to_list

In [None]:
s = pd.Series([3,8, 5, 4], name='XXX')
s.to_list()

## Time series

In [None]:
s = pd.Series(['2021-01-15', '2021-02-25', '2021-01-07', '2021-01-10'], dtype='datetime64[ns]')
s.sort_values()

In [None]:
idx = pd.Series(['2021-01-15', '2021-02-25', '2021-01-07', '2021-01-10'], dtype='datetime64[ns]')
s = pd.Series([5,8,10,3], index = idx)
s
