In [117]:
import numpy as np
import pandas as pd

In [118]:
# create one item series
s1=pd.Series(2)
s1

0    2
dtype: int64

In [119]:
# build a Series from a Python list; several values must be wrapped in [ ],
# and pandas assigns the default integer index 0..n-1
s2 = pd.Series(data=[1, 2, 3, 4, 5])
s2

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [120]:
# Build a Series from a dict: the dict keys become the index labels.
tax_data={"Sindh":3500,"Punjab":4500,"KPK":2000,"Balochistan":1000}
tax_revenue=pd.Series(tax_data,name="Tax Revenue")
print(tax_revenue)
# Passing index= picks the dict entries by label, in the order listed.
tax_revenue = pd.Series(tax_data,index=['Punjab','Sindh','Balochistan','KPK'])
print(tax_revenue)
# A label with no matching dict key ('GB') gets NaN, which turns the values into floats.
tax_revenue = pd.Series(tax_data,index=['Punjab','Sindh','Balochistan','KPK','GB'],name="Revenue by State")
tax_revenue.index.name="State"
print(tax_revenue)
# Only the last expression of a cell is displayed, so one null check is enough here;
# tax_revenue.isnull() is the method form of pd.isnull(tax_revenue).
tax_revenue.isnull()

Sindh          3500
Punjab         4500
KPK            2000
Balochistan    1000
Name: Tax Revenue, dtype: int64
Punjab         4500
Sindh          3500
Balochistan    1000
KPK            2000
dtype: int64
State
Punjab         4500.0
Sindh          3500.0
Balochistan    1000.0
KPK            2000.0
GB                NaN
Name: Revenue by State, dtype: float64


State
Punjab         False
Sindh          False
Balochistan    False
KPK            False
GB              True
Name: Revenue by State, dtype: bool

In [121]:
# get the values in Series
s2.values

array([1, 2, 3, 4, 5])

In [122]:
# get the index of the series
s2.index

RangeIndex(start=0, stop=5, step=1)

In [123]:
# Series with a name: weekday labels form the index, `name` describes the values
sales = pd.Series(
    data=[5, 7, 3, 1, 4, 6, 7],
    index=['Mon', 'Tue', 'Wed', 'Thr', 'Fri', 'Sat', 'Sun'],
    name="Daily Canteen Sales",
)
sales

Mon    5
Tue    7
Wed    3
Thr    1
Fri    4
Sat    6
Sun    7
Name: Daily Canteen Sales, dtype: int64

In [124]:
#  explicitly create an index
#  index is alpha not integer
s3=pd.Series([1,2,3],index=['a','b','c'])
s3

a    1
b    2
c    3
dtype: int64

In [125]:
# lookup by label value with [] and by integer position with .iloc
# (plain s3[2] relies on a deprecated fallback that treats an integer key as a
# position when the index holds non-integer labels)
print(f"Value by label s3[c] is {s3['c']} and value by index s3[2] is {s3.iloc[2]}")
# access both by label and by position

Value by label s3[c] is 3 and value by index s3[2] is 3


In [126]:
# create a series on top of an existing index (here the index of s2)
# each item of the list is paired with one index label, in order
s4 = pd.Series(data=['A', 'B', 'C', 'D', 'E'], index=s2.index)
s4

0    A
1    B
2    C
3    D
4    E
dtype: object

In [127]:
s5 = pd.Series(np.array([22,33,44,55,66]),index=s4.values)
s5

A    22
B    33
C    44
D    55
E    66
dtype: int64

In [128]:
# create series from dict
s4=pd.Series({'a':1,'b':2,'c':3,'d':4})
s4

a    1
b    2
c    3
d    4
dtype: int64

# Size, shape, uniqueness and counts of values

In [129]:
# example series that also contains a missing value
# (np.nan is NumPy's NaN constant; the values are stored as float64)
s = pd.Series(data=[0, 1, 1, 2, 3, 4, 4, 5, 6, 7, np.nan])
s

0     0.0
1     1.0
2     1.0
3     2.0
4     3.0
5     4.0
6     4.0
7     5.0
8     6.0
9     7.0
10    NaN
dtype: float64

In [130]:
print(f"len:{len(s)}")
print(f"size:{s.size}")
print(f"shape:{s.shape}")
print(f"count:{s.count()}") #not null value
print(f"unique:{s.unique()}")
print(f"value counts:{s.value_counts()}")

len:11
size:11
shape:(11,)
count:10
unique:[ 0.  1.  2.  3.  4.  5.  6.  7. nan]
value counts:4.0    2
1.0    2
7.0    1
6.0    1
5.0    1
3.0    1
2.0    1
0.0    1
dtype: int64


# Peeking at data with heads, tails and take

In [131]:
# first five
s.head()

0    0.0
1    1.0
2    1.0
3    2.0
4    3.0
dtype: float64

In [132]:
# first three
s.head(n=3)

0    0.0
1    1.0
2    1.0
dtype: float64

In [133]:
# last five
s.tail()

6     4.0
7     5.0
8     6.0
9     7.0
10    NaN
dtype: float64

In [134]:
#  last three
s.tail(n=3)

8     6.0
9     7.0
10    NaN
dtype: float64

In [135]:
# The .take() method returns the rows of a Series that correspond to the given
# zero-based positions: only the listed items are taken, in the order given
# (a position may be repeated)
s.take([9,3,9])

9    7.0
3    2.0
9    7.0
dtype: float64

# Looking up values in Series

In [136]:
# single item lookup
print(s3)
s3['a']

a    1
b    2
c    3
dtype: int64


1

In [137]:
# Lookup by position: use .iloc, because the index holds string labels
# (s3[1] relies on a deprecated fallback that treats the integer as a position)
s3.iloc[1]

2

In [138]:
# multiple items
s3[['a','c']]

a    1
c    3
dtype: int64

In [139]:
# series with an integer index that does not start at 0
s5=pd.Series([1,2,3,4],index=[9,8,7,6])
s5
# note: the labels (9, 8, 7, 6) differ from the positions (0, 1, 2, 3)

9    1
8    2
7    3
6    4
dtype: int64

# Label based lookup vs Position based lookup

In [140]:
# with an integer index, [] treats the integer as a label:
# 8 is the label whose value is 2 (it is not position 8)
s5[8]

2

In [141]:
# if the labels are strings, [] / .loc look up by label and .iloc looks up by position
# if the labels are integers, [] looks up by label only, so use .iloc for positions

In [142]:
# .loc is strictly label-based: s5 has the labels 9, 8, 7, 6 and no label 2,
# so this lookup raises KeyError. Catch it and show the message so that
# "Run All" does not stop at this cell.
try:
    s5.loc[2]
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: 2

In [None]:
# instead we use integer location iloc OR INTEGER INDEXING
s5.iloc[2]

In [None]:
# multiple items by label
s5.loc[[9,7]]

In [None]:
s5[[8,9]]

# Alignment via index labels

A fundamental difference between a NumPy ndarray and a pandas Series is the ability of a Series to automatically align data from another Series based on label values before performing an operation.

In [None]:
s6=pd.Series([1,2,3,4],index=['a','b','c','d'])
s6

In [None]:
s7=pd.Series([4,3,2,1], index=['d','c','b','a'])
s7

In [None]:
# add s6 and s7
# it aligns the data by label, then performs the operation
s6+s7

# NaN + number = NaN
# number + NaN = NaN

In [None]:
s8=pd.Series({'a':1,'b':2,'c':3,'d':5})
s8

In [None]:
s9=pd.Series({'b':6,'c':7,'d':9,'e':10})
s9

In [None]:
s8+s9

In [None]:
s10=pd.Series([1.0,2.0,3.0], index=['a','a','b'])
s10

In [None]:
s11=pd.Series([4.0,5.0,6.0],index=['a','a','c'])
s11

In [None]:
s10+s11

In [None]:
s101=pd.Series([11,22],index=['a','b'])
s101

In [None]:
s111=pd.Series([101,202,303],index=['a','b','c'])
s111

In [None]:
s101+s111

# The special case of NaN

In [None]:
# mean of numpy array values
nda=np.array([1,2,3,4,5])
nda.mean()

In [None]:
# mean of numpy array values with a NaN
# (np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0)
nda=np.array([1,2,3,4,np.nan])
# NumPy propagates NaN, so the mean of the array is NaN
nda.mean()

In [None]:
# ignores NaN values
s=pd.Series(nda)
print(s)
s.mean()

In [None]:
#handle Nan values like Numpy
s.mean(skipna=False)

# Boolean Selection

In [None]:
# boolean mask: which rows have values that are > 5?
s = pd.Series(np.arange(10))
print(s.gt(5))
# index with the mask to keep only the rows where it is True
print(s[s.gt(5)])

In [None]:
#select rows where values are >5
logicalResults=s>5
s[logicalResults]

In [None]:
# left commented out because it throws an exception: Python's `and` needs a single
# True/False for each operand, and a Series does not provide one
# s[s>5 and s<8]
# correct syntax: element-wise & with each comparison in parentheses
s[(s>5)&(s<8)]

In [None]:
pd.Series([True,False,False,True,True]).all(),pd.Series([True,False,False,True,True]).any()

In [None]:
(np.array([1,0,1,1])).sum()

In [None]:
#are all items>=0?
(s>=0).all()

In [None]:
s<2

In [None]:
# any item <2?
# test the boolean mask itself: s[s<2].any() would check the truthiness of the
# selected values instead, and e.g. a lone 0 would wrongly give False
(s<2).any()

In [None]:
# how many less than 3
(s <3).sum()

In [None]:
s[(s<3)].count()

# Indexing  - .index replaces the existing index

In [None]:
# sample series of five items
# seed NumPy's global generator first so the random values (here and in the
# later np.random.randn cells) are the same on every Restart & Run All
np.random.seed(42)
s=pd.Series(np.random.randn(5))
s

In [None]:
s.index=['a','b','c','d','e']
s

In [None]:
s1=pd.Series(np.random.randn(3))
s2=pd.Series(np.random.randn(3))
combine=pd.concat([s1,s2])
combine

In [None]:
#reset/reindexing the duplication of the combine series
combine.index=np.arange(0,len(combine))
combine

# .reindex() method does not replace the existing index

In [None]:
s1=pd.Series(np.random.randn(4),['a','b','c','d'])
print(s1)
s2=s1.reindex(['d','g','c'])
print(s2)

In [None]:
combine.reindex([9,5,3,4,0,1,2,6]) # not in place

In [None]:
combine # original without reindexing above

In [None]:
# same label values but different label types (int vs str)
# cause big trouble: nothing aligns, so every result is NaN
s1 = pd.Series(data=[0, 1, 2], index=[0, 1, 2])
s2 = pd.Series(data=[3, 4, 5], index=['0', '1', '2'])
s1 + s2

In [None]:
# replace the string labels of s2 with their integer values so they match the labels
# of s1; the addition then aligns label by label and gives the desired result
s2.index=s2.index.values.astype(int)
s1+s2

In [None]:
# take a copy of s (labels 'a'..'e') to use in the reindex examples that follow
s2=s.copy()
s2

In [None]:
print(s2)
s2_reindexed=s2.reindex(['a','f'])
s2_reindexed

In [None]:
# reindex again, but fill the label that has no value ('f') with fill_value
# (here 2.0, the mean of [1, 2, 3]) instead of NaN
s2_reindexed=s2.reindex(['a','f'],fill_value=np.mean([1,2,3]))
s2_reindexed

# ffill, bfill & nearest

In [None]:
# create example to demonstrate fills
s3=pd.Series(['red','green','blue'],index=[0,8,10])
s3

In [None]:
# forward fill
s3.reindex(np.arange(0,15),method='ffill')

In [None]:
#backward fill
s3.reindex(np.arange(0,15),method='bfill')

In [None]:
# nearest fill
s3.reindex(np.arange(0,10),method='nearest')

# Slicing a series

In [None]:
s=pd.Series(np.arange(100,110),index=np.arange(10,20))
s

In [None]:
# slicing with [] and integers is positional here, even though the labels are 10..19
print(s[0:6:2])  # start, stop (exclusive), step -> positions 0, 2, 4
print(s.iloc[[0,2,4]]) # same rows, selected explicitly by position

In [None]:
print(s[:5]) # top first
print(s.head()) # similar to above


# Missing Data in Series

NaN indicates that a value is missing.

In [110]:
tax_data={"Sindh":3500,"Punjab":4500,"KPK":2000,"Balochistan":1000}
tax_revenue=pd.Series(tax_data,name="Tax Revenue")
print(tax_revenue)

Sindh          3500
Punjab         4500
KPK            2000
Balochistan    1000
Name: Tax Revenue, dtype: int64


In [108]:
 # Rearranging the index: index= picks the dict entries by label, in the order listed
tax_revenue = pd.Series(tax_data,index=['Punjab','Sindh','Balochistan','KPK'])
print(tax_revenue)

Punjab         4500
Sindh          3500
Balochistan    1000
KPK            2000
dtype: int64


In [143]:
#adding an index without value which results in NaN
tax_revenue = pd.Series(tax_data,index=['Punjab','Sindh','Balochistan','KPK','GB'],name="Revenue by State")
print(tax_revenue)

Punjab         4500.0
Sindh          3500.0
Balochistan    1000.0
KPK            2000.0
GB                NaN
Name: Revenue by State, dtype: float64


In [144]:
# finding null and non-null values: isnull() flags the missing entries, notnull() is its inverse
print(pd.isnull(tax_revenue))  # function form
print(tax_revenue.isnull())  # method form, same result
print(tax_revenue.notnull())  # True where a value is present

Punjab         False
Sindh          False
Balochistan    False
KPK            False
GB              True
Name: Revenue by State, dtype: bool
Punjab         False
Sindh          False
Balochistan    False
KPK            False
GB              True
Name: Revenue by State, dtype: bool
Punjab          True
Sindh           True
Balochistan     True
KPK             True
GB             False
Name: Revenue by State, dtype: bool


In [145]:
tax_revenue.name="Pak tax rev"
tax_revenue.index.name="Provience"
tax_revenue

Provience
Punjab         4500.0
Sindh          3500.0
Balochistan    1000.0
KPK            2000.0
GB                NaN
Name: Pak tax rev, dtype: float64