# What is Pandas?

Pandas is a software library written for the Python programming language for data manipulation and analysis. 

# Pandas Data Objects

* Series
* DataFrame

## Series
A pandas Series is a one-dimensional array of indexed data. It can be created from a list or an array.

In [2]:
import pandas as pd

# build a Series from a plain Python list; pandas attaches a default 0..n-1 index
s = pd.Series(data=[5, 6, 7, 8])
s

0    5
1    6
2    7
3    8
dtype: int64

In [2]:
# the dtype argument forces the integer inputs to be stored as 64-bit floats
data = pd.Series(data=[1, 2, 3, 4, 5], dtype='float64')
data

0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
dtype: float64

In [4]:
# some attributes
# .values exposes the underlying data as a NumPy array
data.values

array([ 1.,  2.,  3.,  4.,  5.])

In [4]:
# .index holds the labels; here it is the default RangeIndex starting at 0
data.index

RangeIndex(start=0, stop=5, step=1)

### Series Indexing

In [5]:
# similar to arrays: with the default 0-based index, label 1 is the second element
data[1]

2.0

### Series Slicing

In [6]:
# similar to arrays: the slice 1:4 returns entries 1, 2 and 3 (the stop value is excluded)
data[1:4]

1    2.0
2    3.0
3    4.0
dtype: float64

In [7]:
# a Series may carry custom index labels (strings here), whereas a plain
# array is always addressed by integer position
d = pd.Series([1, 2, 3, 4, 5], index=list('abcde'))
d

a    1
b    2
c    3
d    4
e    5
dtype: int64

## Converting a Dictionary to a Series

In [8]:
# Build a Series from a dict: the keys become index labels, the values the data.
population_dict = {'California': 38332521,
                   'Texas': 26448193,
                   'New York': 19651127,
                   'Florida': 19552860,
                   'Illinois': 12882135}
# pandas >= 0.23 keeps the dict's insertion order, while older versions sorted
# the keys. Sorting explicitly gives an alphabetical index on every version,
# which label-based slices such as 'California':'Illinois' rely on.
population = pd.Series(population_dict).sort_index()
population

California    38332521
Florida       19552860
Illinois      12882135
New York      19651127
Texas         26448193
dtype: int64

In [9]:
# indexing: look up a single value by its index label
population['Florida']

19552860

In [10]:
# slicing by label: selects every entry from 'California' through 'Illinois',
# with the end label included
population['California':'Illinois']

California    38332521
Florida       19552860
Illinois      12882135
dtype: int64

## Series as a dictionary

In [4]:
# four float values keyed by the string labels 'a'..'d'
data = pd.Series(data=[0.25, 0.5, 0.75, 1.0], index=list('abcd'))

In [5]:
# display the Series built in the previous cell
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [6]:
# indexing: dictionary-style lookup by index label
data['c']

0.75

In [7]:
# searching for an index: `in` tests the index labels (like dict keys), not the values
'a' in data

True

In [8]:
# 'z' is not one of the index labels, so the membership test is False
'z' in data

False

In [9]:
# keys() returns the index labels, mirroring dict.keys()
data.keys()

Index(['a', 'b', 'c', 'd'], dtype='object')

In [10]:
# items() yields (label, value) pairs like dict.items(); list() materialises them for display
list(data.items())

[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]

In [11]:
# extension of Series: assigning to a label that does not exist yet adds a new entry, as with a dict
data['e'] = 1.23

In [12]:
# display the Series after the assignment to label 'e'
data

a    0.25
b    0.50
c    0.75
d    1.00
e    1.23
dtype: float64

In [13]:
# slicing by explicit index labels: the end label 'd' is included in the result
data['a':'d']

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [14]:
# slicing by implicit integer position: .iloc states explicitly that 1:4 are
# positions (stop excluded) rather than index labels
data.iloc[1:4]

b    0.50
c    0.75
d    1.00
dtype: float64

In [15]:
# masking: a boolean condition keeps only the entries whose value exceeds 0.5
print(data)
data.loc[data > 0.5]

a    0.25
b    0.50
c    0.75
d    1.00
e    1.23
dtype: float64


c    0.75
d    1.00
e    1.23
dtype: float64

In [16]:
# a Series of strings with an explicit integer index 0..4
n = pd.Series(
    ['banana', 'lemon', 'apple', 'pear', 'orange'],
    index=list(range(5)),
)
n

0    banana
1     lemon
2     apple
3      pear
4    orange
dtype: object

In [18]:
# .loc looks up by index label; label 0 holds 'banana'
n.loc[0]

'banana'

In [47]:
# label-based slice with .loc: both endpoints (2 and 3) are included
n.loc[2:3]

2    apple
3     pear
dtype: object

## Pandas DataFrame

DataFrame is a 2-dimensional labeled data structure with columns and rows. You can think of it as a spreadsheet or SQL table, or a dict of Series objects. A column in a DataFrame is a Series object.

In [11]:
# State areas keyed by state name; the keys become the index labels.
area_dict = {'California': 423967, 'Texas': 695662, 'New York': 141297,
             'Florida': 170312, 'Illinois': 149995}
# pandas >= 0.23 keeps the dict's insertion order, while older versions sorted
# the keys. Sorting explicitly gives an alphabetical index on every version.
area = pd.Series(area_dict).sort_index()
area

California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
dtype: int64

In [12]:
# Combine the two Series into one DataFrame; each dict key becomes a column and
# rows are aligned on the shared state labels.
# The keys are written in the intended column order and the rows are sorted by
# label, so the layout does not depend on whether the installed pandas sorts
# dict keys (old behaviour) or keeps insertion order (pandas >= 0.23).
states = pd.DataFrame({'area': area,
                       'population': population}).sort_index()
states

Unnamed: 0,area,population
California,423967,38332521
Florida,170312,19552860
Illinois,149995,12882135
New York,141297,19651127
Texas,695662,26448193


In [13]:
# row labels of the DataFrame
states.index

Index(['California', 'Florida', 'Illinois', 'New York', 'Texas'], dtype='object')

In [14]:
# column labels of the DataFrame
states.columns

Index(['area', 'population'], dtype='object')

In [15]:
# from a 2D NumPy array
import numpy as np

# Seed the global generator so every run produces the same 3x2 matrix.
# Output recorded from an earlier, unseeded run will show different numbers.
np.random.seed(0)
x = np.random.rand(3, 2)
print(x)

# wrap the array in a DataFrame, naming the two columns and the three rows
pd.DataFrame(x, columns=['Column 1', 'Column 2'],
             index=['a', 'b', 'c'])



[[ 0.35398256  0.03137288]
 [ 0.33969103  0.40547418]
 [ 0.81308996  0.23304147]]


Unnamed: 0,Column 1,Column 2
a,0.353983,0.031373
b,0.339691,0.405474
c,0.81309,0.233041


In [20]:
# first and last names keyed by the same integer ids 0..3
fname = dict(enumerate(['Joseph', 'William', 'Kwadwo', 'Kobina']))
lname = dict(enumerate(['Yeboah', 'Kyei', 'Owusu', 'Paintsil']))

# one Series per dict; the integer keys become the index labels
firstName = pd.Series(fname)
lastName = pd.Series(lname)

# the two Series become the columns of the DataFrame, aligned on their index
biodata = pd.DataFrame({
    'firstName': firstName,
    'lastName': lastName,
})
biodata

Unnamed: 0,firstName,lastName
0,Joseph,Yeboah
1,William,Kyei
2,Kwadwo,Owusu
3,Kobina,Paintsil


In [21]:
# a column can be read as an attribute; the result is a Series
biodata.firstName

0     Joseph
1    William
2     Kwadwo
3     Kobina
Name: firstName, dtype: object

In [22]:
# attribute access works for the lastName column in the same way
biodata.lastName

0      Yeboah
1        Kyei
2       Owusu
3    Paintsil
Name: lastName, dtype: object

In [23]:
# the same column selected with dictionary-style indexing
biodata['firstName']

0     Joseph
1    William
2     Kwadwo
3     Kobina
Name: firstName, dtype: object

In [24]:
# slicing a DataFrame: rows labelled 1 through 3 (end label included) of the
# firstName column, selected in a single .loc call
biodata.loc[1:3, 'firstName']

1    William
2     Kwadwo
3     Kobina
Name: firstName, dtype: object

In [55]:
# adding a new column: join first and last name element-wise with a single space
biodata['fullName'] = biodata['firstName'].str.cat(biodata['lastName'], sep=' ')
biodata

Unnamed: 0,firstName,lastName,fullName
0,Joseph,Yeboah,Joseph Yeboah
1,William,Kyei,William Kyei
2,Kwadwo,Owusu,Kwadwo Owusu
3,Kobina,Paintsil,Kobina Paintsil


In [61]:
# slicing rows by label with .loc: rows 1 through 3, end label included
biodata.loc[1:3]

Unnamed: 0,firstName,lastName,fullName
1,William,Kyei,William Kyei
2,Kwadwo,Owusu,Kwadwo Owusu
3,Kobina,Paintsil,Kobina Paintsil


In [64]:
# indexing: selecting one row by label returns it as a Series indexed by the column names
biodata.loc[0]

firstName           Joseph
lastName            Yeboah
fullName     Joseph Yeboah
Name: 0, dtype: object

In [25]:
# calculations on columns
# two Series keyed by state name: area and population
area = pd.Series({'California': 423967, 'Texas': 695662,
                  'New York': 141297, 'Florida': 170312,
                  'Illinois': 149995})
pop = pd.Series({'California': 38332521, 'Texas': 26448193,
                 'New York': 19651127, 'Florida': 19552860,
                 'Illinois': 12882135})
# Combine them into one DataFrame. The rows are sorted by label so the table is
# alphabetical on every pandas version: pandas >= 0.23 keeps dict insertion
# order, while older versions sorted the keys.
data = pd.DataFrame({'area': area, 'pop': pop}).sort_index()
data

Unnamed: 0,area,pop
California,423967,38332521
Florida,170312,19552860
Illinois,149995,12882135
New York,141297,19651127
Texas,695662,26448193


In [26]:
# adding another column called density: population divided by area, row by row
data['density'] = data['pop'].div(data['area'])
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763
New York,141297,19651127,139.076746
Texas,695662,26448193,38.01874


In [27]:
# masking: keep only the rows whose density exceeds 100
data.loc[data['density'] > 100]

Unnamed: 0,area,pop,density
Florida,170312,19552860,114.806121
New York,141297,19651127,139.076746


In [29]:
# masking returns a new, filtered DataFrame; `data` itself still holds all five rows
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763
New York,141297,19651127,139.076746
Texas,695662,26448193,38.01874
