In [3]:
import pandas
pandas.__version__

'0.24.1'

In [4]:
import pandas as pd

In [5]:
#for built in pandas documentation
pd?

In [6]:
#In pandas series is 1D array of indexed data.
data = pd.Series([0.25,0.5,0.75,1.0])
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

as we can see above that we got both indices and values. you can access indices by data.index and values by data.values

In [7]:
data.values

array([0.25, 0.5 , 0.75, 1.  ])

In [8]:
#says the start, stop and step on index
data.index

RangeIndex(start=0, stop=4, step=1)

In [9]:
data[1]

0.5

In [10]:
data[1:3]

1    0.50
2    0.75
dtype: float64

you can also explicitly define index to you values in pandas Series.

In [11]:
data = pd.Series([0.25,0.5,0.75,1.0], index = ['a','b','c','d'])

In [12]:
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [14]:
#when index is le5,3,7tters, the use quotes while accesssing values.
data['b']

0.5

In [15]:
data = pd.Series([0.25,0.5,0.75,0.1], index = [2,5,3,7])

In [16]:
data

2    0.25
5    0.50
3    0.75
7    0.10
dtype: float64

In [18]:
#as index is number, so no quotes while accessing values.
data[5]

0.5

#### Series as dictionary

Dictionaries have both key and values. values are accessed by keys. dicts are represented with {}.


In [19]:
population_dict = {'california':9676240,
                   'Texas':26448193,
                   'New York':19651127,
                   'Florida':1955201}

In [20]:
#general way of accessing values with keys
population_dict['california']

9676240

In [21]:
#convert dict to series.
population = pd.Series(population_dict)
population

california     9676240
Texas         26448193
New York      19651127
Florida        1955201
dtype: int64

In [22]:
population['california']

9676240

Unlike a dictionary, though, the Series also supports array-style operations such as
slicing:

In [23]:
population['California':'Illinois']

Series([], dtype: int64)

#### pandas as Dataframe

In [24]:
area_dict = {'california':423967,
             'Texas':695662,
             'New york':141297,
             'Florida':170312,
             'Illinois':1499995
            }

In [25]:
area = pd.Series(area_dict)
area

california     423967
Texas          695662
New york       141297
Florida        170312
Illinois      1499995
dtype: int64

In [28]:
states = pd.DataFrame({'population':population,'area': area})
states

Unnamed: 0,population,area
Florida,1955201.0,170312.0
Illinois,,1499995.0
New York,19651127.0,
New york,,141297.0
Texas,26448193.0,695662.0
california,9676240.0,423967.0


In [29]:
states.index

Index(['Florida', 'Illinois', 'New York', 'New york', 'Texas', 'california'], dtype='object')

In [30]:
states.columns

Index(['population', 'area'], dtype='object')

In [31]:
states.values

array([[ 1955201.,   170312.],
       [      nan,  1499995.],
       [19651127.,       nan],
       [      nan,   141297.],
       [26448193.,   695662.],
       [ 9676240.,   423967.]])

In [33]:
states['area']

Florida        170312.0
Illinois      1499995.0
New York            NaN
New york       141297.0
Texas          695662.0
california     423967.0
Name: area, dtype: float64

#### Constructing Data frame from a 2D Numpy array

In [42]:
import numpy as np
pd.DataFrame(np.random.rand(3,2),
            columns = ['Foo','bar'],
            index = ['a','b','c'])

Unnamed: 0,Foo,bar
a,0.48141,0.012627
b,0.039032,0.284935
c,0.608256,0.667302


In [45]:
pd.DataFrame(np.random.randint(2,9,(3,2)),
            columns = ['Foo','bar'],
            index = ['a','b','c'])
#(2,9)is range and (3,2)is shape of matrix to be formed.

Unnamed: 0,Foo,bar
a,6,6
b,8,8
c,2,2


### Indexers: loc, iloc

In [49]:
data1 = pd.Series(['a','b','c'], index = [1,3,5])
data1

1    a
3    b
5    c
dtype: object

In [50]:
data1[1]

'a'

In [51]:
data1[1:3]

3    b
5    c
dtype: object

First, the loc attribute allows indexing and slicing that always references the explicit
index

In [53]:
data1.loc[1]

'a'

In [54]:
data1.loc[1:3]

1    a
3    b
dtype: object

In [56]:
data1.iloc[1]

'b'

In [58]:
data1.iloc[1:3]

3    b
5    c
dtype: object

In [59]:
data

2    0.25
5    0.50
3    0.75
7    0.10
dtype: float64

In [60]:
data.T

2    0.25
5    0.50
3    0.75
7    0.10
dtype: float64

In [61]:
data1

1    a
3    b
5    c
dtype: object

In [62]:
data1.T

1    a
3    b
5    c
dtype: object

we can only transposes DataFrames not Series

In [63]:
states

Unnamed: 0,population,area
Florida,1955201.0,170312.0
Illinois,,1499995.0
New York,19651127.0,
New york,,141297.0
Texas,26448193.0,695662.0
california,9676240.0,423967.0


In [64]:
states.T

Unnamed: 0,Florida,Illinois,New York,New york,Texas,california
population,1955201.0,,19651127.0,,26448193.0,9676240.0
area,170312.0,1499995.0,,141297.0,695662.0,423967.0


## operating on Null values

isnull():
Generate a Boolean mask indicating missing values

notnull():
Opposite of isnull()

dropna():
Return a filtered version of the data

fillna():
Return a copy of the data with missing values filled or imputed

In [66]:
data = pd.Series([1,np.nan,'hello',None])
data


0        1
1      NaN
2    hello
3     None
dtype: object

In [67]:
data.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [68]:
data.isna()

0    False
1     True
2    False
3     True
dtype: bool

In [69]:
#notnull(): Opposite of isnull()
data[data.notnull()]

0        1
2    hello
dtype: object

#### dropping null values

In [70]:
data.dropna()

0        1
2    hello
dtype: object

In [72]:
data = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))
data

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64

In [73]:
data.fillna(0)

a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64

In [74]:
#forwardfill
data.fillna(method = 'ffill')

a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64

In [75]:
data.fillna(method = 'bfill')

a    1.0
b    2.0
c    2.0
d    3.0
e    3.0
dtype: float64