In [30]:
import numpy as np
import pandas as pd

In [31]:
from pandas import Series, DataFrame

In [32]:

#The simplest Series is formed from only an array of data; pandas then generates
#a default integer index (0..N-1) for it.
obj=pd.Series(data=[4, 7, -5, 3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [33]:
obj.values

array([ 4,  7, -5,  3], dtype=int64)

In [34]:
obj.index #range

RangeIndex(start=0, stop=4, step=1)

In [35]:
#Often it will be desirable to create a Series whose index identifies each
#data point with a label instead of a position:
obj2=pd.Series(data=[4, 7, -5, 3], index=list('abcd'))
obj2

a    4
b    7
c   -5
d    3
dtype: int64

In [36]:
obj2.index #the explicit labels 'a'..'d' given at construction (not a RangeIndex like obj.index)

Index(['a', 'b', 'c', 'd'], dtype='object')

In [37]:
#Compared with NumPy arrays, you can use labels in the index when selecting single
#values or a set of values:
obj2['d']

3

In [38]:
obj2['c']=6 #assign a new value to the element labelled 'c' (the index itself is unchanged)
obj2

a    4
b    7
c    6
d    3
dtype: int64

In [39]:
obj2[obj2>0]

a    4
b    7
c    6
d    3
dtype: int64

In [40]:
obj2*2

a     8
b    14
c    12
d     6
dtype: int64

In [41]:
#NumPy element-wise functions accept a Series directly; the result is again a
#Series that keeps obj2's index labels. (numpy is imported in the first cell.)
np.exp(obj2)

a      54.598150
b    1096.633158
c     403.428793
d      20.085537
dtype: float64

In [42]:
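#Another way to think about a Series is as a fixed-length, ordered dict, as it is a
#mapping of index values to data values. It can therefore be used in many contexts
#where you might use a dict: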


In [43]:
#Dict-style membership tests on a Series:
print(obj2)
print('b' in obj2) #True: 'b' is one of obj2's index labels
print('e' in obj2) #False: obj2 has no label 'e'

a    4
b    7
c    6
d    3
dtype: int64
True
False


In [44]:
#A Python dict can be handed to the Series constructor directly: its keys become
#the index labels and its values become the data.
sdata=dict(Ohio=35000, Texas=71000, Oregon=16000, Utah=5000)
obj3=pd.Series(sdata)
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [45]:
#Passing an explicit index together with a dict selects and orders the dict
#entries by those labels.
states=['California', 'Ohio', 'Oregon', 'Texas',]
print(states)
print(sdata)
#'Utah' is not in states, so it is excluded from obj4; 'California' has no entry
#in sdata, so its value is NaN (the result dtype is float64).
obj4=pd.Series(sdata,index=states)
obj4

['California', 'Ohio', 'Oregon', 'Texas']
{'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}


California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [46]:
#isnull and notnull functions in pandas should be used to detect missing data:
pd.isnull(obj4) #not a number NAN

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [47]:
pd.notnull(obj4) 

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [48]:
#Series also has these as instance methods:
obj4.isnull()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [49]:
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [50]:
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [51]:
#Arithmetic aligns the operands on their index labels. A label that is missing or
#NaN on either side ('Utah' is absent from obj4; 'California' is NaN in obj4 and
#absent from obj3) produces NaN in the result.
obj3+obj4

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [52]:
obj4.name='population'
obj4.index.name='state'
obj4

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

In [53]:
#A Series’s index can be altered in-place by assignment:
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [54]:
obj.index=['Bob','Steve','Jeff','Ryen']
obj

Bob      4
Steve    7
Jeff    -5
Ryen     3
dtype: int64

In [55]:
#reindex is a method that returns a new Series conformed to the requested labels;
#it cannot be chained onto an index assignment (that form is a SyntaxError).
#'Sadaf' is not an existing label, so its value in the result is NaN, and 'Steve'
#is simply left out. obj itself is not modified.
obj.reindex(['Bob','Sadaf','Jeff','Ryen'])

In [56]:
#There are many ways to construct a DataFrame; one of the most common is a dict
#of equal-length lists (or NumPy arrays), where each key becomes a column:
data=dict(
    state=['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    year=[2000, 2001, 2002, 2001, 2002, 2003],
    pop=[1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
)
frame=pd.DataFrame(data)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [57]:
#For large DataFrames, the head method selects only the first five rows:
frame.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [58]:
#If you specify a sequence of columns, the DataFrame’s columns will be
#arranged in that order:
pd.DataFrame(data,columns=['year','state','pop'])


Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [59]:
#If you pass a column that isn’t contained in the dict,
#it will appear with missing values in the result:
frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
  index=['one', 'two', 'three', 'four','five', 'six'])
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [60]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [61]:
#A column in a DataFrame can be retrieved as a Series either by dict-like 
#notation or by attribute:
frame2['state']

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

In [62]:
frame2.year

one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64

In [63]:
#GroupBy Mechanics
#Example frame: two key columns plus two columns of standard-normal random data.
#A seeded generator makes the random columns identical on every run; the values
#differ from any output captured from an earlier, unseeded run.
rng=np.random.default_rng(42)
df=pd.DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],
 'key2' : ['one', 'two', 'one', 'two', 'one'],
 'data1' : rng.standard_normal(5),
 'data2' : rng.standard_normal(5)})
df


Unnamed: 0,key1,key2,data1,data2
0,a,one,-0.765831,-0.200131
1,a,two,-0.165747,-0.12112
2,b,one,-0.381753,2.685682
3,b,two,0.601063,-0.442142
4,a,one,0.072132,0.679885


In [64]:
grouped=df['data1'].groupby(df['key1'])
grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x000002987F7877F0>

In [65]:
#means method
grouped.mean()

key1
a   -0.286482
b    0.109655
Name: data1, dtype: float64

In [66]:
#Grouping by two key Series gives a result with a hierarchical (key1, key2) index.
#The aggregation is spelled mean() (an earlier attempt in this cell used means()).
means = df['data1'].groupby([df['key1'], df['key2']]).mean()
means

key1  key2
a     one    -0.346850
      two    -0.165747
b     one    -0.381753
      two     0.601063
Name: data1, dtype: float64

In [67]:
means.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.34685,-0.165747
b,-0.381753,0.601063


In [68]:
years=np.array([2005, 2005, 2006, 2005, 2006])
years

array([2005, 2005, 2006, 2005, 2006])

In [69]:
states=np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])

In [70]:
states

array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'], dtype='<U10')

In [71]:
df['data1'].groupby([states, years]).mean()

California  2005   -0.165747
            2006   -0.381753
Ohio        2005   -0.082384
            2006    0.072132
Name: data1, dtype: float64

In [72]:
#Restrict the aggregation to numeric columns: 'key2' holds strings, and recent
#pandas versions raise a TypeError for it instead of silently dropping the column.
df.groupby('key1').mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.286482,0.119544
b,0.109655,1.12177


In [73]:
#ffill method: reindex onto the labels 0..7 and forward-fill, i.e. every label
#that is new takes the value of the closest preceding original label.
#The result is bound to a name so it can be reused; a bare `obj5` in the middle of
#the cell would be evaluated and discarded, since only the last expression is shown.
obj5=pd.Series(['blue', 'purple', 'yellow','green'], index=[0,2,4,5])
obj5_filled=obj5.reindex(range(8), method='ffill')
obj5_filled


0      blue
1      blue
2    purple
3    purple
4    yellow
5     green
6     green
7     green
dtype: object

In [74]:
#3x3 example frame with row labels a/c/d ('b' is deliberately absent).
#The pasted IPython continuation prompts (" ....: ") were removed: they are not
#valid Python and the cell only ran where the front end strips them.
#NOTE: this rebinds frame, which earlier referred to the state/year/pop frame.
frame=pd.DataFrame(np.arange(9).reshape((3, 3)),
                   index=['a', 'c', 'd'],
                   columns=['Ohio', 'Texas', 'California'])
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [75]:
#With DataFrame, reindex can alter either the (row) index, columns, or both. When
#passed only a sequence, it reindexes the rows in the result. The label 'b' has no
#source row in frame, so its row is filled with NaN.
#NOTE: this rebinds frame2, which earlier referred to the year/state/pop/debt frame.
frame2=frame.reindex(['a','b','c','d'])
frame2


Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [76]:
#The columns can be reindexed with the columns keyword:
states = ['Texas', 'Utah', 'California']
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [77]:
#Dropping Entries from an Axis
#Example Series: the floats 0.0 to 4.0 labelled 'a' through 'e'.
obj6=pd.Series(np.arange(5, dtype=float),index=list('abcde'))
obj6

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [78]:
#drop returns a new Series without the given label; obj6 itself is left unchanged.
#The label 'c' exists in obj6 (labels 'a'..'e'); obj is labelled 'Bob'..'Ryen' at
#this point, so dropping 'c' from obj raises a KeyError.
new_obj=obj6.drop('c')
new_obj

In [79]:
#Passing a list drops several labels at once. The labels 'c' and 'd' belong to
#obj6; obj (labelled 'Bob'..'Ryen') has neither, which raises a KeyError.
obj6.drop(['c','d'])

In [80]:
#With DataFrame, index values can be deleted from either axis. To illustrate this,
#first build a 4x4 example frame holding the integers 0..15.
#NOTE: this rebinds data, which earlier referred to the dict of column lists.
data = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [81]:
data.drop(['Colorado', 'Ohio'])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [82]:
#You can drop values from the columns by passing axis=1 or axis='columns':
data.drop(['two','three'], axis='columns')


Unnamed: 0,one,four
Ohio,0,3
Colorado,4,7
Utah,8,11
New York,12,15


In [83]:
#Many functions, like drop, which modify the size or shape of a Series or DataFrame,
#can manipulate an object in-place without returning a new object.
#Be careful with inplace, as it destroys any data that is dropped. The demonstration
#therefore runs on a copy, so obj6 itself stays intact.
obj6_dropped=obj6.copy()
obj6_dropped.drop('c',inplace=True)
obj6_dropped

# Indexing, Selection, and Filtering

In [84]:
#Series indexing (obj[...]) works analogously to NumPy array indexing, except that
#the Series's index values can be used instead of only integers.
#Example Series: the floats 0.0 to 3.0 labelled 'a' through 'd'.
obj7=pd.Series(np.arange(4, dtype=float), index=list('abcd'))
obj7

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [85]:
obj7['b']

1.0

In [86]:
#Select by position explicitly with iloc. Plain obj7[1] on a string-labelled Series
#relies on a positional fallback of [] that newer pandas versions deprecate.
obj7.iloc[1]

1.0

In [87]:
obj7[2:4]

c    2.0
d    3.0
dtype: float64

In [88]:
obj7[['b','a','d']]

b    1.0
a    0.0
d    3.0
dtype: float64

In [89]:
#Several positions at once, again spelled with iloc because integer keys passed to
#[] on a string-labelled Series use a deprecated positional fallback.
obj7.iloc[[1,3]]

b    1.0
d    3.0
dtype: float64

In [90]:
#Boolean filtering: keep only the entries of obj7 whose value is below 2.
#Wrapping a mask in a list and assigning it to obj7 would instead replace the
#Series with a plain Python list, so the filter is applied by indexing.
obj7[obj7<2]

In [91]:
#Slicing with labels behaves differently than normal Python slicing in that the
#endpoint is inclusive: 'b':'c' returns both 'b' and 'c'.
#obj6 is used because its index holds the labels 'a'..'e'; obj is labelled
#'Bob'..'Ryen', so the same slice on obj raises a KeyError.
obj6['b':'c']

In [92]:
#Setting using these methods modifies the corresponding section of the Series:
#here the elements labelled 'b' and 'c' (endpoint included) are overwritten with 5.
#obj6 has those labels; obj ('Bob'..'Ryen') does not, which raises a KeyError.
obj6['b':'c']=5
obj6

In [93]:
#Indexing into a DataFrame is for retrieving one or more columns either with a single
#value or sequence:
data['two']

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int32

In [94]:
data[['three','one']]

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
New York,14,12


In [95]:
#Indexing like this has a few special cases. 
#First, slicing or selecting data with a boolean array:
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [96]:
data[data['three']<5]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3


In [97]:
data[data['three']>5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [98]:
#Another use case is in indexing with a boolean DataFrame, such as one produced by a
#scalar comparison:
data<5

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


In [99]:
#Assigning through a boolean DataFrame mask overwrites every cell that is below 5.
#This makes DataFrame syntactically more like a two-dimensional NumPy array in
#this particular case.
#NOTE: this mutates data in place, so all later cells see the zeroed values.
data[data<5]=0
data


Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


# Selection with loc and iloc

In [100]:
#As a preliminary example, let’s select a single row and multiple columns by label:
data.loc['Colorado', ['two', 'three']]

two      5
three    6
Name: Colorado, dtype: int32

In [101]:
data.iloc[2,[3,0,1]]

four    11
one      8
two      9
Name: Utah, dtype: int32

In [102]:
data.iloc[2]

one       8
two       9
three    10
four     11
Name: Utah, dtype: int32

In [103]:
#We’ll then perform some similar selections with integers using iloc:
data.iloc[2,[3,0,1]]

four    11
one      8
two      9
Name: Utah, dtype: int32

In [104]:
data.iloc[2]

one       8
two       9
three    10
four     11
Name: Utah, dtype: int32

In [105]:
data.iloc[[1,2],[3,0,1]]

Unnamed: 0,four,one,two
Colorado,7,0,5
Utah,11,8,9


In [108]:
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [109]:
data.loc[:'Utah', 'two']

Ohio        0
Colorado    5
Utah        9
Name: two, dtype: int32

In [110]:
#Rows whose 'three' value exceeds 5, restricted to the first three columns,
#expressed as a single loc selection (row mask, column labels).
data.loc[data['three']>5, data.columns[:3]]

Unnamed: 0,one,two,three
Colorado,0,5,6
Utah,8,9,10
New York,12,13,14


In [120]:
#Float Series 0.0, 1.0, 2.0 with the default integer index.
ser = pd.Series(np.arange(3, dtype=float))
ser

0    0.0
1    1.0
2    2.0
dtype: float64

In [124]:
ser2 = pd.Series(np.arange(3.), index=['a', 'b', 'c'])
#The index holds only strings, so -1 can only mean "last position". Say so
#explicitly with iloc instead of relying on the deprecated positional fallback of [].
ser2.iloc[-1]

2.0

In [127]:
#To keep things consistent, if you have an axis index containing integers, data selection
#will always be label-oriented. For more precise handling, use loc (for labels) or iloc
#(for integers):
#Only the value of the last expression in a cell is displayed, so the result of
#ser[:1] is computed and discarded; the output shown belongs to ser.loc[:1].
ser[:1]
#Label-based slice: the end label 1 is included, giving the rows labelled 0 and 1.
ser.loc[:1]

0    0.0
1    1.0
dtype: float64

In [128]:
ser.iloc[:1]

0    0.0
dtype: float64

# Data Cleaning and Preparation