In [4]:
import pandas as pd

In [5]:
from pandas import Series, DataFrame

In [6]:

#The simplest Series is formed from only an array of data:
obj=pd.Series([4,7,-5,3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [7]:
obj.values

array([ 4,  7, -5,  3], dtype=int64)

In [8]:
obj.index #range

RangeIndex(start=0, stop=4, step=1)

In [9]:
#Often it will be desirable to create a Series with an
#index identifying each data point with a label:
obj2=pd.Series([4, 7, -5, 3], index=['a','b','c','d'])
obj2

a    4
b    7
c   -5
d    3
dtype: int64

In [10]:
obj2.index# like range(4)

Index(['a', 'b', 'c', 'd'], dtype='object')

In [11]:
#Compared with NumPy arrays, you can use labels in the index when selecting single
#values or a set of values:
obj2['d']

3

In [12]:
obj2['c']=6 #change value of index
obj2

a    4
b    7
c    6
d    3
dtype: int64

In [13]:
obj2[obj2>0]

a    4
b    7
c    6
d    3
dtype: int64

In [14]:
obj2*2

a     8
b    14
c    12
d     6
dtype: int64

In [15]:
import numpy as np
np.exp(obj2)

a      54.598150
b    1096.633158
c     403.428793
d      20.085537
dtype: float64

In [16]:
#a fixed-length, ordered dict, as it is a mapping of 
#index values to data values


In [17]:
print(obj2)
print('b' in obj2) #find b in obj2
print('e' in obj2) #find e in obj2

a    4
b    7
c    6
d    3
dtype: int64
True
False


In [18]:
#Should you have data contained in a Python dict, 
#you can create a Series from it by passing the dict:
sdata={'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3=pd.Series(sdata)
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [19]:
states=['California', 'Ohio', 'Oregon', 'Texas',]
print(states)
print(sdata) #Utah is excluded in the result
obj4=pd.Series(sdata,index=states)
obj4

['California', 'Ohio', 'Oregon', 'Texas']
{'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}


California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [20]:
#isnull and notnull functions in pandas should be used to detect missing data:
pd.isnull(obj4) #not a number NAN

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [21]:
pd.notnull(obj4) 

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [22]:
#Series also has these as instance methods:
obj4.isnull()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [23]:
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [24]:
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [25]:
obj3+obj4

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [26]:
obj4.name='population'
obj4.index.name='state'
obj4

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

In [27]:
#A Series’s index can be altered in-place by assignment:
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [28]:
obj.index=['Bob','Steve','Jeff','Ryen']
obj

Bob      4
Steve    7
Jeff    -5
Ryen     3
dtype: int64

In [29]:
obj.index=['Bob','Steve','Jeff','Ryen'],reindex=['Bob','Sadaf','Jeff','Ryen']

SyntaxError: cannot assign to literal (1321983514.py, line 1)

In [30]:
#There are many ways to construct a DataFrame, though one of the most common is
#from a dict of equal-length lists or NumPy arrays:

data={'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
 'year': [2000, 2001, 2002, 2001, 2002, 2003],
 'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame=pd.DataFrame(data)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [31]:
#For large DataFrames, the head method selects only the first five rows:
frame.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [32]:
#If you specify a sequence of columns, the DataFrame’s columns will be
#arranged in that order:
pd.DataFrame(data,columns=['year','state','pop'])


Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [33]:
#If you pass a column that isn’t contained in the dict,
#it will appear with missing values in the result:
frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
  index=['one', 'two', 'three', 'four','five', 'six'])
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [34]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [35]:
#A column in a DataFrame can be retrieved as a Series either by dict-like 
#notation or by attribute:
frame2['state']

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

In [36]:
frame2.year

one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64

In [37]:
#GroupBy Mechanics
df=pd.DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],
 'key2' : ['one', 'two', 'one', 'two', 'one'],
 'data1' : np.random.randn(5),
 'data2' : np.random.randn(5)})
df


Unnamed: 0,key1,key2,data1,data2
0,a,one,-0.588923,-1.378879
1,a,two,-0.671226,0.734793
2,b,one,0.768743,0.76949
3,b,two,-0.012988,0.948205
4,a,one,-1.553825,1.516841


In [38]:
grouped=df['data1'].groupby(df['key1'])
grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x0000020161EEE190>

In [39]:
#means method
grouped.mean()

key1
a   -0.937992
b    0.377878
Name: data1, dtype: float64

In [40]:
#means=df['data1'].groupby([df['key1'],df['key2']]).means()
means = df['data1'].groupby([df['key1'], df['key2']]).mean()
means

key1  key2
a     one    -1.071374
      two    -0.671226
b     one     0.768743
      two    -0.012988
Name: data1, dtype: float64

In [41]:
means.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-1.071374,-0.671226
b,0.768743,-0.012988


In [42]:
years=np.array([2005, 2005, 2006, 2005, 2006])
years

array([2005, 2005, 2006, 2005, 2006])

In [43]:
states=np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])

In [44]:
states

array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'], dtype='<U10')

In [45]:
df['data1'].groupby([states, years]).mean()

California  2005   -0.671226
            2006    0.768743
Ohio        2005   -0.300956
            2006   -1.553825
Name: data1, dtype: float64

In [46]:
df.groupby('key1').mean()

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.937992,0.290919
b,0.377878,0.858848


In [47]:
#ffill method
obj5=pd.Series(['blue', 'purple', 'yellow','green'], index=[0,2,4,5])
obj5
obj5.reindex(range(8), method='ffill')


0      blue
1      blue
2    purple
3    purple
4    yellow
5     green
6     green
7     green
dtype: object

In [48]:
frame=pd.DataFrame(np.arange(9).reshape((3, 3)),
 ....: index=['a', 'c', 'd'],
 ....: columns=['Ohio', 'Texas', 'California'])
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [49]:
frame2=frame.reindex(['a','b','c','d'])
frame2
#With DataFrame, reindex can alter either the (row) index, columns, or both. When
#passed only a sequence, it reindexes the rows in the result:


Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [50]:
#The columns can be reindexed with the columns keyword:
states = ['Texas', 'Utah', 'California']
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [67]:
#Dropping Entries from an Axis
obj6=pd.Series(np.arange(5.),index=['a', 'b', 'c', 'd', 'e'])
obj6

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [69]:
new_obj=obj.drop('c')
new_obj

a    0
b    1
d    3
e    4
dtype: int32

In [71]:
obj.drop(['c','d'])

a    0
b    1
e    4
dtype: int32

In [74]:
#With DataFrame, index values can be deleted from either axis. To illustrate this, we
#first create an example DataFrame:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [76]:
data.drop(['Colorado', 'Ohio'])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [85]:
#You can drop values from the columns by passing axis=1 or axis='columns':
data.drop(['two','three'], axis='columns')


Unnamed: 0,one,four
Ohio,0,3
Colorado,4,7
Utah,8,11
New York,12,15


In [98]:
#Many functions, like drop, which modify the size or shape of a Series or DataFrame,
#can manipulate an object in-place without returning a new object:
"""obj6.drop('c',inplace=True)
obj6"""
#Be careful with the inplace, as it destroys any data that is dropped.

"obj6.drop('c',inplace=True)\nobj6"

# Indexing, Selection, and Filtering

In [104]:
#Series indexing (obj[...]) works analogously to NumPy array indexing,
#except you can use the Series’s index values instead of only integers. 
obj7=pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
obj7

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [108]:
obj7['b']

1.0

In [107]:
obj7[1]

1.0

In [109]:
obj7[2:4]

c    2.0
d    3.0
dtype: float64

In [110]:
obj7[['b','a','d']]

b    1.0
a    0.0
d    3.0
dtype: float64

In [112]:
obj7[[1,3]]

b    1.0
d    3.0
dtype: float64

In [116]:
obj7=[obj<2]
obj7

[a     True
 b     True
 c    False
 d    False
 e    False
 dtype: bool]

In [124]:
#Slicing with labels behaves differently than normal Python slicing in that the end‐
#point is inclusive:
obj
obj['b':'c']

b    1
c    2
dtype: int32

In [127]:
#Setting using these methods modifies the corresponding section of the Series:
obj['b':'c']=5
obj

a    0
b    5
c    5
d    3
e    4
dtype: int32

In [130]:
#Indexing into a DataFrame is for retrieving one or more columns either with a single
#value or sequence:
data['two']

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int32

In [131]:
data[['three','one']]

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
New York,14,12


In [134]:
#Indexing like this has a few special cases. 
#First, slicing or selecting data with a boolean array:
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [136]:
data[data['three']<5]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3


In [137]:
data[data['three']>5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [139]:
#Another use case is in indexing with a boolean DataFrame, such as one produced by a
#scalar comparison:
data<5

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


In [141]:
data[data<5]=0
data
#This makes DataFrame syntactically more like a two-dimensional NumPy array in
#this particular case.


Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


# Selection with loc and iloc

In [145]:
#As a preliminary example, let’s select a single row and multiple columns by label:
data.loc['Colorado', ['two', 'three']]

two      5
three    6
Name: Colorado, dtype: int32

In [146]:
data.iloc[2,[3,0,1]]

four    11
one      8
two      9
Name: Utah, dtype: int32

# Data Cleaning and Preparation