# Powerful data structures for data analysis, time series, and statistics

In [1]:
import numpy as np

In [2]:
import pandas as pd

In [6]:
# Sample inputs reused by the Series examples below: the same three values
# held as a plain list, a NumPy array, and a dict keyed by label.
labels = list('abc')               # index labels, as a list
my_data = [10, 20, 30]             # raw values, as a list
arr = np.asarray(my_data)          # the same values as a NumPy array
d = dict(zip(labels, my_data))     # label -> value mapping (dictionary)

# Series

In [7]:
# Only data is supplied, so the Series gets the default integer index 0, 1, 2.
pd.Series(data= my_data)

0    10
1    20
2    30
dtype: int64

In [8]:
# Pair each value with a label: `labels` becomes the index of the Series.
pd.Series(data=my_data, index=labels)

a    10
b    20
c    30
dtype: int64

In [9]:
# A NumPy array works as the data argument too. The recorded output shows int32
# rather than the int64 seen for the list -- presumably the array's own dtype is
# kept; verify on your platform.
pd.Series(arr)

0    10
1    20
2    30
dtype: int32

In [10]:
# From a dict: the keys become the index and the values become the data.
pd.Series(d)

a    10
b    20
c    30
dtype: int64

In [13]:
# A Series can hold arbitrary Python objects (here, built-in functions);
# the resulting dtype is object.
pd.Series(data = [sum, print, len])

0      <built-in function sum>
1    <built-in function print>
2      <built-in function len>
dtype: object

In [19]:
# Four data points labelled by country: 'USA' is the index label for the
# value 1, 'Germany' for 2, and so on.
ser1 = pd.Series(
    data=[1, 2, 3, 4],
    index=['USA', 'Germany', 'USSR', 'Japan'],
)
ser1

USA        1
Germany    2
USSR       3
Japan      4
dtype: int64

In [20]:
# Second labelled Series: it has 'India' where ser1 has 'USSR', so the two
# indexes only partly overlap.
ser2 = pd.Series(
    data=[1, 2, 5, 4],
    index=['USA', 'Germany', 'India', 'Japan'],
)
ser2

USA        1
Germany    2
India      5
Japan      4
dtype: int64

In [21]:
# Look up a single value by its index label.
ser1['USA']

1

In [22]:
# A Series of strings built from `labels`; no index is passed here.
ser3 = pd.Series(data = labels)

In [23]:
# Look up the entry stored under 0 in ser3 (the first element, 'a').
ser3[0]

'a'

In [25]:
# Addition lines the two Series up by index label. Labels that exist in only one
# of them (India, USSR) have no match, so the result is NaN there; the matched
# values come back as floats so that no information is lost.
ser1 + ser2

Germany    4.0
India      NaN
Japan      8.0
USA        2.0
USSR       NaN
dtype: float64

# DataFrames

In [107]:
import numpy as np
import pandas as pd

In [108]:
from numpy.random import randn

In [109]:
# Seed NumPy's global random state so the randn values below are repeatable.
np.random.seed(101)

In [110]:
# The arguments are data, index, columns. A DataFrame is a bunch of Series
# (one per column) that all share the same index.
df = pd.DataFrame(
    data=randn(5, 4),
    index=['A', 'B', 'C', 'D', 'E'],
    columns=['W', 'X', 'Y', 'Z'],
)
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [111]:
# Bracket selection with one column name returns that column as a Series.
df['W']

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [112]:
# Passing a list of column names returns a DataFrame with just those columns.
df[['W', 'Z']]

Unnamed: 0,W,Z
A,2.70685,0.503826
B,0.651118,0.605965
C,-2.018168,-0.589001
D,0.188695,0.955057
E,0.190794,0.683509


In [113]:
# A single selected column is a pandas Series.
type(df['W'])

pandas.core.series.Series

In [114]:
# The whole table is a pandas DataFrame.
type(df)

pandas.core.frame.DataFrame

In [115]:
# Assigning to a column name that does not exist yet adds a new column 'new';
# here it holds the element-wise sum of columns W and Y.
df['new'] = df['W'] + df['Y']
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.614819
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
C,-2.018168,0.740122,0.528813,-0.589001,-1.489355
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542
E,0.190794,1.978757,2.605967,0.683509,2.796762


In [116]:
# shape is (rows, columns): 5 rows and, with 'new' added, 5 columns.
df.shape

(5, 5)

In [117]:
# In (rows, columns) terms, rows are axis=0 and columns are axis=1. drop() defaults
# to axis=0 (rows), so axis=1 has to be passed to drop a column.
df.drop('new', axis=1)

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [118]:
# Dropping a row: axis=0 is already the default, so no axis argument is needed.
df.drop('E')

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.614819
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
C,-2.018168,0.740122,0.528813,-0.589001,-1.489355
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542


In [119]:
# 'new' and 'E' are both still here: drop() did not modify df, because pandas does
# not want you to lose information by accident. The next cell makes a drop permanent.
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.614819
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
C,-2.018168,0.740122,0.528813,-0.589001,-1.489355
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542
E,0.190794,1.978757,2.605967,0.683509,2.796762


In [120]:
# inplace=True applies the drop to df itself, which makes it permanent.
df.drop('new', axis=1, inplace=True)
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [121]:
# Displaying df again confirms that the 'new' column is gone.
df  

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [123]:
# .loc selects a row by its index label; the row comes back as a Series.
df.loc['C']

W   -2.018168
X    0.740122
Y    0.528813
Z   -0.589001
Name: C, dtype: float64

In [125]:
# .iloc selects by integer position instead of label: position 2 is row 'C'.
df.iloc[2]

W   -2.018168
X    0.740122
Y    0.528813
Z   -0.589001
Name: C, dtype: float64

In [127]:
# .loc[row_label, column_label] returns the single value at row B, column Y.
df.loc['B', 'Y']

-0.8480769834036315

In [128]:
# Lists of row labels and column labels return the matching subset as a DataFrame.
df.loc[['A', 'B'], ['W', 'Y']]

Unnamed: 0,W,Y
A,2.70685,0.907969
B,0.651118,-0.848077


Conditional selection and multi-index (part of DataFrames)

In [2]:
import numpy as np
import pandas as pd

In [3]:
from numpy.random import randn

In [4]:
# Seed NumPy's global random state so the randn values below are repeatable.
np.random.seed(101)

In [5]:
# Build the 5x4 example frame from randn values: rows labelled A-E,
# columns labelled W-Z.
row_labels = ['A', 'B', 'C', 'D', 'E']
col_labels = ['W', 'X', 'Y', 'Z']
df = pd.DataFrame(data=randn(5, 4), index=row_labels, columns=col_labels)
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [6]:
# Comparing the DataFrame with a scalar gives a DataFrame of True/False values
# with the same rows and columns.
df > 0

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,False,False,True
C,False,True,True,False
D,True,False,False,True
E,True,True,True,True


In [7]:
# Keep the boolean mask in a variable so it can be used for selection.
booldf = df>0

In [8]:
# Keeps the value wherever the mask is True; positions where the mask is False
# come back as NaN (not as False).
df[booldf]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [9]:
# The same comparison on a single column gives a boolean Series, one entry per row.
df['W'] > 0

A     True
B     True
C    False
D     True
E     True
Name: W, dtype: bool

In [10]:
# A boolean Series filters rows: only rows where the condition is True are returned.
# Row C is left out because its W value is not > 0.
df[df['W']>0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [13]:
# Column X for the rows where W > 0. A single .loc[row_mask, column_label] call
# replaces the chained df[df['W']>0]['X'] lookup and returns the same Series.
df.loc[df['W'] > 0, 'X']

A    0.628133
B   -0.319318
D   -0.758872
E    1.978757
Name: X, dtype: float64

In [16]:
 # Same selection as df[df['W']>0][['Y','X']], broken into named steps.
 boolser = df['W']>0      # boolean mask: True for rows where W > 0
 result = df[boolser]     # only the rows that pass the mask
 result[['Y', 'X']]       # columns Y and X (in that order) of those rows

Unnamed: 0,Y,X
A,0.907969,0.628133
B,-0.848077,-0.319318
D,-0.933237,-0.758872
E,2.605967,1.978757


In [17]:
# Combine conditions with & instead of `and`, each condition in its own parentheses.
# df[(df['W']>0) and (df['Y']>1)] throws an error, because each side is a whole
# Series of True/False values rather than a single True/False.
df[(df['W']>0) & (df['Y']>1)]

Unnamed: 0,W,X,Y,Z
E,0.190794,1.978757,2.605967,0.683509


In [27]:
# Use | instead of `or` to avoid an error when combining conditions; when passing
# multiple conditions, wrap each one in parentheses.
df[(df['W']>0) | (df['Y']>1)]

Unnamed: 0,W,X,Y,Z,state,city
A,2.70685,0.628133,0.907969,0.503826,CA,CA
B,0.651118,-0.319318,-0.848077,0.605965,NY,NY
D,0.188695,-0.758872,-0.933237,0.955057,OR,OR
E,0.190794,1.978757,2.605967,0.683509,CO,CO


In [37]:
# reset_index() moves the current index (A-E) into a column named 'index' and
# numbers the rows 0..4 instead.
df.reset_index()

Unnamed: 0,index,W,X,Y,Z,state,city,cities
0,A,2.70685,0.628133,0.907969,0.503826,CA,CA,CA
1,B,0.651118,-0.319318,-0.848077,0.605965,NY,NY,NY
2,C,-2.018168,0.740122,0.528813,-0.589001,WY,WY,WY
3,D,0.188695,-0.758872,-0.933237,0.955057,OR,OR,OR
4,E,0.190794,1.978757,2.605967,0.683509,CO,CO,CO


In [38]:
# Five state codes, one per row of df.
newind = ['CA', 'NY', 'WY', 'OR', 'CO']
newind

['CA', 'NY', 'WY', 'OR', 'CO']

In [39]:
# Assigning a list to a new column name adds it as a column, one entry per row.
df['cities'] = newind
df

Unnamed: 0,W,X,Y,Z,state,city,cities
A,2.70685,0.628133,0.907969,0.503826,CA,CA,CA
B,0.651118,-0.319318,-0.848077,0.605965,NY,NY,NY
C,-2.018168,0.740122,0.528813,-0.589001,WY,WY,WY
D,0.188695,-0.758872,-0.933237,0.955057,OR,OR,OR
E,0.190794,1.978757,2.605967,0.683509,CO,CO,CO


In [42]:
# set_index() uses the 'cities' column as the index. Both set_index() and
# reset_index() are not in-place by default, so the changes are not permanent.
df.set_index('cities')

Unnamed: 0_level_0,W,X,Y,Z,state,city
cities,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CA,2.70685,0.628133,0.907969,0.503826,CA,CA
NY,0.651118,-0.319318,-0.848077,0.605965,NY,NY
WY,-2.018168,0.740122,0.528813,-0.589001,WY,WY
OR,0.188695,-0.758872,-0.933237,0.955057,OR,OR
CO,0.190794,1.978757,2.605967,0.683509,CO,CO


Multi-index and index hierarchy

In [44]:
import numpy as np
import pandas as pd

In [45]:
# Index Levels
outside = ['G1', 'G1', 'G1','G2', 'G2', 'G2']
inside = [1,2,3,1,2,3]
hier_index = list(zip(outside, inside))
hier_index = pd.MultiIndex.from_tuples(hier_index)

In [46]:
# Display the MultiIndex: one (outer, inner) tuple per row.
hier_index    

MultiIndex([('G1', 1),
            ('G1', 2),
            ('G1', 3),
            ('G2', 1),
            ('G2', 2),
            ('G2', 3)],
           )

In [48]:
# Multi-index DataFrame: G1 and G2 form one index level, and inside each of them
# 1, 2, 3 form another level of indexing. The columns are A and B.
df = pd.DataFrame(
    data=randn(6, 2),
    index=hier_index,
    columns=['A', 'B'],
)
df

Unnamed: 0,Unnamed: 1,A,B
G1,1,-0.497104,-0.75407
G1,2,-0.943406,0.484752
G1,3,-0.116773,1.901755
G2,1,0.238127,1.996652
G2,2,-0.993263,0.1968
G2,3,-1.136645,0.000366


In [50]:
# Select everything under outer label 'G1'; the result is indexed by the inner
# level (1, 2, 3) only.
df.loc['G1']

Unnamed: 0,A,B
1,-0.497104,-0.75407
2,-0.943406,0.484752
3,-0.116773,1.901755


In [51]:
# Chain a second .loc to pick inner label 1 within G1; that row comes back as a Series.
df.loc['G1'].loc[1]

A   -0.497104
B   -0.754070
Name: 1, dtype: float64

In [52]:
# Names of the index levels: both levels are currently unnamed (None).
df.index.names

FrozenList([None, None])

In [53]:
# Name the two index levels: the outer one 'Groups', the inner one 'Num'.
df.index.names = ['Groups', 'Num']
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Groups,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,-0.497104,-0.75407
G1,2,-0.943406,0.484752
G1,3,-0.116773,1.901755
G2,1,0.238127,1.996652
G2,2,-0.993263,0.1968
G2,3,-1.136645,0.000366


In [57]:
# Grab the single value at G2 -> 2 -> B. Passing the (outer, inner) label tuple and
# the column label to one .loc call does the lookup in a single step, instead of
# chaining df.loc['G2'].loc[2]['B']; the value returned is the same.
df.loc[('G2', 2), 'B']

0.19679950499134005

In [58]:
# Less commonly used: xs() grabs a cross-section, here every row whose 'Num' level
# equals 1 across all groups, which would be tricky to do with .loc.
df.xs(1, level='Num')

Unnamed: 0_level_0,A,B
Groups,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,-0.497104,-0.75407
G2,0.238127,1.996652


Missing Data

In [59]:
import numpy as np
import pandas as pd

In [61]:
# Small frame with missing values, built from a dictionary of columns:
# column A has one NaN, B has two, and C has none.
d = {
    'A': [1, 2, np.nan],
    'B': [5, np.nan, np.nan],
    'C': [1, 2, 3],
}
df = pd.DataFrame(data=d)
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [62]:
# Drops every row that contains a NaN value (the default axis is 0, i.e. rows).
df.dropna()

Unnamed: 0,A,B,C
0,1.0,5.0,1


In [63]:
# axis=1 drops every column that contains any NaN instead; only C survives.
df.dropna(axis=1)

Unnamed: 0,C
0,1
1,2
2,3


In [64]:
# thresh=2 keeps only the rows that have at least 2 non-NaN values. Row 2 has a
# single non-NaN value (C), so it is the only row dropped here.
df.dropna(thresh=2)

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2


In [65]:
# fillna() replaces every NaN with the given value (any placeholder you like).
df.fillna(value='WHATEVERY U WANT TO FILL WITH')

Unnamed: 0,A,B,C
0,1,5,1
1,2,WHATEVERY U WANT TO FILL WITH,2
2,WHATEVERY U WANT TO FILL WITH,WHATEVERY U WANT TO FILL WITH,3


In [67]:
# The NaN in column A is filled with the mean of A (1.5 in the output below).
df['A'].fillna(value=df['A'].mean())

0    1.0
1    2.0
2    1.5
Name: A, dtype: float64

Groupby

In [68]:
import numpy as np
import pandas as pd

In [71]:
# Toy sales records: two people per company, one sales figure per person.
data = dict(
    Company=['GOOG', 'GOOG', 'MSFT', 'MSFT', 'FB', 'FB'],
    Person=['Sam', 'Charlie', 'Amy', 'Vanessa', 'Carl', 'Shashwat'],
    Sales=[200, 120, 340, 124, 243, 350],
)

df = pd.DataFrame(data=data)
df

Unnamed: 0,Company,Person,Sales
0,GOOG,Sam,200
1,GOOG,Charlie,120
2,MSFT,Amy,340
3,MSFT,Vanessa,124
4,FB,Carl,243
5,FB,Shashwat,350


In [73]:
# Group the rows by company, then average each group.
# numeric_only=True restricts the mean to numeric columns: 'Person' holds strings,
# so it has no mean. Recent pandas versions raise a TypeError instead of silently
# skipping such columns when the flag is left at its default.
byCompany = df.groupby('Company')
byCompany.mean(numeric_only=True)

Unnamed: 0_level_0,Sales
Company,Unnamed: 1_level_1
FB,296.5
GOOG,160.0
MSFT,232.0


In [74]:
# Total sales per company. numeric_only=True keeps the string column 'Person' out
# of the result; without it, recent pandas versions also "sum" (concatenate) the
# names, which would not match the Sales-only table shown below.
byCompany.sum(numeric_only=True)

Unnamed: 0_level_0,Sales
Company,Unnamed: 1_level_1
FB,593
GOOG,320
MSFT,464


In [76]:
# Total sales for one company: sum the numeric columns per group, then pick the
# 'FB' row by label. numeric_only=True keeps the string column 'Person' out of the
# result on recent pandas versions.
byCompany.sum(numeric_only=True).loc['FB']

Sales    593
Name: FB, dtype: int64

In [77]:
# Standard deviation of sales per company. The numeric column is selected
# explicitly because a standard deviation cannot be computed for the string column
# 'Person' (recent pandas versions raise rather than skip it). Selecting with a
# list keeps the result a DataFrame with a single 'Sales' column.
byCompany[['Sales']].std()

Unnamed: 0_level_0,Sales
Company,Unnamed: 1_level_1
FB,75.660426
GOOG,56.568542
MSFT,152.735065
