# Python Pandas

## Series
The key difference between a NumPy array and a pandas Series is that a Series carries an index of labels, so its values can be accessed by label as well as by position.

In [32]:
import numpy as np
import pandas as pd
from numpy.random import rand

In [3]:
# Index labels reused by the Series examples below.
label = list('abc')

In [4]:
# Values for the example Series: 10, 20, 30.
my_data = list(range(10, 40, 10))

In [5]:
# NumPy array holding the same values as my_data; used below to show that a
# Series can be built from an ndarray as well as from a list.
arr = np.array(my_data)

In [6]:
# Label -> value mapping; used below to build a Series straight from a dict.
data_dict = dict(a=10, b=20, c=30)

In [7]:
# No index is supplied, so the Series gets the default integer labels 0, 1, 2.
pd.Series(my_data)

0    10
1    20
2    30
dtype: int64

In [8]:
# Same values as before, now labelled with the entries of `label`.
pd.Series(my_data, index=label)

a    10
b    20
c    30
dtype: int64

In [10]:
# A NumPy array works as the data source too; keywords make the roles explicit.
pd.Series(data=arr, index=label)

a    10
b    20
c    30
dtype: int64

In [11]:
# A dict supplies both parts at once: keys become the index, values the data.
pd.Series(data=data_dict)

a    10
b    20
c    30
dtype: int64

In [12]:
# A Series is not limited to numbers: a list of strings yields dtype object.
pd.Series(label)

0    a
1    b
2    c
dtype: object

In [13]:
# First operand for the label-alignment example; 'USSR' appears only here.
series_1 = pd.Series(
    data=[1, 2, 3, 4],
    index=['USA', 'Germany', 'USSR', 'Japan'],
)

In [14]:
series_1

USA        1
Germany    2
USSR       3
Japan      4
dtype: int64

In [15]:
# Second operand for the label-alignment example; 'Italy' appears only here.
series_2 = pd.Series(
    data=[1, 2, 5, 4],
    index=['USA', 'Germany', 'Italy', 'Japan'],
)

In [16]:
series_2

USA        1
Germany    2
Italy      5
Japan      4
dtype: int64

In [18]:
# Addition aligns on index labels, not on position. Labels present in only one
# operand (Italy, USSR) come out as NaN, and the result is float64 even though
# both inputs are int64.
series_1 + series_2

Germany    4.0
Italy      NaN
Japan      8.0
USA        2.0
USSR       NaN
dtype: float64

## Dataframes
A DataFrame is a table made up of several Series (one per column) that share the same index.

In [34]:
# Seed NumPy's global random state so the rand(...) call that builds the demo
# DataFrame produces the same values on every Restart & Run All.
# (`rand` itself is imported in the notebook's import cell at the top.)
np.random.seed(101)

In [35]:
# 5x4 frame of random values: rows labelled A-E, columns labelled W-Z.
df = pd.DataFrame(
    data=rand(5, 4),
    index=list('ABCDE'),
    columns=list('WXYZ'),
)

In [36]:
df

Unnamed: 0,W,X,Y,Z
A,0.516399,0.570668,0.028474,0.171522
B,0.685277,0.833897,0.306966,0.893613
C,0.721544,0.189939,0.554228,0.352132
D,0.181892,0.785602,0.965483,0.232354
E,0.083561,0.603548,0.728993,0.276239


In [37]:
# Bracket selection with a single column label returns that column as a Series
# that keeps the frame's row labels and is named after the column.
df['W']

A    0.516399
B    0.685277
C    0.721544
D    0.181892
E    0.083561
Name: W, dtype: float64

In [38]:
# Confirm that a single selected column is a pandas Series.
type(df['W'])

pandas.core.series.Series

In [39]:
# Attribute-style access to the same column; the output matches df['W'].
# NOTE(review): this form presumably only works when the column name is a valid
# identifier that does not clash with a DataFrame attribute -- confirm against
# the pandas indexing docs; df['W'] is the safer spelling.
df.W

A    0.516399
B    0.685277
C    0.721544
D    0.181892
E    0.083561
Name: W, dtype: float64

In [41]:
# Passing a list of labels selects several columns and returns a DataFrame.
df[['W', 'Z']]

Unnamed: 0,W,Z
A,0.516399,0.171522
B,0.685277,0.893613
C,0.721544,0.352132
D,0.181892,0.232354
E,0.083561,0.276239


In [42]:
# Assigning to a label that does not exist yet appends a new column; here it is
# the row-wise sum of W and Y.
df['new'] = df['W'] + df['Y']

In [43]:
df

Unnamed: 0,W,X,Y,Z,new
A,0.516399,0.570668,0.028474,0.171522,0.544873
B,0.685277,0.833897,0.306966,0.893613,0.992243
C,0.721544,0.189939,0.554228,0.352132,1.275771
D,0.181892,0.785602,0.965483,0.232354,1.147376
E,0.083561,0.603548,0.728993,0.276239,0.812554


In [45]:
# drop() returns a new frame without the 'new' column; `df` itself is left
# untouched, as the next display of `df` shows.
df.drop(columns='new')

Unnamed: 0,W,X,Y,Z
A,0.516399,0.570668,0.028474,0.171522
B,0.685277,0.833897,0.306966,0.893613
C,0.721544,0.189939,0.554228,0.352132
D,0.181892,0.785602,0.965483,0.232354
E,0.083561,0.603548,0.728993,0.276239


In [46]:
df

Unnamed: 0,W,X,Y,Z,new
A,0.516399,0.570668,0.028474,0.171522,0.544873
B,0.685277,0.833897,0.306966,0.893613,0.992243
C,0.721544,0.189939,0.554228,0.352132,1.275771
D,0.181892,0.785602,0.965483,0.232354,1.147376
E,0.083561,0.603548,0.728993,0.276239,0.812554


In [47]:
# Make the removal permanent by rebinding `df` to the frame drop() returns.
# This replaces drop(..., inplace=True): the end state of `df` is the same, but
# the change is visible as an assignment and the frame shown by earlier cells
# is not mutated behind the reader's back.
df = df.drop(columns='new')

In [48]:
df

Unnamed: 0,W,X,Y,Z
A,0.516399,0.570668,0.028474,0.171522
B,0.685277,0.833897,0.306966,0.893613
C,0.721544,0.189939,0.554228,0.352132
D,0.181892,0.785602,0.965483,0.232354
E,0.083561,0.603548,0.728993,0.276239


#### Selecting Dataframe Rows

In [49]:
# .loc selects by label: row 'A' comes back as a Series indexed by the column names.
df.loc['A']

W    0.516399
X    0.570668
Y    0.028474
Z    0.171522
Name: A, dtype: float64

In [50]:
# .iloc selects by zero-based position: position 2 is the row labelled 'C'.
df.iloc[2]

W    0.721544
X    0.189939
Y    0.554228
Z    0.352132
Name: C, dtype: float64

#### Selecting Subset of Rows and Columns

In [51]:
# Positional slice on both axes: rows from position 2 onward (C-E) and columns
# from position 2 onward (Y, Z).
df.iloc[2:,2:]

Unnamed: 0,Y,Z
C,0.554228,0.352132
D,0.965483,0.232354
E,0.728993,0.276239


In [56]:
# Label-based selection on both axes: rows A and B, columns W and Y.
df.loc[['A','B'],['W', 'Y']]

Unnamed: 0,W,Y
A,0.516399,0.028474
B,0.685277,0.306966


#### Conditional data selection within a DataFrame

In [58]:
# Element-wise comparison: yields a frame of the same shape filled with booleans.
df > 0.5

Unnamed: 0,W,X,Y,Z
A,True,True,False,False
B,True,True,False,True
C,True,False,True,False
D,False,True,True,False
E,False,True,True,False


In [59]:
# Indexing with a boolean frame keeps the shape; cells where the condition is
# False come back as missing values (the blanks in the output).
df[df>0.5]

Unnamed: 0,W,X,Y,Z
A,0.516399,0.570668,,
B,0.685277,0.833897,,0.893613
C,0.721544,,0.554228,
D,,0.785602,0.965483,
E,,0.603548,0.728993,


In [60]:
# Comparing a single column gives a boolean Series with one flag per row.
df['W'] > 0.5

A     True
B     True
C     True
D    False
E    False
Name: W, dtype: bool

In [61]:
# Indexing with a boolean Series filters rows: only rows where W > 0.5 are
# kept, with all of their columns.
df[df['W'] > 0.5]

Unnamed: 0,W,X,Y,Z
A,0.516399,0.570668,0.028474,0.171522
B,0.685277,0.833897,0.306966,0.893613
C,0.721544,0.189939,0.554228,0.352132


In [63]:
# Filter rows (W > 0.5) and pick columns W and X in a single .loc call instead
# of chaining two [] lookups, which builds a throwaway intermediate frame and
# is the pattern that breaks when later used for assignment.
df.loc[df['W'] > 0.5, ['W', 'X']]

Unnamed: 0,W,X
A,0.516399,0.570668
B,0.685277,0.833897
C,0.721544,0.189939


In [66]:
# Combine row conditions with & (element-wise AND) and wrap each comparison in
# its own parentheses. Important: Python's `and` keyword won't work here.
df[(df['W'] > 0.5) & (df['Z']>0.5)]

Unnamed: 0,W,X,Y,Z
B,0.685277,0.833897,0.306966,0.893613


In [67]:
# Combine row conditions with | (element-wise OR) and wrap each comparison in
# its own parentheses. Important: Python's `or` keyword won't work here.
df[(df['W'] > 0.5) | (df['Z']>0.5)]

Unnamed: 0,W,X,Y,Z
A,0.516399,0.570668,0.028474,0.171522
B,0.685277,0.833897,0.306966,0.893613
C,0.721544,0.189939,0.554228,0.352132


#### Handling missing data

In [68]:
# Sample data with gaps: column A has one missing value, B has two, C has none.
d = {
    'A': [1, 2, np.nan],
    'B': [5, np.nan, np.nan],
    'C': [1, 2, 3],
}

In [69]:
# Build the missing-data example frame from the dict above.
# NOTE: this rebinds `df`, so the random-number frame used in the previous
# sections is no longer reachable after this cell runs.
df = pd.DataFrame(data=d)

In [70]:
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [73]:
# Drop every row that contains at least one missing value; only row 0 is complete.
df.dropna(axis='index')

Unnamed: 0,A,B,C
0,1.0,5.0,1


In [75]:
# Drop every column that contains at least one missing value; only C is complete.
df.dropna(axis='columns')

Unnamed: 0,C
0,1
1,2
2,3


In [76]:
# Keep only rows with at least two non-missing values; row 2 has just one and
# is dropped.
df.dropna(axis='index', thresh=2)

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2


In [79]:
# Replace every missing value with the placeholder string 'empty'.
df.fillna(value='empty')

Unnamed: 0,A,B,C
0,1,5,1
1,2,empty,2
2,empty,empty,3


In [81]:
# Fill each gap with the mean of its own column (1.5 for A, 5.0 for B); the
# Series returned by df.mean() is matched to the columns by label.
df.fillna(value=df.mean())

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,5.0,2
2,1.5,5.0,3
