In [15]:
# series

# imports
import numpy as np
import pandas as pd

# Creating labels and data
labels = ['a', 'b', 'c']
my_data = [10, 20, 30]


# Creating Series
def create_series(data, index=None):
    """Build a pandas Series from ``data``.

    Parameters
    ----------
    data : the values to store (the notebook passes plain lists).
    index : optional labels, one per value; when omitted (``None``) it is
        forwarded as-is and pandas assigns its default index.

    Returns
    -------
    pandas.Series
    """
    series = pd.Series(data, index=index)
    return series


# Printing Series
print(create_series(my_data))
print(create_series(my_data, labels))

# Series with countries
ser = create_series([1, 2, 3, 4], ['USA', 'Germany', 'USSR', 'Japan'])
print(ser)

# Accessing Series by key
print(ser['USA'])

# Creating DataFrame demo
df = pd.DataFrame({
    'Country': ['USA', 'Germany', 'Italy', 'Japan'],
    'Values': [1, 2, 5, 4]
})
print(df)
df

0    10
1    20
2    30
dtype: int64
a    10
b    20
c    30
dtype: int64
USA        1
Germany    2
USSR       3
Japan      4
dtype: int64
1
   Country  Values
0      USA       1
1  Germany       2
2    Italy       5
3    Japan       4


Unnamed: 0,Country,Values
0,USA,1
1,Germany,2
2,Italy,5
3,Japan,4


## Dataframes - Part 1


In [16]:
from numpy.random import randn

np.random.seed(101)


In [17]:
# 5x4 frame of random normals: rows labelled A-E, columns labelled W-Z.
df = pd.DataFrame(
    data=randn(5, 4),
    index=list('ABCDE'),
    columns=list('WXYZ'),
)
print(df)

          W         X         Y         Z
A  2.706850  0.628133  0.907969  0.503826
B  0.651118 -0.319318 -0.848077  0.605965
C -2.018168  0.740122  0.528813 -0.589001
D  0.188695 -0.758872 -0.933237  0.955057
E  0.190794  1.978757  2.605967  0.683509


In [18]:
w_column = df['W']
print(w_column)
print(type(w_column))

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64
<class 'pandas.core.series.Series'>


In [21]:
df['AA'] = df['W'] + df['Y']

In [23]:
# No remaining cell creates a 'new' column (execution count 22 is missing from the
# notebook), so on a fresh kernel the drop below raised KeyError. Create the column
# here so the in-place drop demo is self-contained; after this cell df is exactly
# as it was before it.
df['new'] = df['W'] + df['Y']
df.drop('new', axis=1, inplace=True)

In [24]:
df

Unnamed: 0,W,X,Y,Z,AA
A,2.70685,0.628133,0.907969,0.503826,3.614819
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
C,-2.018168,0.740122,0.528813,-0.589001,-1.489355
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542
E,0.190794,1.978757,2.605967,0.683509,2.796762


Passing `inplace=True` to `df.drop` modifies the original DataFrame and returns `None`. Without `inplace=True` (the default), the original DataFrame is left unchanged and `df.drop` returns a new DataFrame with the requested rows or columns removed.

```python
df.drop('column_name', axis=1, inplace=True)
```


In [25]:
df.drop('E', axis=0)

Unnamed: 0,W,X,Y,Z,AA
A,2.70685,0.628133,0.907969,0.503826,3.614819
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
C,-2.018168,0.740122,0.528813,-0.589001,-1.489355
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542


In [27]:
df.shape

(5, 5)

In [28]:
df[['Z', 'X']]

Unnamed: 0,Z,X
A,0.503826,0.628133
B,0.605965,-0.319318
C,-0.589001,0.740122
D,0.955057,-0.758872
E,0.683509,1.978757


In [29]:
df.loc['A']


W     2.706850
X     0.628133
Y     0.907969
Z     0.503826
AA    3.614819
Name: A, dtype: float64

In [30]:
df.iloc[2]

W    -2.018168
X     0.740122
Y     0.528813
Z    -0.589001
AA   -1.489355
Name: C, dtype: float64

In [31]:
df.loc['B', 'Y']

-0.8480769834036315

In [32]:
df.loc[['A', 'B'], ['W', 'Y']]

Unnamed: 0,W,Y
A,2.70685,0.907969
B,0.651118,-0.848077


## Dataframes - Part 2


In [33]:
df

Unnamed: 0,W,X,Y,Z,AA
A,2.70685,0.628133,0.907969,0.503826,3.614819
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
C,-2.018168,0.740122,0.528813,-0.589001,-1.489355
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542
E,0.190794,1.978757,2.605967,0.683509,2.796762


In [34]:
df.drop('AA', axis=1, inplace=True)
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [35]:
df > 0

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,False,False,True
C,False,True,True,False
D,True,False,False,True
E,True,True,True,True


In [36]:
bool_df = df > 0
df[bool_df]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [37]:
df['W'] > 0

A     True
B     True
C    False
D     True
E     True
Name: W, dtype: bool

In [38]:
df[df['W'] > 0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [40]:
result_df = df[df['Z'] < 0]

In [41]:
result_df

Unnamed: 0,W,X,Y,Z
C,-2.018168,0.740122,0.528813,-0.589001


In [42]:
result_df['X']

C    0.740122
Name: X, dtype: float64

In [43]:
# Filter rows (W positive) and pick column X in one .loc call instead of chaining
# two [] lookups; the resulting Series is the same.
df.loc[df['W'] > 0, 'X']

A    0.628133
B   -0.319318
D   -0.758872
E    1.978757
Name: X, dtype: float64

In [44]:
# Filter rows (W positive) and pick columns Y and X, in that order, with a single
# .loc call instead of chaining two [] lookups; the resulting frame is the same.
df.loc[df['W'] > 0, ['Y', 'X']]

Unnamed: 0,Y,X
A,0.907969,0.628133
B,-0.848077,-0.319318
D,-0.933237,-0.758872
E,2.605967,1.978757


In [45]:
df[(df['W'] > 0) & (df['Y'] > 1)]

Unnamed: 0,W,X,Y,Z
E,0.190794,1.978757,2.605967,0.683509


In [46]:
df.reset_index()

Unnamed: 0,index,W,X,Y,Z
0,A,2.70685,0.628133,0.907969,0.503826
1,B,0.651118,-0.319318,-0.848077,0.605965
2,C,-2.018168,0.740122,0.528813,-0.589001
3,D,0.188695,-0.758872,-0.933237,0.955057
4,E,0.190794,1.978757,2.605967,0.683509


In [47]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [48]:
new_ind = 'CA NY WY OR CO'.split()
df['States'] = new_ind

In [49]:
df

Unnamed: 0,W,X,Y,Z,States
A,2.70685,0.628133,0.907969,0.503826,CA
B,0.651118,-0.319318,-0.848077,0.605965,NY
C,-2.018168,0.740122,0.528813,-0.589001,WY
D,0.188695,-0.758872,-0.933237,0.955057,OR
E,0.190794,1.978757,2.605967,0.683509,CO


In [50]:
# Displays a copy of df indexed by the 'States' column. The result is not assigned
# and inplace is not used, so df itself keeps 'States' as an ordinary column.
df.set_index('States')

Unnamed: 0_level_0,W,X,Y,Z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,2.70685,0.628133,0.907969,0.503826
NY,0.651118,-0.319318,-0.848077,0.605965
WY,-2.018168,0.740122,0.528813,-0.589001
OR,0.188695,-0.758872,-0.933237,0.955057
CO,0.190794,1.978757,2.605967,0.683509


In [51]:
# Mutates df in place: the A-E row labels become a regular 'index' column and the
# rows get a fresh 0..4 integer index.
# NOTE(review): this cell is not idempotent; re-running it without re-running the
# cells above would reset the index again. Confirm before re-executing.
df.reset_index(inplace=True)

In [52]:
df

Unnamed: 0,index,W,X,Y,Z,States
0,A,2.70685,0.628133,0.907969,0.503826,CA
1,B,0.651118,-0.319318,-0.848077,0.605965,NY
2,C,-2.018168,0.740122,0.528813,-0.589001,WY
3,D,0.188695,-0.758872,-0.933237,0.955057,OR
4,E,0.190794,1.978757,2.605967,0.683509,CO


## Dataframes - Part 3


In [53]:
# Outer level: the group label of each row. Inner level: the position within the group.
outside = ['G1'] * 3 + ['G2'] * 3
inside = [1, 2, 3] * 2
# Pair the two levels element-wise and build the two-level index in one step,
# so hier_index is only ever bound to the MultiIndex.
hier_index = pd.MultiIndex.from_tuples(list(zip(outside, inside)))

In [54]:
outside

['G1', 'G1', 'G1', 'G2', 'G2', 'G2']

In [55]:
inside

[1, 2, 3, 1, 2, 3]

In [56]:
hier_index

MultiIndex([('G1', 1),
            ('G1', 2),
            ('G1', 3),
            ('G2', 1),
            ('G2', 2),
            ('G2', 3)],
           )

In [57]:
# 6x2 frame of random normals using the two-level index built above; columns A and B.
df = pd.DataFrame(
    data=randn(6, 2),
    index=hier_index,
    columns=['A', 'B'],
)
df

Unnamed: 0,Unnamed: 1,A,B
G1,1,0.302665,1.693723
G1,2,-1.706086,-1.159119
G1,3,-0.134841,0.390528
G2,1,0.166905,0.184502
G2,2,0.807706,0.07296
G2,3,0.638787,0.329646


In [58]:
df.loc['G1']

Unnamed: 0,A,B
1,0.302665,1.693723
2,-1.706086,-1.159119
3,-0.134841,0.390528


In [59]:
df.loc['G2'].loc[1]

A    0.166905
B    0.184502
Name: 1, dtype: float64

In [60]:
df.index.names = ['Groups', 'Num']
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Groups,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,0.302665,1.693723
G1,2,-1.706086,-1.159119
G1,3,-0.134841,0.390528
G2,1,0.166905,0.184502
G2,2,0.807706,0.07296
G2,3,0.638787,0.329646


In [61]:
# One .loc call: the tuple addresses (outer level, inner level) of the row index,
# the second argument is the column. Returns the same scalar as chaining
# .loc['G2'].loc[2]['B'] without building two intermediate objects.
df.loc[('G2', 2), 'B']

0.07295967531703869

In [62]:
# One .loc call: the tuple addresses (outer level, inner level) of the row index,
# the second argument is the column. Returns the same scalar as chaining
# .loc['G2'].loc[1]['A'] without building two intermediate objects.
df.loc[('G2', 1), 'A']

0.16690463609281317

In [63]:
df.xs('G1')

Unnamed: 0_level_0,A,B
Num,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.302665,1.693723
2,-1.706086,-1.159119
3,-0.134841,0.390528


In [64]:
df.xs(1, level='Num')

Unnamed: 0_level_0,A,B
Groups,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,0.302665,1.693723
G2,0.166905,0.184502


## Missing Data

```python
df.dropna()
df.dropna(axis=1)
df.dropna(thresh=2)
df.dropna(subset=['column_name'])
df.fillna(value='FILL VALUE')
df['column_name'].fillna(value=df['column_name'].mean())
```


In [65]:
d = {'A': [1, 2, np.nan], 'B': [5, np.nan, np.nan], 'C': [1, 2, 3]}

In [66]:
d

{'A': [1, 2, nan], 'B': [5, nan, nan], 'C': [1, 2, 3]}

In [67]:
d = pd.DataFrame(d)

In [68]:
d

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3
