# Work with Pandas

##### https://realpython.com/pandas-dataframe/

In [1]:
import pandas as pd
import numpy as np
import datetime

## Sample Dataframe

In [2]:
# df_m = pd.DataFrame(data={'col1': [1.55, 2.55], 'col2': [1.55, 2.55]})
# df_m

# Round(2)
df_m = pd.DataFrame(np.array([[1.454, 2.555, 3.656], [4.454, 5.555, 6.656]]), 
                    columns=['A', 'B', 'C'])
df_m.round(decimals=2)

Unnamed: 0,A,B,C
0,1.45,2.56,3.66
1,4.45,5.56,6.66


In [3]:
# Round up
df_m = pd.DataFrame(np.array([[1.454, 2.555, 3.656], [4.454, 5.555, 6.656]]), 
                    columns=['A', 'B', 'C'])
df_m.apply(np.ceil)

Unnamed: 0,A,B,C
0,2.0,3.0,4.0
1,5.0,6.0,7.0


In [4]:
# Round down
df_m = pd.DataFrame(np.array([[1.454, 2.555, 3.656], [4.454, 5.555, 6.656]]), 
                    columns=['A', 'B', 'C'])
df_m.apply(np.floor)

Unnamed: 0,A,B,C
0,1.0,2.0,3.0
1,4.0,5.0,6.0


## Dataframe with random values

In [5]:
rand_n = np.random.randint(0, 1_000)

rand_data = []

for x in range(0, rand_n):
    dollars = np.random.randint(1_000, 10_000)
    # print(dollars)
    rand_tmp_data = {
        'Name': f'Name-{x}',
        'Salary': f'${dollars:,.2f}'
        } 
    rand_data.append(rand_tmp_data)

rand_n
# rand_data

639

In [6]:
rand_df = pd.DataFrame(rand_data)
rand_df

Unnamed: 0,Name,Salary
0,Name-0,"$7,365.00"
1,Name-1,"$5,982.00"
2,Name-2,"$9,076.00"
3,Name-3,"$7,847.00"
4,Name-4,"$6,932.00"
...,...,...
634,Name-634,"$4,398.00"
635,Name-635,"$2,665.00"
636,Name-636,"$5,637.00"
637,Name-637,"$7,109.00"


## Demo

In [7]:
n = 5

data = [x for x in range(n)]
data

[0, 1, 2, 3, 4]

In [10]:
data = [{'row_num': x, 'timestamp': datetime.datetime.now(), 'added_by': 'Ruzlim'} for x in range(n)]
data

[{'row_num': 0,
  'timestamp': datetime.datetime(2024, 3, 6, 12, 46, 40, 347135),
  'added_by': 'Ruzlim'},
 {'row_num': 1,
  'timestamp': datetime.datetime(2024, 3, 6, 12, 46, 40, 347135),
  'added_by': 'Ruzlim'},
 {'row_num': 2,
  'timestamp': datetime.datetime(2024, 3, 6, 12, 46, 40, 347135),
  'added_by': 'Ruzlim'},
 {'row_num': 3,
  'timestamp': datetime.datetime(2024, 3, 6, 12, 46, 40, 347135),
  'added_by': 'Ruzlim'},
 {'row_num': 4,
  'timestamp': datetime.datetime(2024, 3, 6, 12, 46, 40, 347135),
  'added_by': 'Ruzlim'}]

## Preview Data

In [11]:
sdf = pd.DataFrame(data)
sdf

Unnamed: 0,row_num,timestamp,added_by
0,0,2024-03-06 12:46:40.347135,Ruzlim
1,1,2024-03-06 12:46:40.347135,Ruzlim
2,2,2024-03-06 12:46:40.347135,Ruzlim
3,3,2024-03-06 12:46:40.347135,Ruzlim
4,4,2024-03-06 12:46:40.347135,Ruzlim


In [12]:
sdf.shape

(5, 3)

In [13]:
sdf.columns

Index(['row_num', 'timestamp', 'added_by'], dtype='object')

In [14]:
sdf.index

RangeIndex(start=0, stop=5, step=1)

In [15]:
sdf.dtypes

row_num               int64
timestamp    datetime64[ns]
added_by             object
dtype: object

In [16]:
sdf.describe()

Unnamed: 0,row_num
count,5.0
mean,2.0
std,1.581139
min,0.0
25%,1.0
50%,2.0
75%,3.0
max,4.0


In [17]:
sdf.ndim

2

In [18]:
sdf.size

15

In [19]:
sdf.memory_usage()

Index        128
row_num       40
timestamp     40
added_by      40
dtype: int64

In [20]:
sdf

Unnamed: 0,row_num,timestamp,added_by
0,0,2024-03-06 12:46:40.347135,Ruzlim
1,1,2024-03-06 12:46:40.347135,Ruzlim
2,2,2024-03-06 12:46:40.347135,Ruzlim
3,3,2024-03-06 12:46:40.347135,Ruzlim
4,4,2024-03-06 12:46:40.347135,Ruzlim


In [21]:
# Check null value in each column
sdf.apply(lambda x: sum(x.isnull()), axis=0)

row_num      0
timestamp    0
added_by     0
dtype: int64

In [22]:
sdf.isnull()

Unnamed: 0,row_num,timestamp,added_by
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False


In [23]:
pd.isna(sdf)

Unnamed: 0,row_num,timestamp,added_by
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False


In [24]:
sdf[sdf['added_by'].notna()].reset_index()

Unnamed: 0,index,row_num,timestamp,added_by
0,0,0,2024-03-06 12:46:40.347135,Ruzlim
1,1,1,2024-03-06 12:46:40.347135,Ruzlim
2,2,2,2024-03-06 12:46:40.347135,Ruzlim
3,3,3,2024-03-06 12:46:40.347135,Ruzlim
4,4,4,2024-03-06 12:46:40.347135,Ruzlim


In [25]:
sdf.dropna(subset='added_by').reset_index()

Unnamed: 0,index,row_num,timestamp,added_by
0,0,0,2024-03-06 12:46:40.347135,Ruzlim
1,1,1,2024-03-06 12:46:40.347135,Ruzlim
2,2,2,2024-03-06 12:46:40.347135,Ruzlim
3,3,3,2024-03-06 12:46:40.347135,Ruzlim
4,4,4,2024-03-06 12:46:40.347135,Ruzlim


In [26]:
sdf.drop_duplicates(['timestamp'])

Unnamed: 0,row_num,timestamp,added_by
0,0,2024-03-06 12:46:40.347135,Ruzlim


In [27]:
sdf.loc[:, ['added_by','timestamp']].drop_duplicates(['timestamp'])

Unnamed: 0,added_by,timestamp
0,Ruzlim,2024-03-06 12:46:40.347135


In [28]:
# sdf.pivot(index='row_num', columns='timestamp', values='added_by')
sdf.pivot(index='timestamp', columns='row_num', values='added_by')

row_num,0,1,2,3,4
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-03-06 12:46:40.347135,Ruzlim,Ruzlim,Ruzlim,Ruzlim,Ruzlim


In [29]:
sdf_ = sdf.astype(dtype={'row_num': np.float32})
sdf_.dtypes

row_num             float32
timestamp    datetime64[ns]
added_by             object
dtype: object

In [30]:
# df_rev_day_last_mth.drop('TM_KEY_DAY', axis=1, inplace=True)
# del df_rev_day_last_mth['TM_KEY_DAY']

In [31]:
# df.head
sdf.head(3)

Unnamed: 0,row_num,timestamp,added_by
0,0,2024-03-06 12:46:40.347135,Ruzlim
1,1,2024-03-06 12:46:40.347135,Ruzlim
2,2,2024-03-06 12:46:40.347135,Ruzlim


In [32]:
sdf.tail()

Unnamed: 0,row_num,timestamp,added_by
0,0,2024-03-06 12:46:40.347135,Ruzlim
1,1,2024-03-06 12:46:40.347135,Ruzlim
2,2,2024-03-06 12:46:40.347135,Ruzlim
3,3,2024-03-06 12:46:40.347135,Ruzlim
4,4,2024-03-06 12:46:40.347135,Ruzlim


## loc

In [33]:
sdf.loc[2]

row_num                               2
timestamp    2024-03-06 12:46:40.347135
added_by                         Ruzlim
Name: 2, dtype: object

In [34]:
sdf.loc[:2, ['row_num', 'timestamp']]

Unnamed: 0,row_num,timestamp
0,0,2024-03-06 12:46:40.347135
1,1,2024-03-06 12:46:40.347135
2,2,2024-03-06 12:46:40.347135


## iloc

In [35]:
sdf.iloc[:2, 0:2]

Unnamed: 0,row_num,timestamp
0,0,2024-03-06 12:46:40.347135
1,1,2024-03-06 12:46:40.347135


In [36]:
sdf.iloc[:2, [0, 2]]

Unnamed: 0,row_num,added_by
0,0,Ruzlim
1,1,Ruzlim


## Filter

In [37]:
# Equal : ==
sdf[sdf['row_num'] == 3]

Unnamed: 0,row_num,timestamp,added_by
3,3,2024-03-06 12:46:40.347135,Ruzlim


In [38]:
# Not Equal : ~
sdf[~(sdf['row_num'] == 3)]

Unnamed: 0,row_num,timestamp,added_by
0,0,2024-03-06 12:46:40.347135,Ruzlim
1,1,2024-03-06 12:46:40.347135,Ruzlim
2,2,2024-03-06 12:46:40.347135,Ruzlim
4,4,2024-03-06 12:46:40.347135,Ruzlim


In [39]:
# AND : &
sdf[(sdf['row_num'] >= 2) & (sdf['row_num'] < 4)]

Unnamed: 0,row_num,timestamp,added_by
2,2,2024-03-06 12:46:40.347135,Ruzlim
3,3,2024-03-06 12:46:40.347135,Ruzlim


In [40]:
# OR : |
sdf[(sdf['row_num'] == 1) | (sdf['row_num'] == 3)]

Unnamed: 0,row_num,timestamp,added_by
1,1,2024-03-06 12:46:40.347135,Ruzlim
3,3,2024-03-06 12:46:40.347135,Ruzlim


In [41]:
# XOR : ^ (???)
sdf[(sdf['row_num'] == 1) ^ (sdf['row_num'] == 3)]

Unnamed: 0,row_num,timestamp,added_by
1,1,2024-03-06 12:46:40.347135,Ruzlim
3,3,2024-03-06 12:46:40.347135,Ruzlim


In [42]:
# where
sdf.where(cond = sdf['row_num'] >= 3, other = 0)

Unnamed: 0,row_num,timestamp,added_by
0,0,0,0
1,0,0,0
2,0,0,0
3,3,2024-03-06 12:46:40.347135,Ruzlim
4,4,2024-03-06 12:46:40.347135,Ruzlim


## Filling missing data

In [43]:
sdf_ = sdf.where(cond = sdf['row_num'] == 2)
sdf_

Unnamed: 0,row_num,timestamp,added_by
0,,NaT,
1,,NaT,
2,2.0,2024-03-06 12:46:40.347135,Ruzlim
3,,NaT,
4,,NaT,


In [44]:
sdf_.fillna(value=0)

Unnamed: 0,row_num,timestamp,added_by
0,0.0,0,0
1,0.0,0,0
2,2.0,2024-03-06 12:46:40.347135,Ruzlim
3,0.0,0,0
4,0.0,0,0


In [45]:
sdf_.fillna(method='ffill')

Unnamed: 0,row_num,timestamp,added_by
0,,NaT,
1,,NaT,
2,2.0,2024-03-06 12:46:40.347135,Ruzlim
3,2.0,2024-03-06 12:46:40.347135,Ruzlim
4,2.0,2024-03-06 12:46:40.347135,Ruzlim


In [46]:
sdf_.fillna(method='bfill')

Unnamed: 0,row_num,timestamp,added_by
0,2.0,2024-03-06 12:46:40.347135,Ruzlim
1,2.0,2024-03-06 12:46:40.347135,Ruzlim
2,2.0,2024-03-06 12:46:40.347135,Ruzlim
3,,NaT,
4,,NaT,


In [47]:
sdf_.dropna()

Unnamed: 0,row_num,timestamp,added_by
2,2.0,2024-03-06 12:46:40.347135,Ruzlim


## Iterating

In [48]:
for col_label, col in sdf.iteritems():
    print(col_label, col, sep='\n', end='\n\n')

row_num
0    0
1    1
2    2
3    3
4    4
Name: row_num, dtype: int64

timestamp
0   2024-03-06 12:46:40.347135
1   2024-03-06 12:46:40.347135
2   2024-03-06 12:46:40.347135
3   2024-03-06 12:46:40.347135
4   2024-03-06 12:46:40.347135
Name: timestamp, dtype: datetime64[ns]

added_by
0    Ruzlim
1    Ruzlim
2    Ruzlim
3    Ruzlim
4    Ruzlim
Name: added_by, dtype: object



  for col_label, col in sdf.iteritems():


In [49]:
for row in sdf.loc[:, ['row_num', 'added_by']].itertuples():
    print(row)

Pandas(Index=0, row_num=0, added_by='Ruzlim')
Pandas(Index=1, row_num=1, added_by='Ruzlim')
Pandas(Index=2, row_num=2, added_by='Ruzlim')
Pandas(Index=3, row_num=3, added_by='Ruzlim')
Pandas(Index=4, row_num=4, added_by='Ruzlim')


## Concat Dataframe

In [50]:
rdf1 = pd.DataFrame(np.random.rand(3, 2))
rdf1

Unnamed: 0,0,1
0,0.403088,0.09304
1,0.157693,0.205878
2,0.021072,0.273463


In [51]:
rdf2 = pd.DataFrame(np.random.rand(3, 2))
rdf2

Unnamed: 0,0,1
0,0.631264,0.980012
1,0.710027,0.117282
2,0.370423,0.012785


In [52]:
rdf_concat = pd.concat([rdf1, rdf2])
rdf_concat

Unnamed: 0,0,1
0,0.403088,0.09304
1,0.157693,0.205878
2,0.021072,0.273463
0,0.631264,0.980012
1,0.710027,0.117282
2,0.370423,0.012785


## Merge Dataframe

In [2]:
df_a = pd.DataFrame({'key': ['one', 'two'], 'a_val': [1, 2]})
df_a

Unnamed: 0,key,a_val
0,one,1
1,two,2


In [3]:
df_b = pd.DataFrame({'key': ['one', 'two', 'tree'], 'b_val': [3, 4, 5]})
df_b

Unnamed: 0,key,b_val
0,one,3
1,two,4
2,tree,5


In [4]:
pd.merge(df_a, df_b, on='key')

Unnamed: 0,key,a_val,b_val
0,one,1,3
1,two,2,4


In [5]:
pd.merge(df_b, df_a, on='key')

Unnamed: 0,key,b_val,a_val
0,one,3,1
1,two,4,2


## Group by

In [3]:
df_grp = pd.DataFrame({'A': ['one', 'one', 'two', 'tree', 'tree'],'B': range(5),'C': [6, 7, 8, 9, 10]})
df_grp

Unnamed: 0,A,B,C
0,one,0,6
1,one,1,7
2,two,2,8
3,tree,3,9
4,tree,4,10


In [7]:
df_pv = df_grp.groupby('A').agg({'B' : ['sum'], 'C': ['min', 'max']})
df_pv

Unnamed: 0_level_0,B,C,C
Unnamed: 0_level_1,sum,min,max
A,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
one,1,6,7
tree,7,9,10
two,2,8,8


In [26]:
cols_dict = {'sum':'foo', 'min':'bar', 'max':'bar2'}
frame1 = df_grp.groupby('A').agg({'B' : ['sum'], 'C': ['sum', 'min', 'max']}).rename(columns=cols_dict)
# frame1.columns.get_level_values(1)
# frame1.columns = frame1.columns.droplevel(0)
# frame = frame.reset_index()
frame1

Unnamed: 0_level_0,B,C,C,C
Unnamed: 0_level_1,foo,foo,bar,bar2
A,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
one,1,13,6,7
tree,7,19,9,10
two,2,8,8,8


In [36]:
cols_dict = {'sum':'foo', 'min':'bar', 'max':'bar2'}
frame = df_grp.groupby('A').agg({'B' : ['sum'], 'C': ['sum', 'min', 'max']}).rename(columns=cols_dict)
frame.columns = frame.columns.map('_'.join)
# frame.columns = frame.columns.map('_'.join).to_series().map(d)
# frame = frame.reset_index().rename(columns=cols_dict)
frame

Unnamed: 0_level_0,B_foo,C_foo,C_bar,C_bar2
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
one,1,13,6,7
tree,7,19,9,10
two,2,8,8,8


In [35]:
frame.columns[1:]

MultiIndex([('C',  'foo'),
            ('C',  'bar'),
            ('C', 'bar2')],
           )