# Work with Pandas

##### Reference: https://realpython.com/pandas-dataframe/

In [2]:
import pandas as pd
import numpy as np
import datetime

## Sample Dataframe

In [6]:
# Round every value to 2 decimal places.
# Alternative sample frame:
# df_m = pd.DataFrame(data={'col1': [1.55, 2.55], 'col2': [1.55, 2.55]})
sample_values = np.array([[1.454, 2.555, 3.656],
                          [4.454, 5.555, 6.656]])
df_m = pd.DataFrame(sample_values, columns=['A', 'B', 'C'])
df_m.round(decimals=2)

Unnamed: 0,A,B,C
0,1.45,2.56,3.66
1,4.45,5.56,6.66


In [None]:
# Round up: take the ceiling of every value.
df_m = pd.DataFrame(
    np.array([[1.454, 2.555, 3.656], [4.454, 5.555, 6.656]]),
    columns=['A', 'B', 'C'],
)
np.ceil(df_m)

In [None]:
# Round down: take the floor of every value.
floor_input = np.array([[1.454, 2.555, 3.656], [4.454, 5.555, 6.656]])
df_m = pd.DataFrame(floor_input, columns=['A', 'B', 'C'])
np.floor(df_m)

## Dataframe with random values

In [None]:
# Pick a random row count in [0, 1000); np.random.randint excludes the upper bound.
rand_n = np.random.randint(0, 1_000)

# One record per row: a sequential name plus a random whole-dollar amount
# (1,000-9,999) rendered as a currency string.
rand_data = [
    {
        'Name': f'Name-{i}',
        'Salary': f'${np.random.randint(1_000, 10_000):,.2f}',
    }
    for i in range(rand_n)
]

rand_n
# rand_data

In [None]:
# Build a DataFrame from the list of record dicts; the dict keys become the columns.
rand_df = pd.DataFrame(rand_data)
rand_df

## Demo

In [3]:
# Number of demo rows to generate.
n = 5

# list(range(n)) materializes the range directly; no comprehension is needed.
data = list(range(n))
data

[0, 1, 2, 3, 4]

In [4]:
# One record per row number; each record gets its own datetime.now() call,
# so timestamps can differ slightly between rows.
data = [
    {
        'row_num': row_idx,
        'timestamp': datetime.datetime.now(),
        'added_by': 'Ruzlim',
    }
    for row_idx in range(n)
]
data

[{'row_num': 0,
  'timestamp': datetime.datetime(2023, 6, 8, 19, 54, 48, 487699),
  'added_by': 'Ruzlim'},
 {'row_num': 1,
  'timestamp': datetime.datetime(2023, 6, 8, 19, 54, 48, 487702),
  'added_by': 'Ruzlim'},
 {'row_num': 2,
  'timestamp': datetime.datetime(2023, 6, 8, 19, 54, 48, 487702),
  'added_by': 'Ruzlim'},
 {'row_num': 3,
  'timestamp': datetime.datetime(2023, 6, 8, 19, 54, 48, 487702),
  'added_by': 'Ruzlim'},
 {'row_num': 4,
  'timestamp': datetime.datetime(2023, 6, 8, 19, 54, 48, 487703),
  'added_by': 'Ruzlim'}]

## Preview Data

In [5]:
# Build the demo DataFrame from the list of row dicts; the keys become column names.
sdf = pd.DataFrame(data)
sdf

Unnamed: 0,row_num,timestamp,added_by
0,0,2023-06-08 19:54:48.487699,Ruzlim
1,1,2023-06-08 19:54:48.487702,Ruzlim
2,2,2023-06-08 19:54:48.487702,Ruzlim
3,3,2023-06-08 19:54:48.487702,Ruzlim
4,4,2023-06-08 19:54:48.487703,Ruzlim


In [None]:
# Dimensions as a (rows, columns) tuple.
sdf.shape

In [None]:
# Column labels.
sdf.columns

In [None]:
# Row labels (the index).
sdf.index

In [None]:
# Data type of each column.
sdf.dtypes

In [None]:
# Descriptive summary statistics (count, mean, min, max, ...).
sdf.describe()

In [None]:
# Number of dimensions (2 for a DataFrame).
sdf.ndim

In [None]:
# Total number of elements (rows * columns).
sdf.size

In [None]:
# Memory used by the index and by each column, in bytes.
sdf.memory_usage()

In [6]:
# Count missing values in each column. isnull() gives a boolean mask and
# sum() counts the True entries per column, all vectorised (no Python-level
# loop over each column as with apply + built-in sum).
sdf.isnull().sum()

row_num      0
timestamp    0
added_by     0
dtype: int64

In [7]:
# Element-wise mask: True where a value is missing.
sdf.isnull()

Unnamed: 0,row_num,timestamp,added_by
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False


In [8]:
# Top-level function form of the element-wise missing-value mask.
pd.isna(sdf)

Unnamed: 0,row_num,timestamp,added_by
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False


In [9]:
# Keep rows whose added_by is not missing; reset_index() moves the old
# index into a new 'index' column and renumbers the rows.
sdf[sdf['added_by'].notna()].reset_index()

Unnamed: 0,index,row_num,timestamp,added_by
0,0,0,2023-06-08 19:54:48.487699,Ruzlim
1,1,1,2023-06-08 19:54:48.487702,Ruzlim
2,2,2,2023-06-08 19:54:48.487702,Ruzlim
3,3,3,2023-06-08 19:54:48.487702,Ruzlim
4,4,4,2023-06-08 19:54:48.487703,Ruzlim


In [10]:
# Drop rows where added_by is missing, then renumber the rows
# (the old index is kept as an 'index' column).
sdf.dropna(subset=['added_by']).reset_index()

Unnamed: 0,index,row_num,timestamp,added_by
0,0,0,2023-06-08 19:54:48.487699,Ruzlim
1,1,1,2023-06-08 19:54:48.487702,Ruzlim
2,2,2,2023-06-08 19:54:48.487702,Ruzlim
3,3,3,2023-06-08 19:54:48.487702,Ruzlim
4,4,4,2023-06-08 19:54:48.487703,Ruzlim


In [None]:
# Drop rows whose timestamp repeats an earlier row (the first occurrence is kept).
sdf.drop_duplicates(['timestamp'])

In [None]:
# Select two columns, then keep only the first row for each distinct timestamp.
sdf[['added_by', 'timestamp']].drop_duplicates(subset=['timestamp'])

In [None]:
# Reshape: one row per timestamp, one column per row_num, cells hold added_by.
# Transposed alternative:
# sdf.pivot(index='row_num', columns='timestamp', values='added_by')
sdf.pivot(index='timestamp', columns='row_num', values='added_by')

In [None]:
# Cast row_num to 32-bit float (astype returns a new frame), then list the column dtypes.
sdf_ = sdf.astype({'row_num': np.float32})
sdf_.dtypes

In [None]:
# df_rev_day_last_mth.drop('TM_KEY_DAY', axis=1, inplace=True)
# del df_rev_day_last_mth['TM_KEY_DAY']

In [None]:
# First 3 rows (head() with no argument returns the first 5).
sdf.head(3)

In [None]:
# Last 5 rows (the default for tail()).
sdf.tail()

## loc

In [None]:
# Select the row with index label 2 (returned as a Series).
sdf.loc[2]

In [None]:
# Label-based slice: rows up to AND including label 2, two named columns.
sdf.loc[:2, ['row_num', 'timestamp']]

## iloc

In [None]:
# Position-based slice: first 2 rows and first 2 columns (end positions are excluded).
sdf.iloc[:2, 0:2]

In [None]:
# First 2 rows, columns at positions 0 and 2.
sdf.iloc[:2, [0, 2]]

## Filter

In [None]:
# Equal: == builds a boolean mask; indexing with it keeps the True rows.
sdf[sdf['row_num'] == 3]

In [None]:
# Not equal: ~ negates the == mask (same rows as sdf['row_num'] != 3).
sdf[~(sdf['row_num'] == 3)]

In [None]:
# AND: & (each comparison needs its own parentheses because & binds tighter).
sdf[(sdf['row_num'] >= 2) & (sdf['row_num'] < 4)]

In [None]:
# OR: | keeps rows where at least one condition is True.
sdf[(sdf['row_num'] == 1) | (sdf['row_num'] == 3)]

In [None]:
# XOR: ^ keeps rows where exactly one condition is True. row_num cannot be
# both 1 and 3 at once, so here the result matches the OR filter.
sdf[(sdf['row_num'] == 1) ^ (sdf['row_num'] == 3)]

In [None]:
# where: keep rows where the condition holds; every value in the other rows becomes 0.
sdf.where(sdf['row_num'] >= 3, other=0)

## Filling missing data

In [15]:
# Blank out (NaN/NaT) every row except the one where row_num == 2,
# producing a frame with missing data for the examples that follow.
sdf_ = sdf.where(sdf['row_num'] == 2)
sdf_

Unnamed: 0,row_num,timestamp,added_by
0,,NaT,
1,,NaT,
2,2.0,2023-06-06 21:42:10.359968,Ruzlim
3,,NaT,
4,,NaT,


In [16]:
# Replace every missing value with 0 (returns a new frame; sdf_ is unchanged).
sdf_.fillna(value=0)

Unnamed: 0,row_num,timestamp,added_by
0,0.0,0,0
1,0.0,0,0
2,2.0,2023-06-06 21:42:10.359968,Ruzlim
3,0.0,0,0
4,0.0,0,0


In [17]:
# Forward-fill: propagate the last valid value downward. Rows before the first
# valid value stay missing. DataFrame.ffill() replaces fillna(method='ffill'),
# whose `method` argument is deprecated in pandas 2.1+.
sdf_.ffill()

Unnamed: 0,row_num,timestamp,added_by
0,,NaT,
1,,NaT,
2,2.0,2023-06-06 21:42:10.359968,Ruzlim
3,2.0,2023-06-06 21:42:10.359968,Ruzlim
4,2.0,2023-06-06 21:42:10.359968,Ruzlim


In [18]:
# Backward-fill: propagate the next valid value upward. Rows after the last
# valid value stay missing. DataFrame.bfill() replaces fillna(method='bfill'),
# whose `method` argument is deprecated in pandas 2.1+.
sdf_.bfill()

Unnamed: 0,row_num,timestamp,added_by
0,2.0,2023-06-06 21:42:10.359968,Ruzlim
1,2.0,2023-06-06 21:42:10.359968,Ruzlim
2,2.0,2023-06-06 21:42:10.359968,Ruzlim
3,,NaT,
4,,NaT,


In [19]:
# Drop every row that contains at least one missing value.
sdf_.dropna()

Unnamed: 0,row_num,timestamp,added_by
2,2.0,2023-06-06 21:42:10.359968,Ruzlim


## Iterating

In [None]:
# Iterate column by column; each step yields (column label, column Series).
# DataFrame.items() replaces iteritems(), which was removed in pandas 2.0.
for col_label, col in sdf.items():
    print(col_label, col, sep='\n', end='\n\n')

In [None]:
# Iterate row by row over two columns; each row is a namedtuple whose
# first field is the index label.
for row in sdf[['row_num', 'added_by']].itertuples():
    print(row)

## Concat Dataframe

In [None]:
# 3x2 frame of uniform random floats in [0, 1), with default integer labels.
rdf1 = pd.DataFrame(np.random.rand(3, 2))
rdf1

In [None]:
# Second 3x2 random frame with the same shape and labels as the first.
rdf2 = pd.DataFrame(np.random.rand(3, 2))
rdf2

In [None]:
# Stack rdf2 below rdf1 (default axis=0). The original row labels are kept,
# so labels 0-2 appear twice; pass ignore_index=True for a fresh 0..n-1 index.
rdf_concat = pd.concat([rdf1, rdf2])
rdf_concat

## Merge Dataframe

In [None]:
# Sample frame with two keys and an a_val column.
df_a = pd.DataFrame({'key': ['one', 'two'], 'a_val': [1, 2]})
df_a

In [None]:
# Sample frame with three keys and a b_val column; the third key has no
# counterpart in a two-key frame, which makes join behaviour visible.
# (Key spelling fixed: 'tree' -> 'three'.)
df_b = pd.DataFrame({'key': ['one', 'two', 'three'], 'b_val': [3, 4, 5]})
df_b

In [None]:
# Inner join on 'key' (pd.merge's default): only keys present in both frames
# survive, so the key that exists only in df_b is dropped.
pd.merge(df_a, df_b, on='key')

In [None]:
# Same inner join with the operands swapped: same rows, but df_b's columns come first.
pd.merge(df_b, df_a, on='key')