# Work with Pandas

### Based on the Real Python tutorial: https://realpython.com/pandas-dataframe/

In [2]:
import pandas as pd
import numpy as np
import datetime

## Sample Dataframe

In [22]:
# Round every value to two decimal places.
df_m = pd.DataFrame(
    data=np.array([
        [1.454, 2.555, 3.656],
        [4.454, 5.555, 6.656],
    ]),
    columns=['A', 'B', 'C'],
)
df_m.round(2)

Unnamed: 0,A,B,C
0,1.45,2.56,3.66
1,4.45,5.56,6.66


In [23]:
# Round up: np.ceil is applied to each column in turn.
df_m = pd.DataFrame(
    data=np.array([
        [1.454, 2.555, 3.656],
        [4.454, 5.555, 6.656],
    ]),
    columns=['A', 'B', 'C'],
)
df_m.apply(np.ceil)

Unnamed: 0,A,B,C
0,2.0,3.0,4.0
1,5.0,6.0,7.0


In [24]:
# Round down: np.floor is applied to each column in turn.
df_m = pd.DataFrame(
    data=np.array([
        [1.454, 2.555, 3.656],
        [4.454, 5.555, 6.656],
    ]),
    columns=['A', 'B', 'C'],
)
df_m.apply(np.floor)

Unnamed: 0,A,B,C
0,1.0,2.0,3.0
1,4.0,5.0,6.0


In [3]:
n = 5  # number of sample rows used by the cells below

# Row numbers 0 .. n-1 as a plain list.
data = list(range(n))
data

[0, 1, 2, 3, 4]

In [4]:
# One record per row number. now() is evaluated once per record, so the
# timestamps of neighbouring records can differ by a few microseconds.
data = [
    dict(row_num=x, timestamp=datetime.datetime.now(), added_by='Ruzlim')
    for x in range(n)
]
data

[{'row_num': 0,
  'timestamp': datetime.datetime(2023, 5, 23, 21, 15, 31, 95226),
  'added_by': 'Ruzlim'},
 {'row_num': 1,
  'timestamp': datetime.datetime(2023, 5, 23, 21, 15, 31, 95229),
  'added_by': 'Ruzlim'},
 {'row_num': 2,
  'timestamp': datetime.datetime(2023, 5, 23, 21, 15, 31, 95229),
  'added_by': 'Ruzlim'},
 {'row_num': 3,
  'timestamp': datetime.datetime(2023, 5, 23, 21, 15, 31, 95230),
  'added_by': 'Ruzlim'},
 {'row_num': 4,
  'timestamp': datetime.datetime(2023, 5, 23, 21, 15, 31, 95230),
  'added_by': 'Ruzlim'}]

In [None]:
# Generate a batch of fake salary records.
# A seeded Generator keeps the notebook reproducible under Restart & Run All,
# and the lower bound of 1 guarantees at least one record, so a frame built
# from rand_data always has its Name and Salary columns.
rng = np.random.default_rng(42)

rand_n = int(rng.integers(1, 1_000))  # number of records, 1..999

rand_data = [
    {
        'Name': f'Name-{x}',
        # Whole-dollar amount in [1_000, 10_000), rendered like "$1,234.00".
        'Salary': f'${dollars:,.2f}',
    }
    # .tolist() converts the drawn values to plain Python ints before formatting.
    for x, dollars in enumerate(rng.integers(1_000, 10_000, size=rand_n).tolist())
]

rand_n

In [5]:
# Build the sample frame from the list of record dicts: one row per dict,
# one column per dict key.
sdf = pd.DataFrame(data)
sdf

Unnamed: 0,row_num,timestamp,added_by
0,0,2023-05-23 21:15:31.095226,Ruzlim
1,1,2023-05-23 21:15:31.095229,Ruzlim
2,2,2023-05-23 21:15:31.095229,Ruzlim
3,3,2023-05-23 21:15:31.095230,Ruzlim
4,4,2023-05-23 21:15:31.095230,Ruzlim


In [None]:
# Turn the random salary records into a frame, one row per generated record.
rand_df = pd.DataFrame(rand_data)
rand_df

In [6]:
# Row labels; a default RangeIndex here because no index was supplied.
sdf.index

RangeIndex(start=0, stop=5, step=1)

In [7]:
# Column labels, in the order of the record dict keys.
sdf.columns

Index(['row_num', 'timestamp', 'added_by'], dtype='object')

In [None]:
# Distinct values found in the added_by column.
sdf['added_by'].unique()

In [8]:
# Data type of each column.
sdf.dtypes

row_num               int64
timestamp    datetime64[ns]
added_by             object
dtype: object

In [10]:
# Reshape long -> wide: one row per distinct timestamp, one column per
# row_num, cells filled with added_by (missing where no such pair exists).
# Alternative orientation (row_num as the index, timestamps as the columns):
# sdf.pivot(index='row_num', columns='timestamp', values='added_by')
sdf.pivot(index='timestamp', columns='row_num', values='added_by')

row_num,0,1,2,3,4
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-05-23 21:15:31.095226,Ruzlim,,,,
2023-05-23 21:15:31.095229,,Ruzlim,Ruzlim,,
2023-05-23 21:15:31.095230,,,,Ruzlim,Ruzlim


In [None]:
# Element-wise boolean mask: True where a value is missing.
sdf.isnull()

In [None]:
# Top-level function form of the same missing-value check.
pd.isna(sdf)

In [None]:
# Cast row_num to float32 in a new frame; sdf itself is not modified.
sdf_ = sdf.astype(dtype={'row_num': np.float32})
# Show the dtypes after the cast.
sdf_.dtypes

In [None]:
# Number of axes (2 for a DataFrame: rows and columns).
sdf.ndim

In [None]:
# (number of rows, number of columns).
sdf.shape

In [None]:
# Total number of cells: rows * columns.
sdf.size

In [None]:
# Memory footprint in bytes, reported for the index and for each column.
sdf.memory_usage()

In [None]:
# Summary statistics (count, mean, min/max, quartiles, ...) for the frame.
sdf.describe()

In [11]:
# head(k) returns the first k rows; here the first three.
sdf.head(3)

Unnamed: 0,row_num,timestamp,added_by
0,0,2023-05-23 21:15:31.095226,Ruzlim
1,1,2023-05-23 21:15:31.095229,Ruzlim
2,2,2023-05-23 21:15:31.095229,Ruzlim


In [None]:
# Last rows of the frame (five by default).
sdf.tail()

## loc

In [12]:
# Label-based lookup: the row whose index label is 2, returned as a Series.
sdf.loc[2]

row_num                               2
timestamp    2023-05-23 21:15:31.095229
added_by                         Ruzlim
Name: 2, dtype: object

In [13]:
# Label slices include the end label: rows 0 through 2, restricted to two
# columns selected by name.
sdf.loc[:2, ['row_num', 'timestamp']]

Unnamed: 0,row_num,timestamp
0,0,2023-05-23 21:15:31.095226
1,1,2023-05-23 21:15:31.095229
2,2,2023-05-23 21:15:31.095229


## iloc

In [14]:
# Position-based slices exclude the end position: first two rows, first two
# columns.
sdf.iloc[:2, 0:2]

Unnamed: 0,row_num,timestamp
0,0,2023-05-23 21:15:31.095226
1,1,2023-05-23 21:15:31.095229


In [15]:
# Positions may also be given as a list: first two rows, columns at
# positions 0 and 2.
sdf.iloc[:2, [0, 2]]

Unnamed: 0,row_num,added_by
0,0,Ruzlim
1,1,Ruzlim


## Filter

In [None]:
# Equal (==): the comparison yields a boolean mask; indexing with it keeps
# only the rows where the mask is True.
sdf[sdf['row_num'] == 3]

In [None]:
# NOT (~): ~ inverts a boolean mask, so this keeps every row where row_num
# is not 3 (same rows as sdf['row_num'] != 3).
sdf[~(sdf['row_num'] == 3)]

In [None]:
# AND (&): both conditions must hold. Each comparison needs its own
# parentheses because & binds more tightly than >= and <.
sdf[(sdf['row_num'] >= 2) & (sdf['row_num'] < 4)]

In [None]:
# OR (|): keep rows where at least one condition holds.
sdf[(sdf['row_num'] == 1) | (sdf['row_num'] == 3)]

In [None]:
# XOR (^): keep rows where exactly one of the two conditions holds.
# row_num cannot be 1 and 3 at once, so here the result matches the OR filter.
sdf[(sdf['row_num'] == 1) ^ (sdf['row_num'] == 3)]

In [None]:
# where: rows satisfying the condition are kept as they are; every cell in
# the remaining rows is overwritten with 0.
sdf.where(
    cond=sdf['row_num'] >= 3,
    other=0,
)

## Filling missing data

In [None]:
# Blank out everything except the row where row_num == 2. With no `other`
# given, the masked cells become missing values, which gives the cells below
# a frame to fill and drop.
sdf_ = sdf.where(cond=sdf['row_num'] == 2)
sdf_

In [None]:
# Replace every missing value with 0 (returns a new frame).
sdf_.fillna(value=0)

In [None]:
# Forward fill: carry the last valid value down into the missing cells below it.
# fillna(method='ffill') is deprecated since pandas 2.1; ffill() is the
# direct replacement and returns the same result.
sdf_.ffill()

In [None]:
# Backward fill: pull the next valid value up into the missing cells above it.
# fillna(method='bfill') is deprecated since pandas 2.1; bfill() is the
# direct replacement and returns the same result.
sdf_.bfill()

In [None]:
# Drop every row that contains at least one missing value.
sdf_.dropna()

## Iterating

In [None]:
# Iterate column by column: each step yields (column label, column Series).
# DataFrame.iteritems() was removed in pandas 2.0; items() is the supported
# spelling and yields the same pairs.
for col_label, col in sdf.items():
    print(col_label, col, sep='\n', end='\n\n')

In [None]:
# Iterate row by row over two selected columns; each row arrives as a
# namedtuple carrying the index label followed by the column values.
for row in sdf.loc[:, ['row_num', 'added_by']].itertuples():
    print(row)

## Concat Dataframe

In [None]:
# 3x2 frame of uniform random floats in [0, 1). No seed is set, so the
# values change on every run.
rdf1 = pd.DataFrame(np.random.rand(3, 2))
rdf1

In [None]:
# Second 3x2 random frame with the same default row and column labels as rdf1.
rdf2 = pd.DataFrame(np.random.rand(3, 2))
rdf2

In [None]:
# Stack the two frames vertically. Each frame keeps its own row labels, so
# the result's index repeats 0, 1, 2 (pass ignore_index=True to renumber).
rdf_concat = pd.concat([rdf1, rdf2])
rdf_concat

## Merge Dataframe

In [None]:
# Left-hand table for the merge examples: two keys, each with an a_val.
df_a = pd.DataFrame({'key': ['one', 'two'], 'a_val': [1, 2]})
df_a

In [None]:
# Right-hand table for the merge examples: three keys, each with a b_val.
# The third key has no counterpart in df_a, so an inner merge on 'key'
# leaves it out.
df_b = pd.DataFrame({'key': ['one', 'two', 'three'], 'b_val': [3, 4, 5]})
df_b

In [None]:
# Inner join on 'key' (the default for merge): only keys present in both
# frames are kept; df_a's columns come first.
pd.merge(df_a, df_b, on='key')

In [None]:
# Same inner join with the operands swapped: the matched rows are the same,
# but df_b's columns now come first.
pd.merge(df_b, df_a, on='key')