### Pandas
Software library written for the Python programming language for data manipulation and analysis. 

In [89]:
# import libraries
import numpy as np
import pandas as pd

In [90]:
# reading a csv file
csv_file_df = pd.read_csv("./sample.csv")
print(type(csv_file_df))
# print first n rows to see the datset structure
print(csv_file_df.head(n=5))

<class 'pandas.core.frame.DataFrame'>
         x1        x2
0 -0.715366  3.193351
1  0.505650  3.872541
2  2.171957  5.251319
3  0.354529  3.679159
4 -0.398294  3.484915


### Important attributes and member functions of a dataframe

In [91]:
# number of rows and columns
print(f'shape of the dataframe: {csv_file_df.shape}') 

shape of the dataframe: (5000, 2)


In [92]:
# all info about the dataframe
print(csv_file_df.info()) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   x1      5000 non-null   float64
 1   x2      5000 non-null   float64
dtypes: float64(2)
memory usage: 78.2 KB
None


In [93]:
# get a specific column with a given key (first 5)
print(csv_file_df['x2'].head())
print("Both usages are the same") if csv_file_df['x2'] is csv_file_df.x2 else print("Both usages are not the same") 

0    3.193351
1    3.872541
2    5.251319
3    3.679159
4    3.484915
Name: x2, dtype: float64
Both usages are the same


In [94]:
# to select multiple column names, pass the column names as a list (first 5)
print(csv_file_df[['x2', 'x1']].head())

         x2        x1
0  3.193351 -0.715366
1  3.872541  0.505650
2  5.251319  2.171957
3  3.679159  0.354529
4  3.484915 -0.398294


### Slicing and Indexing (iloc and loc)

In [95]:
# slicing using integer based locations of rows and columns
# to select rows from 5-10 (inclusive), and the second column (x2)
req_rows_and_columns = csv_file_df.iloc[5:10, 1]
print(req_rows_and_columns)

5    1.918944
6    4.113065
7    3.566363
8    3.366401
9    2.321018
Name: x2, dtype: float64


In [96]:
# to select rows from 5, 6, 10 (inclusive), and the first column (x2)
req_rows_and_columns = csv_file_df.iloc[[5, 6, 10], 0]
print(req_rows_and_columns)

5     0.000725
6     2.137332
10    0.222858
Name: x1, dtype: float64


In [97]:
# The same can be achieved using iloc, which uses labels of row and column to select
# here row labels are automatically generated  numbers from zero to number_of_rows-1
# to select rows from 5-10 (inclusive), and the second column (x2)
req_rows_and_columns = csv_file_df.loc[5:10, 'x2']
print(req_rows_and_columns)

5     1.918944
6     4.113065
7     3.566363
8     3.366401
9     2.321018
10    3.218724
Name: x2, dtype: float64


In [98]:
# to select rows from 5, 6, 10 (inclusive), and the first column (x2)
req_rows_and_columns = csv_file_df.loc[[5, 6, 10], 'x1':'x2']
print(req_rows_and_columns)

          x1        x2
5   0.000725  1.918944
6   2.137332  4.113065
10  0.222858  3.218724


In [99]:
# the row labels here are the auto genertad indice, you can manually set any column as the row labels
modified_row_label_df = pd.read_csv("./sample.csv", index_col='x1')
print(modified_row_label_df.head(2))

                 x2
x1                 
-0.715366  3.193351
 0.505650  3.872541


In [100]:
modified_row_label_df = csv_file_df.set_index('x1')
print(modified_row_label_df.head(2))

                 x2
x1                 
-0.715366  3.193351
 0.505650  3.872541


### Filtering

In [101]:
# filtering all rows where x <= 0.1 and x2 > 2.0
filtered = csv_file_df[(csv_file_df['x1'] <= 0.1) & (csv_file_df['x2'] > 2.0)]
print(f'filtered {filtered.shape[0]} out of {csv_file_df.shape[0]}')

filtered 817 out of 5000


In [102]:
# alternate approach
filtered = csv_file_df.loc[(csv_file_df['x1'] <= 0.1) & (csv_file_df['x2'] > 2.0)]
print(f'filtered {filtered.shape[0]} out of {csv_file_df.shape[0]}')

filtered 817 out of 5000


In [104]:
# drop na
csv_file_df.dropna(axis='index', how='any', subset=['x2']).head(2) # removes rows with na values for x2 column, have parameters like axis and how

Unnamed: 0,x1,x2
0,-0.715366,3.193351
1,0.50565,3.872541


### Convert to numpy

In [88]:
csv_numpy = csv_file_df.loc[1:5, 'x1'].to_numpy()
print(csv_numpy, type(csv_numpy))

[ 5.05650167e-01  2.17195684e+00  3.54528895e-01 -3.98294242e-01
  7.24929000e-04] <class 'numpy.ndarray'>


### os, operating system dependent functionalities

In [105]:
import os

In [106]:
# get the current directory (can we used to get absolute path automatically)
print(os.getcwd())

/home/akshay/Downloads/mlai_ta/recitation_3


In [112]:
# to change directory (be careful with this command, better to write relative paths)
os.chdir('/home/')
print(os.getcwd())
os.chdir('/home/akshay/Downloads/mlai_ta/recitation_3')
print(os.getcwd())

/home
/home/akshay/Downloads/mlai_ta/recitation_3


In [114]:
# list all files in a folder, argument relative to absolute path
list_of_files = os.listdir('../../../')
print(list_of_files)

['ps-4.py', '.anydesk', 'DL-Based-Point-Cloud-Registration', '.jupyter', '.FBReader', 'Desktop', '.thunderbird', 'rtl8723de', '.ros', '.gnome', '.leetcode', '.kite', '.cache', '.gazebo', '.pki', '.conda', 'result_screenshot_25.08.2022.png', 'Templates', '.java', 'Strange_Team_Name_Here', '.config', '.lc', 'Videos', '.sdformat', 'Cylinder3D', '.ignition', '.platformio', '.ssr', '.vscode', '.nv', '.zoom', 'ups-stitched.jpg', 'MinkowskiEngine', '.gnupg', '.rviz', 'Music', 'MAIL', 'Taylor_Dunn_Cart2', 'Downloads', '.sudo_as_admin_successful', 'Arduino', 'opencv_build', 'aws', 'Documents', '.ssh', '.docker', 'Pictures', '.mozilla', 'snap', '.bash_history', 'images', '.ipynb_checkpoints', '.gitconfig', '.eclipse', 'cv_stitch.py', '.local', '.bashrc', '.aws', '.ipython', 'Adversarial-Point-Cloud', 'SalsaNext', '.oracle_jre_usage', 'anaconda3', '.netrc', '.continuum', '.python_history', 'ups_result.json', '.wget-hsts', 'example', 'Public', '.matlab', '.keras', '.arduino15', 'spvnas', 'verify',

### shutil, high level file operations

In [116]:
# to delete a directory
os.rmdir('../trial/') # only works for empty directories

OSError: [Errno 39] Directory not empty: '../trial/'

In [117]:
import shutil
if shutil.rmtree('../trial/'):
    print("Removed")

In [None]:
# to delete a file
if os.remove('../recitation_3/sample1.csv'):
    print("Removed file successfully")

In [119]:
# to check if a file exists
if os.path.exists("../trial/"):
    print("The file/folder exists")

The file exists


In [120]:
# to create a directory,
# 1. check if it exits, then if already exits either overwrite or exit the program according to your need
# you need to create folder named sample in this directory
req_path = './sample/'
if os.path.exists(req_path):
    shutil.rmtree(req_path)
    print("Found existing path, so deleting the folder")
os.mkdir(req_path)

In [121]:
shutil.rmtree(req_path)

### glob, The glob module finds all the pathnames matching a specified pattern 
Need for this
1. You want to load all .png files for a dl or ml pipeline
2. split dataset into training and testing

For all these you require the filenames

In [122]:
import glob

In [133]:
# search in a specific folder
# arguments should be a pathname string to match
# 1. * matches to anythin
# 2. ** searches all subdiretories possible (recursive = true)

# search for png files in Testing/glioma_tumor/
for i, filename in enumerate(glob.glob('../../archive/Testing/glioma_tumor/*.jpg')):
    continue
print(f"found these {i} image files in Testing/glioma_tumor/ folder")

found these 99 image files


In [134]:
# search for png files in Testing/ 
for i, filename in enumerate(glob.glob('../../archive/Testing/*/*.jpg')):
    continue
print(f"found these {i} image files in testing foler")

found these 393 image files


In [131]:
# search in all subdirectories of archive (we do need to know the level of the deepest folder
# ** will take care of it)
for i, filename in enumerate(glob.glob('../../archive/**/*.jpg', recursive=True)):
    # print(filename)
    continue
print(f'Found {i} image files in archive folder')

Found 3263 png files


### argparse, for passing command line arguments refer in argparse.py