In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

## Create empty, check for empty, size

In [3]:
# Strings
print("\n--- Strings ---")
string = ""
# or
string = ''
print("len(string) == 0 --> ", len(string) == 0)

# List
print("\n--- List ---")
lst = []
lst = list()
print("len(lst) == 0 --> ", len(lst) == 0)

# Tuple
print("\n--- Tuple ---")
tpl = ()
tpl = tuple()
print("len(tpl) == 0 --> ", len(tpl) == 0)

# Set
print("\n--- Set ---")
st = set()
print("len(st) == 0 --> ", len(st) == 0)

# Dictionary
print("\n--- Dictionary ---")
dct = {}
dct = dict()
print("len(dct) == 0 --> ", len(dct) == 0)

# Series
print("\n--- Series ---")
ser = pd.Series()
print("ser.empty --> ", ser.empty)
print("len(ser) --> ", len(ser))
print("ser.size --> ", ser.size)

# Data Frame (df)
print("\n--- Data Frame (df) ---")
df = pd.DataFrame()
print("df.empty --> ", df.empty)
print("df.shape --> ", df.shape)
print("df.size --> ", df.size)
print("len(df) --> ", len(df))
print("df.count() --> ", df.count())

# Numpy
print("\n--- Numpy ---")
np_arr = np.array([]) # <-- Note that you need to provide the square brackets [] inside the array() function
print("len(np_arr) == 0 --> ", len(np_arr) == 0)


--- Strings ---
len(string) == 0 -->  True

--- List ---
len(lst) == 0 -->  True

--- Tuple ---
len(tpl) == 0 -->  True

--- Set ---
len(st) == 0 -->  True

--- Dictionary ---
len(dct) == 0 -->  True

--- Series ---
ser.empty -->  True
len(ser) -->  0
ser.size -->  0

--- Data Frame (df) ---
df.empty -->  True
df.shape -->  (0, 0)
df.size -->  0
len(df) -->  0
df.count() -->  Series([], dtype: int64)

--- Numpy ---
len(np_arr) == 0 -->  True


## Initialization

In [4]:
# Strings
print("\n--- Strings ---")
string = "A"
# or you can use f-strings
name = 'Kanchi'
string = f'Hello {name}'
# Multiline strings
mutli = '''mutli
line
strings
'''
escape_sequence1 = 'She\'s Indian'
escape_sequence2 = "She\"s Indian"
without_escape = "She's Indian" # <-- When you use double quotes to wrap the entire string, you don't need to escape the inner single quotes. And vice versa when using single quotes to wrap the outer string
print(string, mutli, escape_sequence1, escape_sequence2, without_escape)

# List
print("\n--- List ---")
lst = [1, 'c', 3.1, True]
print(lst)

# Tuple
print("\n--- Tuple ---")
tpl = (1, 'c', 3.1, True)
print(tpl)

# Set
print("\n--- Set ---")
st = {1, 'c', 3.1, True}
print(st)

# Dictionary
print("\n--- Dictionary ---")
dct = {'a': 1, 'b': 2}
print(dct)
dct = {'a': [1,2], 'b': [2,3]}
print(dct)

# Series
print("\n--- Series ---")
ser = pd.Series([1, 'c', 3.1, True], index=[0,1,2,3], name='A')
print(ser)

# Data Frame (df)
print("\n--- Data Frame (df) ---")
df = pd.DataFrame({'C1': [1,2,3], 'C2': ['x', 'y', 'z']}, index=['a', 'b', 'c'])
print(df)

# Numpy
print("\n--- Numpy ---")
np_arr = np.array([]) # <-- Note that you need to provide the square brackets [] inside the array() function
print(np_arr)


--- Strings ---
Hello Kanchi mutli
line
strings
 She's Indian She"s Indian She's Indian

--- List ---
[1, 'c', 3.1, True]

--- Tuple ---
(1, 'c', 3.1, True)

--- Set ---
{'c', 1, 3.1}

--- Dictionary ---
{'a': 1, 'b': 2}
{'a': [1, 2], 'b': [2, 3]}

--- Series ---
0       1
1       c
2     3.1
3    True
Name: A, dtype: object

--- Data Frame (df) ---
   C1 C2
a   1  x
b   2  y
c   3  z

--- Numpy ---
[]


# Adding elements

In [5]:
# Strings
print("\n--- Strings ---")
# Adding (concatenating) multiple strings together
s1 = "Hello"
s2 = "World"
s = s1 + s2
print(s)
# Combining a list/array of strings into a single string
s_list = ['Hello', 'World']
s = "".join(s_list) # <-- The first "" is the separator used to join the words in the list. Empty string "" means no separator, the word will be joined without any separator in between. If you want space, try this: " ".join(s_list)
print(s)

# List
print("\n--- List ---")
lst = [1, 'c', 3.1, True]
# Adding a single element in the list
lst.append('new')
# Adding multiple elements in the list
lst.extend(['newer', 'newest'])
# Adding to a specific index
lst.insert(0, "index[0] item")
print(lst)

# Tuple
# Remember tuples are immutable so you cannot change/add/delete from an existing tuple
print("\n--- Tuple ---")
tpl1 = (1, 2, 3)
tpl2 = (4, 5, 6)
print(tpl1 + tpl2) # But you can create a new tuple with elements from other tuples

# Set
print("\n--- Set ---")
st = {1, 2, 3}
# Adding single element
st.add(4)
# Adding multiple elements
st.update([5, 6, 7])
print(st)

# Dictionary
print("\n--- Dictionary ---")
dct = {'a': 1, 'b': 2}
# Adding single key-value pair
dct['c'] = 3
# Adding multiple key-value pairs
dct.update({'d': 4, 'e': 5})
print(dct)

# Series
print("\n--- Series ---")
ser = pd.Series([1, 'c', 3.1, True], index=[0,1,2,3], name='A')
ser['index'] = 4
print(ser)

# Data Frame (df)
print("\n--- Data Frame (df) ---")
df = pd.DataFrame({'C1': [1,2,3], 'C2': ['x', 'y', 'z']}, index=['a', 'b', 'c'])
df['Col'] = ['v1', 'v2', 'v3'] # This needs to have 3 values, not less not more
df['new_col'] = df['Col']
df['newer_col'] = range(len(df))
df['newest_col'] = np.random.randint(0,100,len(df))
# Note: while adding a column from existing dataframe, it will only add values where indexes are matching
print(df)
print(f"len(df) --> ", len(df))

# Numpy
print("\n--- Numpy ---")
np_arr = np.array([]) # <-- Note that you need to provide the square brackets [] inside the array() function
print(np_arr)


--- Strings ---
HelloWorld
HelloWorld

--- List ---
['index[0] item', 1, 'c', 3.1, True, 'new', 'newer', 'newest']

--- Tuple ---
(1, 2, 3, 4, 5, 6)

--- Set ---
{1, 2, 3, 4, 5, 6, 7}

--- Dictionary ---
{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5}

--- Series ---
0           1
1           c
2         3.1
3        True
index       4
Name: A, dtype: object

--- Data Frame (df) ---
   C1 C2 Col new_col  newer_col  newest_col
a   1  x  v1      v1          0          63
b   2  y  v2      v2          1          89
c   3  z  v3      v3          2          38
len(df) -->  3

--- Numpy ---
[]


## Removing elements

In [6]:
# Strings
print("\n--- Strings ---")
# Replacing a character in a string
s = "Hello"
s1 = s.removesuffix("lo")
s2 = s.removeprefix("He")
print(s1)
print(s2)

# List
print("\n--- List ---")
lst = [1, 'c', 3.1, True]
# Removing by value
lst.remove(1)
# Remove by index
popped_item = lst.pop(2)
print(f'popped item --> ', popped_item)
del lst[0]
print(lst)

# Tuple
# Remember tuples are immutable so you cannot change/add/delete from an existing tuple
print("\n--- Tuple ---")
tpl1 = (1, 2, 3, 4)
# You can create a new tuple from a existing tuple by filtering some values
tpl2 = tuple(x for x in tpl1 if x % 2 == 0)
print(tpl2)

# Set
print("\n--- Set ---")
st = {1, 2, 3}
# Removing an element using remove(), throws an error if element is not found.
st.remove(3)
# Removing an element using discard(), doesn't throw an error if element is not found.
st.discard(3)
print(st)

# Dictionary
print("\n--- Dictionary ---")
dct = {'a': 1, 'b': 2}
# Removing from dict using del, throws an error if element is not found
del dct['a']
# Removing from dict using pop. A default value can be provided if the element is not found
popped_item = dct.pop('a', "Not found!")
print("popped_item --> ", popped_item)
print(dct)

# Series
print("\n--- Series ---")
ser = pd.Series([1, 'c', 3.1, True], index=[0,1,2,3], name='A')
# Drop by providing the index of the element to be dropped
ser = ser.drop(3) # Note that drop function does not drop in place. The returned value is the new series with the given index dropped from the original series. So make sure to assign it to a new value.
print(ser)

# Data Frame (df)
print("\n--- Data Frame (df) ---")
df = pd.DataFrame({'C1': [1,2,3], 'C2': ['x', 'y', 'z']}, index=['a', 'b', 'c'])
# Remove a column (e.g., 'C1')
df_no_col = df.drop('C1', axis=1)
print("DataFrame after dropping column 'C1':")
print(df_no_col)
# Remove a row (e.g., index 'b')
df_no_row = df.drop('b', axis=0)
print("\nDataFrame after dropping row 'b':")
print(df_no_row)
print(f"len(df) --> ", len(df))

# Numpy
print("\n--- Numpy ---")
np_arr = np.array([]) # <-- Note that you need to provide the square brackets [] inside the array() function
print(np_arr)


--- Strings ---
Hel
llo

--- List ---
popped item -->  True
[3.1]

--- Tuple ---
(2, 4)

--- Set ---
{1, 2}

--- Dictionary ---
popped_item -->  Not found!
{'b': 2}

--- Series ---
0      1
1      c
2    3.1
Name: A, dtype: object

--- Data Frame (df) ---
DataFrame after dropping column 'C1':
  C2
a  x
b  y
c  z

DataFrame after dropping row 'b':
   C1 C2
a   1  x
c   3  z
len(df) -->  3

--- Numpy ---
[]


## Replacing Values

In [7]:
# Strings
print("\n--- Strings ---")
# Replacing a character in a string
s = "Hello"
s1 = s.replace("o", "") # replace() function does not replace the character(s) in place. It returns the copy after making the updates specified in the function. The original string `s` will remain unchanged. 
print(s1)

# List
print("\n--- List ---")
lst = [1, 'c', 3.1, True]
lst[0] = 0
print(lst)

# Tuple
# Remember tuples are immutable so you cannot change/add/delete from an existing tuple
print("\n--- Tuple ---")
tpl1 = (1, 2, 3, 4)
# You can create a new tuple from a existing tuple by filtering some values
tpl2 = tuple(x for x in tpl1 if x % 2 == 0)
print(tpl2)

# Set
print("\n--- Set ---")
st = {1, 2, 3}
st2 = {x for x in st if x % 2 == 0}
print(st2)

# Dictionary
print("\n--- Dictionary ---")
dct = {'a': 1, 'b': 2}
dct['a'] = 3
print(dct)

# Series
print("\n--- Series ---")
ser = pd.Series([1, 'c', 3.1, True], index=[0,1,2,3], name='A')
ser.replace(1, 2, inplace=True) # Without providing the inplace=True argument, we would need to assign the return value from the function into a new variable
print(ser)

# Data Frame (df)
print("\n--- Data Frame (df) ---")
df = pd.DataFrame({'C1': [1,2,3], 'C2': ['x', 'y', 'z']}, index=['a', 'b', 'c'])
df.replace(to_replace=[1,2,3], value=[4,5,6], inplace=True)
print(df)
print(f"len(df) --> ", len(df))

# Numpy
print("\n--- Numpy ---")
np_arr = np.array([]) # <-- Note that you need to provide the square brackets [] inside the array() function
print(np_arr)


--- Strings ---
Hell

--- List ---
[0, 'c', 3.1, True]

--- Tuple ---
(2, 4)

--- Set ---
{2}

--- Dictionary ---
{'a': 3, 'b': 2}

--- Series ---
0      2
1      c
2    3.1
3      2
Name: A, dtype: object

--- Data Frame (df) ---
   C1 C2
a   4  x
b   5  y
c   6  z
len(df) -->  3

--- Numpy ---
[]


# Renaming index

In [8]:
print('--- Not possible to do this in String, List, Tuple, Set ---')

# Dictionary
print("\n--- Dictionary ---")
dct = {'a': 1, 'b': 2}
# Rename key 'b' to 'x'
dct['x'] = dct.pop('b')
print(dct)

# Series
print("\n--- Series ---")
ser = pd.Series([1, 'c', 3.1, True], index=[0,1,2,3], name='A')
ser.rename({0: 'a', 2: 'c'}, inplace=True)
print(ser)

# Data Frame (df)
print("\n--- Data Frame (df) ---")
df = pd.DataFrame({'C1': [1,2,3], 'C2': ['x', 'y', 'z']}, index=['a', 'b', 'c'])
# Rename columns
df.rename(columns={'C1': 'Col1', 'C2': 'Col2'}, inplace=True)
print(df)
print('--------')
# Rename index
df.rename(index={'a': 'i', 'b': 'j'}, inplace=True)
print(df)
print('--------')
# Set columns directly
df.columns = ['col1','col2']
print(df)
print('--------')
# Set index on the columns
df.set_index(keys=['col1','col2'], inplace=True) # Uses a column as index
print(df)
print('--------')
df = pd.DataFrame({'C1': [1,2,3], 'C2': ['x', 'y', 'z']}, index=['a', 'b', 'c'])
df.index = df['C2'] # Keeps original column. Can drop it separately
print(df)
print('--------')
df = pd.DataFrame({'C1': [1,2,3], 'C2': ['x', 'y', 'z']}, index=['a', 'b', 'c'])
df.reset_index() # replaces old with new and stores old as a column in df
print(df)
print('--------')
# Rows
df.index=['idx1','idx2','idx3']
print(df)
print('--------')
df.index=range(len(df))
print(df)
print(f"len(df) --> ", len(df))

# Numpy
print("\n--- Numpy ---")
np_arr = np.array([]) # <-- Note that you need to provide the square brackets [] inside the array() function
print(np_arr)

--- Not possible to do this in String, List, Tuple, Set ---

--- Dictionary ---
{'a': 1, 'x': 2}

--- Series ---
a       1
1       c
c     3.1
3    True
Name: A, dtype: object

--- Data Frame (df) ---
   Col1 Col2
a     1    x
b     2    y
c     3    z
--------
   Col1 Col2
i     1    x
j     2    y
c     3    z
--------
   col1 col2
i     1    x
j     2    y
c     3    z
--------
Empty DataFrame
Columns: []
Index: [(1, x), (2, y), (3, z)]
--------
    C1 C2
C2       
x    1  x
y    2  y
z    3  z
--------
   C1 C2
a   1  x
b   2  y
c   3  z
--------
      C1 C2
idx1   1  x
idx2   2  y
idx3   3  z
--------
   C1 C2
0   1  x
1   2  y
2   3  z
len(df) -->  3

--- Numpy ---
[]


## Create a copy

In [9]:
# Strings
print("\n--- Strings ---")
s = "Hello"
s1 = s[:]
print(s1)

# List
print("\n--- List ---")
lst = [1, 'c', 3.1, True]
lst_copy = lst[:]
print(lst_copy)

# Tuple
# Remember tuples are immutable so you cannot change/add/delete from an existing tuple
print("\n--- Tuple ---")
tpl1 = (1, 2, 3, 4)
# You can create a new tuple from a existing tuple by filtering some values
tpl2 = tpl1[:]
print(tpl2)

# Set
print("\n--- Set ---")
st = {1, 2, 3}
st_copy = st.copy()
print(st_copy)

# Dictionary
print("\n--- Dictionary ---")
dct = {'a': 1, 'b': 2}
dct_copy = dct.copy()
print(dct_copy)

# Series
print("\n--- Series ---")
ser = pd.Series([1, 'c', 3.1, True], index=[0,1,2,3], name='A')
ser_copy = ser.copy()
print(ser_copy)

# Data Frame (df)
print("\n--- Data Frame (df) ---")
df = pd.DataFrame({'C1': [1,2,3], 'C2': ['x', 'y', 'z']}, index=['a', 'b', 'c'])
df_copy = df.copy()
print(df_copy)
print(f"len(df) --> ", len(df))

# Numpy
print("\n--- Numpy ---")
np_arr = np.array([]) # <-- Note that you need to provide the square brackets [] inside the array() function
print(np_arr)


--- Strings ---
Hello

--- List ---
[1, 'c', 3.1, True]

--- Tuple ---
(1, 2, 3, 4)

--- Set ---
{1, 2, 3}

--- Dictionary ---
{'a': 1, 'b': 2}

--- Series ---
0       1
1       c
2     3.1
3    True
Name: A, dtype: object

--- Data Frame (df) ---
   C1 C2
a   1  x
b   2  y
c   3  z
len(df) -->  3

--- Numpy ---
[]


## Check if an element exists

In [10]:
# Strings
print("\n--- Strings ---")
s = "Hello"
count = "ell" in s
print(count)

# List
print("\n--- List ---")
lst = [1, 'c', 3.1, True]
count = 'c' in lst
print(count)

# Tuple
# Remember tuples are immutable so you cannot change/add/delete from an existing tuple
print("\n--- Tuple ---")
tpl = (1, 2, 3, 4)
count = 4 in tpl
print(count)

# Set
print("\n--- Set ---")
st = {1, 2, 3}
count = 3 in st
print(count)

# Dictionary
print("\n--- Dictionary ---")
dct = {'a': 1, 'b': 2}
count = 'a' in dct
print(count)

# Series
print("\n--- Series ---")
ser = pd.Series([1, 'c', 3.1, True], index=[0,1,2,3], name='A')
count = 'c' in ser.values
print(count)

# Data Frame (df)
print("\n--- Data Frame (df) ---")
df = pd.DataFrame({'C1': [1,2,3], 'C2': ['x', 'y', 'z']}, index=['a', 'b', 'c'])
is_present1 = (df == 5).any().any()
print(is_present1)
lst = [1, 'x']  # Example list of values you want to check in each row
# Check if all values in lst are present in each row
is_present2 = df.isin(lst).all(axis=1)
print(is_present2)
print(f"len(df) --> ", len(df))

# Numpy
print("\n--- Numpy ---")
np_arr = np.array([]) # <-- Note that you need to provide the square brackets [] inside the array() function
print(np_arr)


--- Strings ---
True

--- List ---
True

--- Tuple ---
True

--- Set ---
True

--- Dictionary ---
True

--- Series ---
True

--- Data Frame (df) ---
False
a     True
b    False
c    False
dtype: bool
len(df) -->  3

--- Numpy ---
[]


# Check index of element, column and row

In [11]:
# Strings
print("\n--- Strings ---")
s = "Hello"
count = s.find("ell")
print(count)

# List
print("\n--- List ---")
lst = [1, 'c', 3.1, True]
count = lst.index('c')
print(count)

# Tuple
# Remember tuples are immutable so you cannot change/add/delete from an existing tuple
print("\n--- Tuple ---")
tpl = (1, 2, 3, 4)
count = tpl.index(4)
print(count)

# Set
print("\n--- Set ---")
st = {1, 2, 3}
# I don't think it makes sense to check for index of an item in a set. So if you're converting it to a list it no longer belongs in this category. Keeping it here for your reference since it was on your doc.
count = list(st).index(3)
print(count)

# Dictionary
print("\n--- Dictionary ---")
dct = {'a': 1, 'b': 2}
count = next((k for k, v in dct.items() if v == 2), None)
print(count)
# Another way is to convert to list but again doesn't really make sense, check my comment about in set()
count = list(dct.keys()).index('b')
print(count)

# Series
print("\n--- Series ---")
ser = pd.Series([1, 'c', 3.1, True], index=[0,1,2,3], name='A')
count = ser[ser == 'c'].index[0]
print(count)

# Data Frame (df)
print("\n--- Data Frame (df) ---")
df = pd.DataFrame({'C1': [1,2,3], 'C2': ['x','y','z']}, index=['a','b','c'])
# Element
index = df.stack().index[df.stack() == 2][0]
print(f'index --> {index}')
# print(f'idxmin/max(): {df["C1"].idxmin/max()}')
# Returns all column index
print(f"df.columns --> { df.columns }")
print(f"df.columns.tolist() --> { df.columns.tolist() }")
# Specific column
print(f"df.columns[0] --> { df.columns[0] }")
print(f"df.columns.get_loc('C1') --> { df.columns.get_loc('C1') }") #1
print(f"df.index.get_loc('c') --> { df.index.get_loc('c') }") #1
# Returns all row index
print(f"df.index --> { df.index }")
print(f"df.index[0] --> { df.index[0] }")
print(f"df.index.tolist() --> { df.index.tolist() }")
print(f"len(df) --> ", len(df))

# Numpy
print("\n--- Numpy ---")
np_arr = np.array([]) # <-- Note that you need to provide the square brackets [] inside the array() function
print(np_arr)


--- Strings ---
1

--- List ---
1

--- Tuple ---
3

--- Set ---
2

--- Dictionary ---
b
1

--- Series ---
1

--- Data Frame (df) ---
index --> ('b', 'C1')
df.columns --> Index(['C1', 'C2'], dtype='object')
df.columns.tolist() --> ['C1', 'C2']
df.columns[0] --> C1
df.columns.get_loc('C1') --> 0
df.index.get_loc('c') --> 2
df.index --> Index(['a', 'b', 'c'], dtype='object')
df.index[0] --> a
df.index.tolist() --> ['a', 'b', 'c']
len(df) -->  3

--- Numpy ---
[]


## Counting occurrences of specific values

In [12]:
# Strings
print("\n--- Strings ---")
s = "Hello"
count = s.count("l")
print(count)

# List
print("\n--- List ---")
lst = [1, 'c', 'c', 3.1, True]
count = lst.count('c')
print(count)

# Tuple
# Remember tuples are immutable so you cannot change/add/delete from an existing tuple
print("\n--- Tuple ---")
tpl = (1, 2, 3, 4, 4)
count = tpl.index(4)
print(count)

# Set
print("\n--- Set ---")
print("N/A since list cannot contain duplicates")

# Dictionary
print("\n--- Dictionary ---")
dct = {'a': 1, 'b': 2, 'c': 2}
# Convert dict values to a list and then count occurrences of 2
count = list(dct.values()).count(2)
print(count)

# Series
print("\n--- Series ---")
ser = pd.Series([1, 'c', 'c', 3.1, True], index=[0,1,2,3,4], name='A')
# Method 1: Using value_counts
count_c = ser.value_counts().get('c', 0)
print("Occurrences of 'c':", count_c)
# Method 2: Using sum with a boolean condition
count_c = (ser == 'c').sum()
print("Occurrences of 'c':", count_c)

# Data Frame (df)
print("\n--- Data Frame (df) ---")
df = pd.DataFrame({'C1': ['1','2','3'], 'C2': ['x','y','z']}, index=['a','b','c'])
# Count occurrences of each value in each column
print("\n--- Count occurrences of each value in each column ---")
print(df.apply(pd.Series.value_counts))
# Count a specific value (e.g., 2) in column 'C1'
print("\n--- Count of value 2 in column 'C1' ---")
print(df['C1'].value_counts().get(2, 0))
print(f"len(df) --> ", len(df))

# Numpy
print("\n--- Numpy ---")
np_arr = np.array([]) # <-- Note that you need to provide the square brackets [] inside the array() function
print(np_arr)


--- Strings ---
2

--- List ---
2

--- Tuple ---
3

--- Set ---
N/A since list cannot contain duplicates

--- Dictionary ---
2

--- Series ---
Occurrences of 'c': 2
Occurrences of 'c': 2

--- Data Frame (df) ---

--- Count occurrences of each value in each column ---
    C1   C2
1  1.0  NaN
2  1.0  NaN
3  1.0  NaN
x  NaN  1.0
y  NaN  1.0
z  NaN  1.0

--- Count of value 2 in column 'C1' ---
1
len(df) -->  3

--- Numpy ---
[]


  print(df['C1'].value_counts().get(2, 0))


## Iterate over

In [13]:
# Strings
print("\n--- Strings ---")
s = "Hello"
for char in s:
  print(char)

# List
print("\n--- List ---")
lst = [1, 'c', 'c', 3.1, True]
for element in lst:
  print(element)

# Tuple
# Remember tuples are immutable so you cannot change/add/delete from an existing tuple
print("\n--- Tuple ---")
tpl = (1, 2, 3, 4, 4)
for element in tpl:
  print(element)

# Set
print("\n--- Set ---")
st = set([1, 2, 3, 4, 5])
for element in st:
  print(element)

# Dictionary
print("\n--- Dictionary ---")
dct = {'a': 1, 'b': 2, 'c': 3}
# Iterate over keys
print("Iterate over keys")
for key in dct:
  print(f'{key} -> {dct[key]}')
# Iterate over values
print("Iterate over values")
for value in dct.values():
  print(value)
# Iterate over items
print("Iterate over items")
for key, value in dct.items():
  print(f'{key} -> {value}')

# Series
print("\n--- Series ---")
ser = pd.Series([1, 'c', 'c', 3.1, True], index=[0,1,2,3,4], name='A')
# Iterates over values
print("Iterate over values")
for value in ser:
  print(value)
# Iterates over index-value
print("Iterate over index-value")
for index, value in ser.items():
  print(f'{index} -> {value}')

# Data Frame (df)
print("\n--- Data Frame (df) ---")
# Sample DataFrame
df = pd.DataFrame({
    'C1': [1, 2, 3],
    'C2': ['x', 'y', 'z'],
    'C3': [4.5, 5.5, 6.5]
}, index=['a', 'b', 'c'])

# --- Iterate over columns ---
print("\n--- Iterate over columns ---")
for column in df:
    # 'column' is the column name (e.g., 'C1', 'C2', 'C3')
    print(f"Column: {column}")
    print(df[column])  # Access the Series of each column

# --- Iterate over columns by index and Series ---
print("\n--- Iterate over columns by index and Series ---")
for index, col in df.items():
    # 'index' is the column name, 'col' is the Series of that column
    print(f"Column name: {index}")
    print("Series data:")
    print(col)

# --- Iterate over rows by index and Series ---
print("\n--- Iterate over rows by index and Series ---")
for index, row in df.iterrows():
    # 'index' is the row index (e.g., 'a', 'b', 'c')
    # 'row' is a Series representing the row data
    print(f"Row index: {index}")
    print("Row data:")
    print(row)

# --- Iterate over rows by index and value as a tuple ---
print("\n--- Iterate over rows by index and value as a tuple ---")
for row in df.itertuples():
    # Each 'row' is a named tuple where fields are column names
    print(f"Row as tuple: {row}")
    # Access elements as attributes of the named tuple
    print(f"Index: {row.Index}, C1: {row.C1}, C2: {row.C2}, C3: {row.C3}")

# --- Iterate over individual cells ---
print("\n--- Iterate over individual cells ---")
for index, row in df.iterrows():
    for column in df.columns:
        # Access each cell value by row index and column name
        cell_value = row[column]
        print(f"Row index: {index}, Column: {column}, Value: {cell_value}")

# Numpy
print("\n--- Numpy ---")
np_arr = np.array([]) # <-- Note that you need to provide the square brackets [] inside the array() function
print(np_arr)


--- Strings ---
H
e
l
l
o

--- List ---
1
c
c
3.1
True

--- Tuple ---
1
2
3
4
4

--- Set ---
1
2
3
4
5

--- Dictionary ---
Iterate over keys
a -> 1
b -> 2
c -> 3
Iterate over values
1
2
3
Iterate over items
a -> 1
b -> 2
c -> 3

--- Series ---
Iterate over values
1
c
c
3.1
True
Iterate over index-value
0 -> 1
1 -> c
2 -> c
3 -> 3.1
4 -> True

--- Data Frame (df) ---

--- Iterate over columns ---
Column: C1
a    1
b    2
c    3
Name: C1, dtype: int64
Column: C2
a    x
b    y
c    z
Name: C2, dtype: object
Column: C3
a    4.5
b    5.5
c    6.5
Name: C3, dtype: float64

--- Iterate over columns by index and Series ---
Column name: C1
Series data:
a    1
b    2
c    3
Name: C1, dtype: int64
Column name: C2
Series data:
a    x
b    y
c    z
Name: C2, dtype: object
Column name: C3
Series data:
a    4.5
b    5.5
c    6.5
Name: C3, dtype: float64

--- Iterate over rows by index and Series ---
Row index: a
Row data:
C1      1
C2      x
C3    4.5
Name: a, dtype: object
Row index: b
Row data:
C1

In [14]:
## Sorting

In [15]:
# Strings
print("\n--- Strings ---")
s = "Hello" # <-- H < e because ASCII value of H is 73 while that of e is 102, therefore the sorted version of this string is still "Hello"
print(''.join(sorted(s)))
s = "hello" # <-- The sorted version of this string is what we'd expect which is ehllo
print(''.join(sorted(s)))

# List
print("\n--- List ---")
lst = [4,5,2,3,1]
# In place sorting
lst.sort()
# No in-place sorting, but return the sorted list
lst2 = [4,5,2,3,1]
sorted_list = sorted(lst2)
print(lst2)
print(sorted_list)

# Tuple
# Remember tuples are immutable so you cannot change/add/delete from an existing tuple
print("\n--- Tuple ---")
tpl = (1, 4, 4, 3, 9, 2)
print(tuple(sorted(tpl))) # <-- Creates a new tuple with values from the original tuple in a sorted order. Does not modify the tuple in place because tuples are immutable.

# Set
print("\n--- Set ---")
st = set([1, 4, 5, 3, 2])
# Sets are not ordered collection so there is no concept of sorting in sets
# The sorted() function converts the set into a list, sorts it, and returns the sorted list
sorted_st = sorted(st)
print(sorted_list)

# Dictionary
print("\n--- Dictionary ---")
dct = {'b': 2, 'c': 3, 'a': 1}
# Dictionaries are not ordered collection so there is no concept of sorting in dictionaries
# The sorted() function converts the collection (keys/items, etc) into a list, sorts it, and returns the sorted list. The dict() function converts the list back into a dictionary
# Sort by keys
sort_by_keys = dict(sorted(dct.items()))
print(sort_by_keys)
# Sort by values
sort_by_values = dict(sorted(dct.items(), key=lambda item: item[1])) # <-- item is s collection of [key, val] where 0th index is key and 1st index is value, hence we return item[1] to be used as the comparator for sorting via the sorted() function
print(sort_by_values)

# Series
print("\n--- Series ---")
ser = pd.Series([1, 'c', 'c', 3.1, True], index=[0,1,2,3,4], name='A')
# Keep only numeric values
numeric_ser = ser[pd.to_numeric(ser, errors='coerce').notna()]
sorted_numeric_ser = numeric_ser.sort_values()
print("Sorted by values (numeric only):")
print(sorted_numeric_ser)
# Sort by index
sorted_index_ser = ser.sort_index()
print("Sorted by index:")
print(sorted_index_ser)

# Data Frame (df)
print("\n--- Data Frame (df) ---")
# Sample DataFrame
data = {
    'A': [3, 1, 4, 1],
    'B': ['x', 'y', 'z', 'w'],
    'C': [10, 20, 15, 5]
}
df = pd.DataFrame(data)
print("Original DataFrame:\n", df)
sorted_df_by_A = df.sort_values(by='A')
print("\nSorted by column A:\n", sorted_df_by_A)
df.sort_values(by=['A', 'B'], inplace=True, ascending=True)
print("\nSorted by columns A and B:\n", df)
df.sort_index(inplace=True, ascending=False)
print("\nSorted by index in descending order:\n", df)
df_reordered = df[['B', 'A']]
print("\nReordered columns (B, A):\n", df_reordered)

# Numpy
print("\n--- Numpy ---")
np_arr = np.array([]) # <-- Note that you need to provide the square brackets [] inside the array() function
print(np_arr)


--- Strings ---
Hello
ehllo

--- List ---
[4, 5, 2, 3, 1]
[1, 2, 3, 4, 5]

--- Tuple ---
(1, 2, 3, 4, 4, 9)

--- Set ---
[1, 2, 3, 4, 5]

--- Dictionary ---
{'a': 1, 'b': 2, 'c': 3}
{'a': 1, 'b': 2, 'c': 3}

--- Series ---
Sorted by values (numeric only):
0       1
4    True
3     3.1
Name: A, dtype: object
Sorted by index:
0       1
1       c
2       c
3     3.1
4    True
Name: A, dtype: object

--- Data Frame (df) ---
Original DataFrame:
    A  B   C
0  3  x  10
1  1  y  20
2  4  z  15
3  1  w   5

Sorted by column A:
    A  B   C
1  1  y  20
3  1  w   5
0  3  x  10
2  4  z  15

Sorted by columns A and B:
    A  B   C
3  1  w   5
1  1  y  20
0  3  x  10
2  4  z  15

Sorted by index in descending order:
    A  B   C
3  1  w   5
2  4  z  15
1  1  y  20
0  3  x  10

Reordered columns (B, A):
    B  A
3  w  1
2  z  4
1  y  1
0  x  3

--- Numpy ---
[]


## Filtering

In [16]:
# Strings
print("\n--- Strings ---")
s = "Hello"
print(''.join([char for char in s if char in 'aeiou']))

# List
print("\n--- List ---")
lst = [4,5,2,3,1]
print([x for x in lst if x > 2])

# Tuple
# Remember tuples are immutable so you cannot change/add/delete from an existing tuple
print("\n--- Tuple ---")
tpl = (1, 4, 4, 3, 9, 2)
print(tuple(x for x in tpl if x > 2))

# Set
print("\n--- Set ---")
st = set([1, 4, 5, 3, 2])
print({x for x in st if x > 2})

# Dictionary
print("\n--- Dictionary ---")
dct = {'b': 2, 'c': 3, 'a': 1}
print({k: v for k, v in dct.items() if v > 2})

# Series
print("\n--- Series ---")
# Sample Series
ser = pd.Series([1, 3, 2, 4, 5, 1], name='ExampleSeries')
print("Original Series:\n", ser)
# Filter Series to include only values greater than 2
filtered_ser = ser[ser > 2]
print("\nFiltered Series (values > 2):\n", filtered_ser)

# Data Frame (df)
print("\n--- Data Frame (df) ---")
# Sample DataFrame
data = {
    'A': [3, 1, 4, 1, 2, 5],
    'B': ['x', 'y', 'z', 'hello', 'world', 'hello world'],
    'C': [10, 20, 15, 5, 7, 3]
}
df = pd.DataFrame(data)
print("Original DataFrame:\n", df)
# 1. Filter rows where values in column A are greater than 2
filtered_df_1 = df[df['A'] > 2]
print("\nRows where A > 2:\n", filtered_df_1)
# 2. Filter rows where values in column A are either greater than or equal to 1.0 or less than 0.0
filtered_df_2 = df[(df['A'] >= 1.0) | (df['A'] < 0.0)]
print("\nRows where A >= 1.0 or A < 0.0:\n", filtered_df_2)
# 3. Filter rows where values in column A are not in the list [1, 2, 5, 7, 11]
filtered_df_3 = df[~df['A'].isin([1, 2, 5, 7, 11])]
print("\nRows where A is not in [1, 2, 5, 7, 11]:\n", filtered_df_3)
# 4. Filter rows where column B contains the substring 'hello'
filtered_df_4 = df[df['B'].str.contains('hello')]
print("\nRows where B contains 'hello':\n", filtered_df_4)

# Numpy
print("\n--- Numpy ---")
np_arr = np.array([]) # <-- Note that you need to provide the square brackets [] inside the array() function
print(np_arr)


--- Strings ---
eo

--- List ---
[4, 5, 3]

--- Tuple ---
(4, 4, 3, 9)

--- Set ---
{3, 4, 5}

--- Dictionary ---
{'c': 3}

--- Series ---
Original Series:
 0    1
1    3
2    2
3    4
4    5
5    1
Name: ExampleSeries, dtype: int64

Filtered Series (values > 2):
 1    3
3    4
4    5
Name: ExampleSeries, dtype: int64

--- Data Frame (df) ---
Original DataFrame:
    A            B   C
0  3            x  10
1  1            y  20
2  4            z  15
3  1        hello   5
4  2        world   7
5  5  hello world   3

Rows where A > 2:
    A            B   C
0  3            x  10
2  4            z  15
5  5  hello world   3

Rows where A >= 1.0 or A < 0.0:
    A            B   C
0  3            x  10
1  1            y  20
2  4            z  15
3  1        hello   5
4  2        world   7
5  5  hello world   3

Rows where A is not in [1, 2, 5, 7, 11]:
    A  B   C
0  3  x  10
2  4  z  15

Rows where B contains 'hello':
    A            B  C
3  1        hello  5
5  5  hello world  3

--- Num

## Selecting

In [17]:
# Strings
print("\n--- Strings ---")
s = "Hello"
print(s[0], s[2:3])

# List
print("\n--- List ---")
lst = [4,5,2,3,1]
print(lst[2], lst[3:])

# Tuple
# Remember tuples are immutable so you cannot change/add/delete from an existing tuple
print("\n--- Tuple ---")
tpl = (1, 4, 4, 3, 9, 2)
print(tpl[2], tpl[:])

# Set
print("\n--- Set ---")
st = set([1, 4, 5, 3, 2])
# Convert to list to access element by index since sets are unordered elements
print(list(st)[1])

# Dictionary
print("\n--- Dictionary ---")
dct = {'b': 2, 'c': 3, 'a': 1}
# Select using key
print(dct['b'])
# Select multiple by filtering
print({k: dct[k] for k in ['a', 'c']})

# Series
print("\n--- Series ---")
# Sample Series
ser = pd.Series([1, 3, 2, 4, 5, 1], index=['a', 'b', 'c', 'd', 'e', 'f'], name='ExampleSeries')
# 1. Select by label
# Select the value at index 'b'
selected_by_label = ser['b']
print("\nValue at label 'b':\n", selected_by_label)
# Select by multiple labels (e.g., 'a' and 'c')
selected_by_labels = ser[['a', 'c']]
print("\nValues at labels 'a' and 'c':\n", selected_by_labels)
# 2. Select by position
# Select the value at position 2 (third element)
selected_by_position = ser.iloc[2]
print("\nValue at position 2:\n", selected_by_position)

# Data Frame (df)
print("\n--- Data Frame (df) ---")
# Sample DataFrame
data = {
    'A': [1, 2, 3, 4],
    'B': ['x', 'y', 'z', 'w'],
    'C': [10, 20, 30, 40]
}
df = pd.DataFrame(data)
print("Original DataFrame:\n", df)

# 1. Select 1 column
# Select a single column 'A'
print("\nSelect single column 'A':\n", df['A'])

# 2. df.select(['A', 'B'], axis=1)
# Select columns 'A' and 'B'
# print("\nSelect columns 'A' and 'B':\n", df.select_dtypes(include=['A', 'B']))

# 3. df.select([0, 1], axis=0)
# Select rows by position (row 0 and row 1)
print("\nSelect rows 0 and 1:\n", df.iloc[[0, 1]])

# 4. df.select(lambda col: col.startswith('A'), axis=1)
# Select columns whose names start with 'A'
print("\nSelect columns starting with 'A':\n", df.filter(regex='^A', axis=1))

# 5. df[0] selects column & df[1:3] selects rows
# Select column at position 0 (first column)
print("\nSelect column at position 0:\n", df.iloc[:, 0])
# Select rows from position 1 to 2 (2 is exclusive)
print("\nSelect rows 1 to 2:\n", df.iloc[1:3])

# 6. df['A'] ~df.A Series output
# Select column 'A' using both methods
print("\nSelect column 'A' using df['A'] and df.A:\n", df['A'], "\n", df.A)

# 7. df[['A']] #df
# Select a column as DataFrame (not Series)
print("\nSelect column 'A' as DataFrame:\n", df[['A']])

# 8. df[['A','B']]
# Select columns 'A' and 'B'
print("\nSelect columns 'A' and 'B':\n", df[['A', 'B']])

# 9. df.filter(items=['A']) df output
# Filter specific columns by name
print("\nFilter column 'A' using filter:\n", df.filter(items=['A']))

# 10. df[df.columns[0]]
# Select first column by index using df.columns
print("\nSelect first column by index using df.columns[0]:\n", df[df.columns[0]])

# 11. df[df.columns[0, 1, 3]]
# Select columns by index positions (note: column 3 doesn't exist here)
# Will raise an IndexError if column 3 doesn't exist
# print(df[df.columns[0, 1, 3]])  # Uncommenting will raise an error

# 12. df[df['A'] > 0.5]
# Select rows where column 'A' is greater than 0.5
print("\nSelect rows where 'A' > 0.5:\n", df[df['A'] > 0.5])

# 13. Select a single row by index
# Select row by index 2
print("\nSelect row by index 2 using loc:\n", df.loc[2])

# 14. df.iloc[1] Series output
# Select row by position 1
print("\nSelect row by position 1:\n", df.iloc[1])

# 15. df.iloc[1:2] DataFrame output
# Select rows by position (row 1 only)
print("\nSelect rows 1 to 1 (DataFrame output):\n", df.iloc[1:2])

# 16. df.filter(items=[2], axis='index')
# Select a specific row using filter by index
print("\nSelect row with index 2 using filter:\n", df.filter(items=[2], axis='index'))

# 17. Specific rows and columns using loc
# Select rows where column 'A' is greater than or equal to 2
print("\nSelect rows where A >= 2:\n", df.loc[df[df['A'] >= 2].index])

# Select rows 1 to 3 and columns 'A' and 'B'
print("\nSelect rows 1 to 3 and columns 'A' and 'B':\n", df.loc[1:3, ['A', 'B']])

# Select rows 1 to 3 and columns 0 and 1 (excluding column 'B')
# print("\nSelect rows 1 to 3 and columns 0 and 1:\n", df.loc[1:3, [0, 1]])

# 18. Loc uses labels to access the rows and columns while iloc uses integer positions
# Explanation:
# loc uses inclusive index labels, iloc does not.

# 19. Select multiple columns
# Filter columns containing 'A'
print("\nFilter columns containing 'A':\n", df.filter(like='A'))

# Filter columns using a condition
# print("\nFilter columns excluding 'B':\n", df.filter(lambda x: x != 'B'))

# Select columns 'A' and 'B' using loc
print("\nSelect columns 'A' and 'B' using loc:\n", df.loc[:, ['A', 'B']])

# 20. Selecting rows
# Select all rows using [:] or [::]
print("\nSelect all rows:\n", df[:])

# Select rows from 'a' to 'c' (inclusive)
# print("\nSelect rows from 'a' to 'c' (inclusive):\n", df['a':'c'])

# Numpy
print("\n--- Numpy ---")
np_arr = np.array([]) # <-- Note that you need to provide the square brackets [] inside the array() function
print(np_arr)


--- Strings ---
H l

--- List ---
2 [3, 1]

--- Tuple ---
4 (1, 4, 4, 3, 9, 2)

--- Set ---
2

--- Dictionary ---
2
{'a': 1, 'c': 3}

--- Series ---

Value at label 'b':
 3

Values at labels 'a' and 'c':
 a    1
c    2
Name: ExampleSeries, dtype: int64

Value at position 2:
 2

--- Data Frame (df) ---
Original DataFrame:
    A  B   C
0  1  x  10
1  2  y  20
2  3  z  30
3  4  w  40

Select single column 'A':
 0    1
1    2
2    3
3    4
Name: A, dtype: int64

Select rows 0 and 1:
    A  B   C
0  1  x  10
1  2  y  20

Select columns starting with 'A':
    A
0  1
1  2
2  3
3  4

Select column at position 0:
 0    1
1    2
2    3
3    4
Name: A, dtype: int64

Select rows 1 to 2:
    A  B   C
1  2  y  20
2  3  z  30

Select column 'A' using df['A'] and df.A:
 0    1
1    2
2    3
3    4
Name: A, dtype: int64 
 0    1
1    2
2    3
3    4
Name: A, dtype: int64

Select column 'A' as DataFrame:
    A
0  1
1  2
2  3
3  4

Select columns 'A' and 'B':
    A  B
0  1  x
1  2  y
2  3  z
3  4  w

Fi

## Remove Duplicates  

In [18]:
# Data Frame (df)
print("\n--- Data Frame (df) ---")

# Sample DataFrame
data = {
    'A': [1, 2, 2, 4, 4, 1],
    'B': ['x', 'y', 'y', 'w', 'w', 'x'],
    'C': [10, 20, 20, 40, 40, 10]
}
df = pd.DataFrame(data)

print("Original DataFrame:\n", df)

# 1. Drop duplicates based on columns 'A' and 'B'
df.drop_duplicates(subset=['A', 'B'], inplace=True)
print("\nDataFrame after dropping duplicates based on 'A' and 'B':\n", df)

# Reset the DataFrame to its original state for the next operation
df = pd.DataFrame(data)

# 2. Drop duplicates based on row indexes
# Add the row index as a column to facilitate duplicate dropping based on indexes
df['index'] = df.index

# Drop duplicates based on the 'index' column, keeping the last occurrence
df = df.drop_duplicates(subset='index', keep='last')

# Drop the 'index' column after processing
df.drop(columns='index', inplace=True)

# Sort the DataFrame by index
df.sort_index(inplace=True)

print("\nDataFrame after dropping duplicates based on index, keeping the last occurrence:\n", df)



--- Data Frame (df) ---
Original DataFrame:
    A  B   C
0  1  x  10
1  2  y  20
2  2  y  20
3  4  w  40
4  4  w  40
5  1  x  10

DataFrame after dropping duplicates based on 'A' and 'B':
    A  B   C
0  1  x  10
1  2  y  20
3  4  w  40

DataFrame after dropping duplicates based on index, keeping the last occurrence:
    A  B   C
0  1  x  10
1  2  y  20
2  2  y  20
3  4  w  40
4  4  w  40
5  1  x  10


## Conversions

In [19]:
# Keeping this for last since the Google docs table is very well done for this.

## Change Datatype

In [20]:
# Strings
print("\n--- Strings ---")
s = "65"
print(int(s))
print(float(s))

# List
print("\n--- List ---")
lst = [4,5,2,3,1]
print([ str(i) for i in lst ])
lst2 = ['4','5','2','3','1']
print([int(i) for i in lst2])
print([float(i) for i in lst2])

# Tuple
# Remember tuples are immutable so you cannot change/add/delete from an existing tuple
print("\n--- Tuple ---")
tpl = ('1', '4', '4', '3', '9', '2')
print(tuple(int(x) for x in tpl))
print(tuple(float(x) for x in tpl))

# Set
print("\n--- Set ---")
st = set(['1', '4', '5', '3', '2'])
print({int(x) for x in st})
st2 = set([1, 4, 5, 3, 2])
print({str(x) for x in st})

# Dictionary
print("\n--- Dictionary ---")
dct = {'b': 2, 'c': 3, 'a': 1}
print({k: str(v) for k, v in dct.items()})

# Series
print("\n--- Series ---")
# Sample Series with mixed data types
ser = pd.Series([1.5, 2.3, 3.7, 4.1, 5.6])
print("Original Series:\n", ser)
# 1. Convert to integer type
ser_int = ser.astype(int)
print("\nSeries converted to int:\n", ser_int)
# 2. Convert to float type (if not already float)
ser_float = ser.astype(float)
print("\nSeries converted to float:\n", ser_float)
# 3. Convert to string type
ser_str = ser.astype(str)
print("\nSeries converted to string:\n", ser_str)

# Data Frame (df)
print("\n--- Data Frame (df) ---")
# Sample DataFrame
data = {
    'A': [1.5, 2.3, 3.7, 4.1, 5.6],
    'B': ['x', 'y', 'z', 'w', 'v']
}
df = pd.DataFrame(data)
print("Original DataFrame:\n", df)
# 1. Convert column 'A' to integer type
df['A_int'] = df['A'].astype(int)
print("\nDataFrame with 'A' converted to int:\n", df)
# 2. Convert column 'A' to float type
df['A_float'] = df['A'].astype(float)
print("\nDataFrame with 'A' converted to float:\n", df)
# 3. Convert column 'A' to string type
df['A_str'] = df['A'].astype(str)
print("\nDataFrame with 'A' converted to string:\n", df)

# Numpy
print("\n--- Numpy ---")
np_arr = np.array([]) # <-- Note that you need to provide the square brackets [] inside the array() function
print(np_arr)


--- Strings ---
65
65.0

--- List ---
['4', '5', '2', '3', '1']
[4, 5, 2, 3, 1]
[4.0, 5.0, 2.0, 3.0, 1.0]

--- Tuple ---
(1, 4, 4, 3, 9, 2)
(1.0, 4.0, 4.0, 3.0, 9.0, 2.0)

--- Set ---
{1, 2, 3, 4, 5}
{'4', '3', '2', '5', '1'}

--- Dictionary ---
{'b': '2', 'c': '3', 'a': '1'}

--- Series ---
Original Series:
 0    1.5
1    2.3
2    3.7
3    4.1
4    5.6
dtype: float64

Series converted to int:
 0    1
1    2
2    3
3    4
4    5
dtype: int64

Series converted to float:
 0    1.5
1    2.3
2    3.7
3    4.1
4    5.6
dtype: float64

Series converted to string:
 0    1.5
1    2.3
2    3.7
3    4.1
4    5.6
dtype: object

--- Data Frame (df) ---
Original DataFrame:
      A  B
0  1.5  x
1  2.3  y
2  3.7  z
3  4.1  w
4  5.6  v

DataFrame with 'A' converted to int:
      A  B  A_int
0  1.5  x      1
1  2.3  y      2
2  3.7  z      3
3  4.1  w      4
4  5.6  v      5

DataFrame with 'A' converted to float:
      A  B  A_int  A_float
0  1.5  x      1      1.5
1  2.3  y      2      2.3
2  3.7  z

## Transposing

In [21]:
# Strings
print("\n--- Strings ---")
print("N/A")

print("\n--- List ---")
lst = [1, 2, 3]
lst2 = [4, 5, 6]
print([list(x) for x in zip(lst, lst2)])

# Tuple
# Remember tuples are immutable so you cannot change/add/delete from an existing tuple
print("\n--- Tuple ---")
tpl = ((1,2), (3,4), (5,6))
print(tuple(zip(*tpl)))

# Set
print("\n--- Set ---")
st = set([ (1,2,3), (4,5,6) ])
print(set(zip(*st)))

# Dictionary
print("\n--- Dictionary ---")
dct = {'b': 2, 'c': 3, 'a': 1}
print({ value: key for key, value in dct.items() })

# Series
print("\n--- Series ---")
print("N/A")

# Data Frame (df)
print("\n--- Data Frame (df) ---")
# Sample DataFrame
data = {
    'A': [1.5, 2.3, 3.7, 4.1, 5.6],
    'B': ['x', 'y', 'z', 'w', 'v']
}
df = pd.DataFrame(data)
print(df.T)

# Numpy
print("\n--- Numpy ---")
np_arr = np.array([]) # <-- Note that you need to provide the square brackets [] inside the array() function
print(np_arr)


--- Strings ---
N/A

--- List ---
[[1, 4], [2, 5], [3, 6]]

--- Tuple ---
((1, 3, 5), (2, 4, 6))

--- Set ---
{(2, 5), (1, 4), (3, 6)}

--- Dictionary ---
{2: 'b', 3: 'c', 1: 'a'}

--- Series ---
N/A

--- Data Frame (df) ---
     0    1    2    3    4
A  1.5  2.3  3.7  4.1  5.6
B    x    y    z    w    v

--- Numpy ---
[]


## +, -, *, / Based on key/index/location

In [22]:
# Strings
print("\n--- Strings ---")
s1 = "Hello"
s2 = "World"
print(s1 + " " + s2)

# List
print("\n--- List ---")
lst = [1, 2, 3]
lst2 = [4, 5, 6]
print([a + b for a, b in zip(lst, lst2)])

# Tuple
# Remember tuples are immutable so you cannot change/add/delete from an existing tuple
print("\n--- Tuple ---")
tpl = (1,2,3)
tpl2 = (4,5,6)
print(tuple(a + b for a, b in zip(tpl, tpl2)))

# Set
print("\n--- Set ---")
st = set([1,2,3])
st2 = set([4,5,6 ])
print({a + b for a, b in zip(list(st), list(st2))})

# Dictionary
print("\n--- Dictionary ---")
dct = {'b': 2, 'c': 3, 'a': 1}
dct2 = {'d': 4, 'e': 5, 'f': 6}
print({ k: dct.get(k, 0) + dct2.get(k, 0) for k in set(dct) | set(dct2) })
print({k: v + 2 for k, v in dct.items()})

# Series
print("\n--- Series ---")
# Create two sample Series
ser1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
ser2 = pd.Series([4, 3, 2, 1], index=['a', 'b', 'c', 'd'])
# 1. Add two Series with fill_value=0 for missing values in either Series
result_add = ser1.add(ser2, fill_value=0)
print("Result of add(ser1, ser2, fill_value=0):\n", result_add)
# 2. Subtract one Series from another
result_sub = ser1.sub(ser2)
print("\nResult of sub(ser1, ser2):\n", result_sub)
# 3. Multiply two Series element-wise
result_mul = ser1.mul(ser2)
print("\nResult of mul(ser1, ser2):\n", result_mul)
# 4. Divide two Series element-wise
result_div = ser1.div(ser2)
print("\nResult of div(ser1, ser2):\n", result_div)
# 5. Add a constant value to each element of the Series
n = 10
result_add_n = ser1 + n
print("\nResult of ser1 + n (n=10):\n", result_add_n)

# Data Frame (df)
print("\n--- Data Frame (df) ---")
# Sample DataFrames
df1 = pd.DataFrame({
    'A': [1, 2, 3],
    'B': [4, 5, 6]
}, index=['a', 'b', 'c'])

df2 = pd.DataFrame({
    'A': [7, 8, 9],
    'B': [10, 11, 12]
}, index=['a', 'b', 'c'])

# Sample Series for dot product with df
ser = pd.Series([1, 2], index=['A', 'B'])

# 1. Add two DataFrames with fill_value=0 for missing values
result_add = df1.add(df2, fill_value=0)
print("Result of add(df1, df2, fill_value=0):\n", result_add)

# 2. Subtract one DataFrame from another
result_sub = df1.sub(df2)
print("\nResult of sub(df1, df2):\n", result_sub)

# 3. Multiply two DataFrames element-wise
result_mul = df1.mul(df2)
print("\nResult of mul(df1, df2):\n", result_mul)

# 4. Multiply DataFrame by a Series, axis=0 (broadcasting along rows)
result_mul_s = df1.mul(ser, axis=0)
print("\nResult of mul(df1, ser, axis=0):\n", result_mul_s)

# 5. Dot product of two DataFrames
result_dot = df1.dot(df2.T)  # df2.T transposed to align columns for dot product
print("\nResult of dot(df1, df2):\n", result_dot)

# 6. Dot product of a DataFrame and a Series
result_dot_ser = df1.dot(ser)
print("\nResult of dot(df1, ser):\n", result_dot_ser)

# 7. Divide one DataFrame by another element-wise
result_div = df1.div(df2)
print("\nResult of div(df1, df2):\n", result_div)

# 8. Add a constant value to all elements of a DataFrame
n = 10
result_add_n = df1 + n
print("\nResult of df1 + n (n=10):\n", result_add_n)

# 9. Create a new column 'C' based on a formula (A / B) * 100
df1['C'] = df1['A'] / df1['B'] * 100
print("\nDataFrame after adding 'C' (A / B * 100):\n", df1)

# Numpy
print("\n--- Numpy ---")
np_arr = np.array([]) # <-- Note that you need to provide the square brackets [] inside the array() function
print(np_arr)


--- Strings ---
Hello World

--- List ---
[5, 7, 9]

--- Tuple ---
(5, 7, 9)

--- Set ---
{9, 5, 7}

--- Dictionary ---
{'d': 4, 'e': 5, 'b': 2, 'a': 1, 'c': 3, 'f': 6}
{'b': 4, 'c': 5, 'a': 3}

--- Series ---
Result of add(ser1, ser2, fill_value=0):
 a    5
b    5
c    5
d    5
dtype: int64

Result of sub(ser1, ser2):
 a   -3
b   -1
c    1
d    3
dtype: int64

Result of mul(ser1, ser2):
 a    4
b    6
c    6
d    4
dtype: int64

Result of div(ser1, ser2):
 a    0.250000
b    0.666667
c    1.500000
d    4.000000
dtype: float64

Result of ser1 + n (n=10):
 a    11
b    12
c    13
d    14
dtype: int64

--- Data Frame (df) ---
Result of add(df1, df2, fill_value=0):
     A   B
a   8  14
b  10  16
c  12  18

Result of sub(df1, df2):
    A  B
a -6 -6
b -6 -6
c -6 -6

Result of mul(df1, df2):
     A   B
a   7  40
b  16  55
c  27  72

Result of mul(df1, ser, axis=0):
     A   B
A NaN NaN
B NaN NaN
a NaN NaN
b NaN NaN
c NaN NaN

Result of dot(df1, df2):
     a   b   c
a  47  52  57
b  64  71  

## Ranking

In [23]:
# Strings
print("\n--- Strings ---")
print("N/A")

# List
print("\n--- List ---")
lst = [1, 2, 3]
# The output that is printed is of type dictionary, not a list
ranked = { val: rank + 1 for rank, val in enumerate(sorted(lst)) }
print(ranked)
print(type(ranked))

# Tuple
# Remember tuples are immutable so you cannot change/add/delete from an existing tuple
print("\n--- Tuple ---")
print("N/A")

# Set
print("\n--- Set ---")
print("N/A")

# Dictionary
print("\n--- Dictionary ---")
dct = {'b': 2, 'c': 3, 'a': 1}
ranked = {k: rank + 1 for rank, (k, _) in enumerate(sorted(dct.items(), key=lambda x: x[1]))}
print(ranked)

# Series
print("\n--- Series ---")
# Create a sample Series
ser = pd.Series([10, 20, 15, 30, 20, 10])
# Rank the Series
ranked_ser = ser.rank()
print("Original Series:\n", ser)
print("\nRanked Series:\n", ranked_ser)

# Data Frame (df)
print("\n--- Data Frame (df) ---")
# Create a sample DataFrame
df = pd.DataFrame({
    'A': [10, 20, 15, 30, 20],
    'B': [5, 8, 7, 12, 10],
    'C': [3, 6, 5, 8, 6]
})
# Rank each column independently
ranked_df = df.rank()
print("Original DataFrame:\n", df)
print("\nRanked DataFrame:\n", ranked_df)

# Numpy
print("\n--- Numpy ---")
np_arr = np.array([]) # <-- Note that you need to provide the square brackets [] inside the array() function
print(np_arr)


--- Strings ---
N/A

--- List ---
{1: 1, 2: 2, 3: 3}
<class 'dict'>

--- Tuple ---
N/A

--- Set ---
N/A

--- Dictionary ---
{'a': 1, 'b': 2, 'c': 3}

--- Series ---
Original Series:
 0    10
1    20
2    15
3    30
4    20
5    10
dtype: int64

Ranked Series:
 0    1.5
1    4.5
2    3.0
3    6.0
4    4.5
5    1.5
dtype: float64

--- Data Frame (df) ---
Original DataFrame:
     A   B  C
0  10   5  3
1  20   8  6
2  15   7  5
3  30  12  8
4  20  10  6

Ranked DataFrame:
      A    B    C
0  1.0  1.0  1.0
1  3.5  3.0  3.5
2  2.0  2.0  2.0
3  5.0  5.0  5.0
4  3.5  4.0  3.5

--- Numpy ---
[]


## 5 Random value generation -> import pandas, numpy, string, random

In [24]:
# Strings
print("\n--- Strings ---")
s = [chr(a+97) for a in range(26)]
# Generate a random string of 5 characters
random_string = ''.join(np.random.choice(a=s, size=5))
print(random_string)

# List
print("\n--- List ---")
print([ np.random.randint(1,100,size=5) ])

# Tuple
# Remember tuples are immutable so you cannot change/add/delete from an existing tuple
print("\n--- Tuple ---")
print(tuple(np.random.randint(1,100,size=5)))

# Set
print("\n--- Set ---")
print(np.random.randint(1,100,size=5)) # <-- The result of this is unhashable, therefore, it cannot be converted to a set

# Dictionary
print("\n--- Dictionary ---")
dct = {'b': 2, 'c': 3, 'a': 1}
print({f'key_{i}': np.random.randint(1,100) for i in range(5)})

# Series
print("\n--- Series ---")
# Create a sample Series
ser = pd.Series([np.random.randint(1, 100, size=5)])
print(ser)

# Data Frame (df)
print("\n--- Data Frame (df) ---")
# Create a sample DataFrame
df = pd.DataFrame(np.random.randint(0, 100, size=(3,4)))
print(df)

# Numpy
print("\n--- Numpy ---")
np_arr = np.array([]) # <-- Note that you need to provide the square brackets [] inside the array() function
print(np_arr)


--- Strings ---
avgxo

--- List ---
[array([35, 24, 59, 86, 10])]

--- Tuple ---
(np.int64(20), np.int64(49), np.int64(78), np.int64(16), np.int64(25))

--- Set ---
[86 66 65 28 18]

--- Dictionary ---
{'key_0': 90, 'key_1': 7, 'key_2': 5, 'key_3': 40, 'key_4': 82}

--- Series ---
0    [49, 61, 24, 67, 48]
dtype: object

--- Data Frame (df) ---
    0   1   2   3
0  46  37   2  41
1  37  60  51  63
2   4  47  74  83

--- Numpy ---
[]


# Math on whole Abs Sum CummSum Diff Max Mean Median

In [25]:
# Strings
print("\n--- Strings ---")
print("N/A")

# List
print("\n--- List ---")
lst = [1, -1, 2, -3, -5, 4, -6]
print([abs(x) for x in lst])
print(sum(lst))
print([ sum(lst[:i+1]) for i in range(len(lst)) ])
print([lst[i-1] for i in range(1, len(lst))])
print(max(lst))
print(sum(lst)/len(lst))

# Tuple
# Remember tuples are immutable so you cannot change/add/delete from an existing tuple
print("\n--- Tuple ---")
tpl = ( 1, -1, 2, -3, -5, 4, -6 )
print(tuple(abs(x) for x in tpl))
print(sum(tpl))
print(tuple(sum(tpl[:i+1]) for i in range(len(tpl))))
print(tuple(tpl[i] - tpl[i - 1] for i in range(1, len(tpl))))
print(max(tpl))
print(sum(tpl)/len(tpl))

# Set
print("\n--- Set ---")
# Sample data for examples
st = {-3, 5, -1, 8, -2}  # Set of integers
my_set = {1, 2, 3, 4, 5}  # Another set of integers
sorted_list = [1, 2, 4, 7, 11]  # Sorted list of integers
dict_a = {'a': 1, 'b': 3, 'c': 6, 'd': 10}  # Dictionary with values
num_set = {10, 20, 30, 40}  # Set of integers
# Example 1: Absolute values of each element in st
abs_values_set = {abs(x) for x in st}
print("Absolute values set:", abs_values_set)
# Example 2: Sum of all elements in my_set
total_sum = sum(my_set)
print("Sum of my_set:", total_sum)
# Example 3: Cumulative sum of elements in sorted_list
cumulative_sum_list = [sum(sorted_list[:i+1]) for i in range(len(sorted_list))]
print("Cumulative sums:", cumulative_sum_list)
# Example 4: Difference between each consecutive element in sorted_list
differences_list = [sorted_list[i] - sorted_list[i - 1] for i in range(1, len(sorted_list))]
print("Differences between consecutive elements:", differences_list)
# Example 5: Dictionary comprehension to calculate differences between consecutive values in dict_a
values = list(dict_a.values())
difference_dict = {values[i]: values[i] - values[i - 1] for i in range(1, len(values))}
print("Differences between consecutive dictionary values:", difference_dict)
# Example 6: Maximum value in st
max_value = max(st)
print("Max value in st:", max_value)
# Example 7: Average of elements in num_set
average_value = sum(num_set) / len(num_set)
print("Average of num_set:", average_value)

# Dictionary
print("\n--- Dictionary ---")
# Sample dictionaries for examples
dct = {'a': -3, 'b': 5, 'c': -7, 'd': 8}  # Dictionary with integer values
my_dict = {'x': 2, 'y': 3, 'z': 5}  # Dictionary with integer values
num_dict = {'p': 10, 'q': 20, 'r': 30, 's': 40}  # Dictionary with integer values
# Example 1: Dictionary with absolute values of each element in dct
abs_values_dict = {k: abs(v) for k, v in dct.items()}
print("Dictionary with absolute values:", abs_values_dict)
# Example 2: Sum of all values in my_dict
total_sum = sum(my_dict.values())
print("Sum of my_dict values:", total_sum)
# Example 3: Cumulative sum of values in my_dict
values = list(my_dict.values())
cumulative_sum = [sum(values[:i+1]) for i in range(len(values))]
print("Cumulative sum of values in my_dict:", cumulative_sum)
# Example 4: Dictionary with keys from my_dict and cumulative sums as values
cumulative_sum_dict = dict(zip(my_dict.keys(), cumulative_sum))
print("Dictionary with cumulative sums:", cumulative_sum_dict)
# Example 5: Maximum value in dct
max_value = max(dct.values())
print("Max value in dct:", max_value)
# Example 6: Average of values in num_dict
average_value = sum(num_dict.values()) / len(num_dict)
print("Average of num_dict values:", average_value)

# Series
print("\n--- Series ---")
# Create a sample Series
# Sample Series
ser = pd.Series([10, -20, 15, -25, 30], index=['a', 'b', 'c', 'd', 'e'])
# Absolute values of each element in the Series
abs_ser = ser.abs()
print("Absolute values:\n", abs_ser)
# Sum of all elements in the Series
total_sum = ser.sum()
print("Sum of elements:", total_sum)
# Cumulative sum of elements in the Series
cumulative_sum = ser.cumsum()
print("Cumulative sum:\n", cumulative_sum)
# Difference between each element and its previous one
diff_ser = ser.diff()
print("Differences between consecutive elements:\n", diff_ser)
# Maximum value in the Series
max_value = ser.max()
print("Maximum value:", max_value)
# Mean (average) of the Series
mean_value = ser.mean()
print("Mean value:", mean_value)
# Median of the Series
median_value = ser.median()
print("Median value:", median_value)

# Data Frame (df)
print("\n--- Data Frame (df) ---")
data = {
    'A': [1, 2, np.nan, 4, 5],
    'B': [10, -20, 30, -40, 50],
    'C': [100, 200, 300, 400, 500]
}
df = pd.DataFrame(data)

# Filtering and applying a function to a specific column
filtered_sum = df['A'].abs().sum()
print("Absolute sum of column A:", filtered_sum)

# Absolute values of the DataFrame
abs_df = df.abs()
print("Absolute values:\n", abs_df)

# Sum for each column
col_sum = df.sum()
print("Sum of each column:\n", col_sum)

# Overall sum of all elements in the DataFrame
overall_sum = df.sum().sum()
print("Overall sum of all elements:", overall_sum)

# Sum of a specific column 'A'
sum_a = df['A'].sum()
print("Sum of column A:", sum_a)

# Cumulative sum for each column
cumulative_sum = df.cumsum(axis=0)
print("Cumulative sum:\n", cumulative_sum)

# Cumulative product for each column
cumulative_prod = df.cumprod()
print("Cumulative product:\n", cumulative_prod)

# Percent change with a specified period
pct_change = df.pct_change(periods=4)
print("Percent change with period 4:\n", pct_change)

# Rolling sum with window and minimum periods
rolling_sum = df.rolling(window=4, min_periods=4).sum()
print("Rolling sum:\n", rolling_sum)

# Difference between consecutive elements
diff = df.diff()
print("Difference between elements:\n", diff)

# Maximum value for each column
col_max = df.max()
print("Maximum value for each column:\n", col_max)

# Overall maximum value in the DataFrame
overall_max = df.max().max()
print("Overall maximum value:", overall_max)

# Mean of each column
col_mean = df.mean()
print("Mean of each column:\n", col_mean)

# Median of each column
col_median = df.median()
print("Median of each column:\n", col_median)

# Total number of elements in the DataFrame
total_size = df.size
print("Total number of elements:", total_size)

# Count of non-NA values for each column
non_na_count = df.count()
print("Count of non-NA values in each column:\n", non_na_count)

# Product of each column
col_prod = df.prod()
print("Product of each column:\n", col_prod)

# Summary statistics for the DataFrame
summary_stats = df.describe()
print("Summary statistics:\n", summary_stats)

# Value counts for a specific column
value_counts_b = df['B'].value_counts()
print("Value counts for column B:\n", value_counts_b)

# Data type of a specific column
dtype_a = df['A'].dtype
print("Data type of column A:", dtype_a)

# Covariance between two columns
cov_ab = df['A'].cov(df['B'])
print("Covariance between columns A and B:", cov_ab)

# Element-wise methods for a specific column
is_null = df['A'].isnull()
print("Is null for column A:\n", is_null)

not_null = df['A'].notnull()
print("Not null for column A:\n", not_null)

astype_float = df['A'].astype(float)
print("Column A as float:\n", astype_float)

absolute_a = df['A'].abs()
print("Absolute values of column A:\n", absolute_a)

rounded_a = df['A'].round(decimals=0)
print("Rounded values of column A:\n", rounded_a)

diff_a = df['A'].diff(periods=1)
print("Difference in column A:\n", diff_a)

shifted_a = df['A'].shift(periods=1)
print("Shifted values of column A:\n", shifted_a)

to_datetime_a = pd.to_datetime(df['A'], errors='coerce')
print("Column A as datetime:\n", to_datetime_a)

filled_a = df['A'].fillna(0)
print("Column A with NaN filled:\n", filled_a)

# Numpy
print("\n--- Numpy ---")
np_arr = np.array([]) # <-- Note that you need to provide the square brackets [] inside the array() function
print(np_arr)


--- Strings ---
N/A

--- List ---
[1, 1, 2, 3, 5, 4, 6]
-8
[1, 0, 2, -1, -6, -2, -8]
[1, -1, 2, -3, -5, 4]
4
-1.1428571428571428

--- Tuple ---
(1, 1, 2, 3, 5, 4, 6)
-8
(1, 0, 2, -1, -6, -2, -8)
(-2, 3, -5, -2, 9, -10)
4
-1.1428571428571428

--- Set ---
Absolute values set: {1, 2, 3, 5, 8}
Sum of my_set: 15
Cumulative sums: [1, 3, 7, 14, 25]
Differences between consecutive elements: [1, 2, 3, 4]
Differences between consecutive dictionary values: {3: 2, 6: 3, 10: 4}
Max value in st: 8
Average of num_set: 25.0

--- Dictionary ---
Dictionary with absolute values: {'a': 3, 'b': 5, 'c': 7, 'd': 8}
Sum of my_dict values: 10
Cumulative sum of values in my_dict: [2, 5, 10]
Dictionary with cumulative sums: {'x': 2, 'y': 5, 'z': 10}
Max value in dct: 8
Average of num_dict values: 25.0

--- Series ---
Absolute values:
 a    10
b    20
c    15
d    25
e    30
dtype: int64
Sum of elements: 10
Cumulative sum:
 a    10
b   -10
c     5
d   -20
e    10
dtype: int64
Differences between consecutive elem

  pct_change = df.pct_change(periods=4)


* Create, empty; Check for Empty; Size ✅
* Initialization ✅
* Adding Elements ✅
* Removing elements ✅
* Replace value ✅
* Renaming index ✅
* Create a copy ✅
* Check if element exists ✅
* Check index of element, column and row ✅
* Counting occurrences of specific values ✅
* Iterate over ✅
* Sorting ✅
* Filtering ✅
* Selecting ✅
* Remove duplicates ✅
* Conversions ䷄
* Change Datatype ✅
* Transposing ✅
* +, -, *, / Based on key/index/location ✅
* Ranking ✅
* 5 Random value generation -> import pandas, numpy, string, random ✅
* Math on whole Abs Sum CummSum Diff Max Mean Median ✅

## From Kanchi's doc

In [28]:
# merge (a database/SQL-like join operation) [joins on common columns by default]
# On Index
# Create two DataFrames with some common columns
df1 = pd.DataFrame({
    'ID': [1, 2, 3, 4],
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Score': [85, 90, 88, 92]
})

df2 = pd.DataFrame({
    'ID': [3, 4, 5, 6],
    'Name': ['Charlie', 'David', 'Eve', 'Frank'],
    'Age': [23, 34, 45, 36]
})
# Set 'ID' as the index for both DataFrames
df1.set_index('ID', inplace=True)
df2.set_index('ID', inplace=True)

# Merge using the indexes
df_new = pd.merge(left=df1, right=df2, how='outer', left_index=True, right_index=True) # -> inner/left/right
print(df_new)

# On column 
df_new = pd.merge(left=df1, right=df2, how='left', left_on='ID', right_on='ID')
print(df_new)
# Trap: When joining on columns, the indexes on the passed DataFrames are ignored. 


     Name_x  Score   Name_y   Age
ID                               
1     Alice   85.0      NaN   NaN
2       Bob   90.0      NaN   NaN
3   Charlie   88.0  Charlie  23.0
4     David   92.0    David  34.0
5       NaN    NaN      Eve  45.0
6       NaN    NaN    Frank  36.0
     Name_x  Score   Name_y   Age
ID                               
1     Alice     85      NaN   NaN
2       Bob     90      NaN   NaN
3   Charlie     88  Charlie  23.0
4     David     92    David  34.0


In [29]:
# Define df1 and df2
df1 = pd.DataFrame({
    'ID': [1, 2, 3, 4],
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Score': [85, 90, 88, 92]
}).set_index('ID')  # Setting 'ID' as the index to use join on index

df2 = pd.DataFrame({
    'ID': [3, 4, 5, 6],
    'Name': ['Charlie', 'David', 'Eve', 'Frank'],
    'Age': [23, 34, 45, 36]
}).set_index('ID')  # Setting 'ID' as the index to use join on index

# Outer join on the index
df_new = df1.join(df2, how='outer', lsuffix='_left', rsuffix='_right')
print(df_new)

# Reset indexes so 'ID' is a column, and merge on both 'ID' and 'Name'
df1_reset = df1.reset_index()
df2_reset = df2.reset_index()

# Merge on both 'ID' and 'Name' columns
df_new = pd.merge(df1_reset, df2_reset, how='outer', on=['ID', 'Name'])
print(df_new)


   Name_left  Score Name_right   Age
ID                                  
1      Alice   85.0        NaN   NaN
2        Bob   90.0        NaN   NaN
3    Charlie   88.0    Charlie  23.0
4      David   92.0      David  34.0
5        NaN    NaN        Eve  45.0
6        NaN    NaN      Frank  36.0
   ID     Name  Score   Age
0   1    Alice   85.0   NaN
1   2      Bob   90.0   NaN
2   3  Charlie   88.0  23.0
3   4    David   92.0  34.0
4   5      Eve    NaN  45.0
5   6    Frank    NaN  36.0


In [30]:
# Define example DataFrames
df1 = pd.DataFrame({
    'ID': [1, 2, 3, 4],
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Score': [85, 90, 88, 92]
})

df2 = pd.DataFrame({
    'ID': [3, 4, 5, 6],
    'Name': ['Charlie', 'David', 'Eve', 'Frank'],
    'Age': [23, 34, 45, 36]
})

# Concatenate along axis 0 (top/bottom)
df_vertical = pd.concat([df1, df2], axis=0, ignore_index=True)
print(df_vertical)

# Concatenate along axis 1 (left/right)
df_horizontal = pd.concat([df1, df2], axis=1)
print(df_horizontal)


   ID     Name  Score   Age
0   1    Alice   85.0   NaN
1   2      Bob   90.0   NaN
2   3  Charlie   88.0   NaN
3   4    David   92.0   NaN
4   3  Charlie    NaN  23.0
5   4    David    NaN  34.0
6   5      Eve    NaN  45.0
7   6    Frank    NaN  36.0
   ID     Name  Score  ID     Name  Age
0   1    Alice     85   3  Charlie   23
1   2      Bob     90   4    David   34
2   3  Charlie     88   5      Eve   45
3   4    David     92   6    Frank   36
