<a href="https://colab.research.google.com/github/spencer18001/Clustering-And-Dimensionality-Reduction---Deep-Dive/blob/main/03/0320.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Useful functionalities

### Lecture agenda

- Head and tail
- Handling missing values
- Handling duplicates
- Dropping rows/columns
- Sorting
- Renaming rows/columns
- Mapping

In [None]:
import pandas as pd
import numpy as np

#### Head and tail

In [None]:
# Small demo frame: two numeric columns with one gap each,
# one text column and one integer column.
df = pd.DataFrame(
    dict(
        A=[1, 2, np.nan, 4, 5],
        B=[np.nan, 7, 8, 9, 10],
        C=list('abcde'),
        D=list(range(100, 600, 100)),
    )
)
df

Unnamed: 0,A,B,C,D
0,1.0,,a,100
1,2.0,7.0,b,200
2,,8.0,c,300
3,4.0,9.0,d,400
4,5.0,10.0,e,500


In [None]:
# Show only the first two rows of the frame
df.head(n=2)

Unnamed: 0,A,B,C,D
0,1.0,,a,100
1,2.0,7.0,b,200


In [None]:
# Show only the last two rows of the frame
df.tail(n=2)

Unnamed: 0,A,B,C,D
3,4.0,9.0,d,400
4,5.0,10.0,e,500


#### Missing values in dataframes

In [None]:
# Python's None is one way to mark a missing value
var = None

# Both checks detect it: equality (==) first, identity (is None) second.
# Each result is printed on its own line.
print(var == None, var is None, sep='\n')

True
True


In [None]:
# NumPy's np.nan is the other common marker for a missing value
var = np.nan

# Printed one per line:
#   1. equality with np.nan  -> False (NaN does not compare equal, even to itself)
#   2. identity with np.nan  -> True here, because var is that very object
#   3. np.isnan(var)         -> True; this is the check that looks at the value
print(var == np.nan, var is np.nan, np.isnan(var), sep='\n')

False
True
True


#### Handling missing values

In [None]:
# Demo frame with gaps: np.nan in every column, plus a None in column C
df = pd.DataFrame(
    dict(
        A=[1, 2, np.nan, 4, 5],
        B=[np.nan, 7, 8, 9, 10],
        C=[np.nan, 12, None, 14, 15],
    )
)
df

Unnamed: 0,A,B,C
0,1.0,,
1,2.0,7.0,12.0
2,,8.0,
3,4.0,9.0,14.0
4,5.0,10.0,15.0


In [None]:
# Boolean mask with the same shape as df: True wherever a value is missing
df.isna()

Unnamed: 0,A,B,C
0,False,True,True
1,False,False,False
2,True,False,True
3,False,False,False
4,False,False,False


In [None]:
# Number of missing values per column (True counts as 1 when summed down the rows)
df.isna().sum(axis='index')

A    1
B    1
C    2
dtype: int64

In [None]:
# Drop every row that has at least one missing value; surviving rows keep their labels
df.dropna(axis='index')

Unnamed: 0,A,B,C
1,2.0,7.0,12.0
3,4.0,9.0,14.0
4,5.0,10.0,15.0


In [None]:
# Drop rows with missing values and renumber the remaining rows 0..n-1
df.dropna(axis=0, ignore_index=True)

Unnamed: 0,A,B,C
0,2.0,7.0,12.0
1,4.0,9.0,14.0
2,5.0,10.0,15.0


In [None]:
# Drop a row only when one of the listed columns is missing;
# gaps in the other columns are tolerated.
df = pd.DataFrame(
    dict(
        A=[1, 2, 3, 4, np.nan],
        B=[np.nan, 7, 8, 9, 10],
        C=[11, 12, np.nan, 14, 15],
    )
)

print('Original : ')
display(df)

# Only A and B are inspected, so the row whose sole gap is in C is kept
print('After dropna : ')
df = df.dropna(axis=0, subset=['A', 'B'])
df

Original : 


Unnamed: 0,A,B,C
0,1.0,,11.0
1,2.0,7.0,12.0
2,3.0,8.0,
3,4.0,9.0,14.0
4,,10.0,15.0


After dropna : 


Unnamed: 0,A,B,C
1,2.0,7.0,12.0
2,3.0,8.0,
3,4.0,9.0,14.0


In [None]:
# Drop every column that contains at least one missing value
df = pd.DataFrame(
    dict(
        A=[1, 2, np.nan, 4, 5],
        B=[np.nan, 7, 8, 9, 10],
        C=[11, 12, np.nan, 14, 15],
        D=[100, 200, 300, 400, 500],
    )
)

print('Original : ')
display(df)

# A, B and C each have a gap, so only D remains
print('After dropna : ')
df = df.dropna(axis='columns')
df

Original : 


Unnamed: 0,A,B,C,D
0,1.0,,11.0,100
1,2.0,7.0,12.0,200
2,,8.0,,300
3,4.0,9.0,14.0,400
4,5.0,10.0,15.0,500


After dropna : 


Unnamed: 0,D
0,100
1,200
2,300
3,400
4,500


#### Handling duplicate values

In [None]:
import pandas as pd

# Student records: rows 0 and 3 are exact duplicates,
# and row 7 repeats the name 'Alice' with a different subject.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'Alice', 'Eva', 'Frank', 'Grace', 'Alice'],
    Age=[20, 19, 21, 20, 22, 21, 19, 20],
    Subject=['Math', 'Physics', 'Chemistry', 'Math', 'Physics', 'Chemistry', 'Math', 'Physics'],
)

df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,Subject
0,Alice,20,Math
1,Bob,19,Physics
2,Charlie,21,Chemistry
3,Alice,20,Math
4,Eva,22,Physics
5,Frank,21,Chemistry
6,Grace,19,Math
7,Alice,20,Physics


In [None]:
# keep accepts 'first', 'last' or False; 'first' is the default.
# keep='first' flags every repeat of a row except its first occurrence.
df.duplicated(keep='first')

0    False
1    False
2    False
3     True
4    False
5    False
6    False
7    False
dtype: bool

In [None]:
# keep=False: every member of a duplicate group is flagged, including the first occurrence
df.duplicated(keep=False)

0     True
1    False
2    False
3     True
4    False
5    False
6    False
7    False
dtype: bool

In [None]:
# Compare rows on the Name column only; later repeats of a name are flagged
df.duplicated(subset=['Name'], keep='first')

0    False
1    False
2    False
3     True
4    False
5    False
6    False
7     True
dtype: bool

In [None]:
# Remove fully identical rows, keeping the first occurrence of each
df.drop_duplicates(keep='first')

Unnamed: 0,Name,Age,Subject
0,Alice,20,Math
1,Bob,19,Physics
2,Charlie,21,Chemistry
4,Eva,22,Physics
5,Frank,21,Chemistry
6,Grace,19,Math
7,Alice,20,Physics


In [None]:
# keep=False drops every row that has an exact duplicate, so neither copy survives
df.drop_duplicates(keep=False)

Unnamed: 0,Name,Age,Subject
1,Bob,19,Physics
2,Charlie,21,Chemistry
4,Eva,22,Physics
5,Frank,21,Chemistry
6,Grace,19,Math
7,Alice,20,Physics


In [None]:
# Keep one row per Name (the first one seen); other columns are not compared
df.drop_duplicates(subset=['Name'], keep='first')

Unnamed: 0,Name,Age,Subject
0,Alice,20,Math
1,Bob,19,Physics
2,Charlie,21,Chemistry
4,Eva,22,Physics
5,Frank,21,Chemistry
6,Grace,19,Math


In [None]:
# Remove duplicate rows and renumber the remaining rows 0..n-1
df.drop_duplicates(keep='first', ignore_index=True)

Unnamed: 0,Name,Age,Subject
0,Alice,20,Math
1,Bob,19,Physics
2,Charlie,21,Chemistry
3,Eva,22,Physics
4,Frank,21,Chemistry
5,Grace,19,Math
6,Alice,20,Physics


#### Drop columns / rows

In [None]:
# Five columns of mixed types to practise dropping and selecting on
data = dict(
    P=[1, 2, 3, 4, 5],
    b=list('abcde'),
    C=[1.1, 2.2, 3.3, 4.4, 5.5],
    D=list(range(10, 60, 10)),
    E=['x', 'y', 'z', 'xx', 'yy'],
)

# Build the DataFrame from the column dictionary
df = pd.DataFrame(data)
df

Unnamed: 0,P,b,C,D,E
0,1,a,1.1,10,x
1,2,b,2.2,20,y
2,3,c,3.3,30,z
3,4,d,4.4,40,xx
4,5,e,5.5,50,yy


In [None]:
# Drop column 'P' by its label
df = df.drop(labels=['P'], axis='columns')
df

Unnamed: 0,b,C,D,E
0,a,1.1,10,x
1,b,2.2,20,y
2,c,3.3,30,z
3,d,4.4,40,xx
4,e,5.5,50,yy


In [None]:
# Drop the row whose index label is 2
df = df.drop(labels=[2], axis='index')
df

Unnamed: 0,b,C,D,E
0,a,1.1,10,x
1,b,2.2,20,y
3,d,4.4,40,xx
4,e,5.5,50,yy


In [None]:
# Keep only the listed columns
df = df.filter(items=['b', 'C'], axis='columns')
df

Unnamed: 0,b,C
0,a,1.1
1,b,2.2
3,d,4.4
4,e,5.5


In [None]:
# Plain column selection gives the same result here, so filter seems to add little
df.loc[:, ['b', 'C']]

Unnamed: 0,b,C
0,a,1.1
1,b,2.2
3,d,4.4
4,e,5.5


In [None]:
# Keep only the rows whose index labels are 0 and 3
df = df.filter(items=[0, 3], axis='index')
df

Unnamed: 0,b,C
0,a,1.1
3,d,4.4


#### Sorting

In [None]:
# Student grades used for the sorting examples
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Frank', 'Grace', 'Harry'],
    Age=[22, 19, 21, 20, 22, 21, 19, 20],
    Grade=[85, 95, 77, 88, 92, 76, 99, 89],
    Subject=['Math', 'Physics', 'Chemistry', 'Math', 'Physics', 'Chemistry', 'Math', 'Physics'],
)

df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,Grade,Subject
0,Alice,22,85,Math
1,Bob,19,95,Physics
2,Charlie,21,77,Chemistry
3,David,20,88,Math
4,Eva,22,92,Physics
5,Frank,21,76,Chemistry
6,Grace,19,99,Math
7,Harry,20,89,Physics


In [None]:
# Order rows by index label, smallest first
df.sort_index(ascending=True)

Unnamed: 0,Name,Age,Grade,Subject
0,Alice,22,85,Math
1,Bob,19,95,Physics
2,Charlie,21,77,Chemistry
3,David,20,88,Math
4,Eva,22,92,Physics
5,Frank,21,76,Chemistry
6,Grace,19,99,Math
7,Harry,20,89,Physics


In [None]:
# Order rows by index label in descending order
df.sort_index(ascending=False)

Unnamed: 0,Name,Age,Grade,Subject
7,Harry,20,89,Physics
6,Grace,19,99,Math
5,Frank,21,76,Chemistry
4,Eva,22,92,Physics
3,David,20,88,Math
2,Charlie,21,77,Chemistry
1,Bob,19,95,Physics
0,Alice,22,85,Math


In [None]:
# Sort by the values of one column (ascending); rows keep their original index labels
df.sort_values(by='Age', ascending=True)

Unnamed: 0,Name,Age,Grade,Subject
1,Bob,19,95,Physics
6,Grace,19,99,Math
3,David,20,88,Math
7,Harry,20,89,Physics
2,Charlie,21,77,Chemistry
5,Frank,21,76,Chemistry
0,Alice,22,85,Math
4,Eva,22,92,Physics


In [None]:
# Sort by Age and renumber the rows 0..n-1 in the new order
df.sort_values(by='Age', ascending=True, ignore_index=True)

Unnamed: 0,Name,Age,Grade,Subject
0,Bob,19,95,Physics
1,Grace,19,99,Math
2,David,20,88,Math
3,Harry,20,89,Physics
4,Charlie,21,77,Chemistry
5,Frank,21,76,Chemistry
6,Alice,22,85,Math
7,Eva,22,92,Physics


In [None]:
# Sort by Age, largest first
df.sort_values(by='Age', ascending=False)

Unnamed: 0,Name,Age,Grade,Subject
0,Alice,22,85,Math
4,Eva,22,92,Physics
2,Charlie,21,77,Chemistry
5,Frank,21,76,Chemistry
3,David,20,88,Math
7,Harry,20,89,Physics
1,Bob,19,95,Physics
6,Grace,19,99,Math


In [None]:
# Sort by Age first; rows with equal Age are then ordered by Grade
df.sort_values(by=['Age', 'Grade'], ascending=True)

Unnamed: 0,Name,Age,Grade,Subject
1,Bob,19,95,Physics
6,Grace,19,99,Math
3,David,20,88,Math
7,Harry,20,89,Physics
5,Frank,21,76,Chemistry
2,Charlie,21,77,Chemistry
0,Alice,22,85,Math
4,Eva,22,92,Physics


#### Rename columns / rows

In [None]:
# Fresh demo frame with default integer row labels and columns A-D
df = pd.DataFrame(
    dict(
        A=[1, 2, np.nan, 4, 5],
        B=[np.nan, 7, 8, 9, 10],
        C=list('abcde'),
        D=list(range(100, 600, 100)),
    )
)
df

Unnamed: 0,A,B,C,D
0,1.0,,a,100
1,2.0,7.0,b,200
2,,8.0,c,300
3,4.0,9.0,d,400
4,5.0,10.0,e,500


In [None]:
# Rename columns A -> P and B -> b; columns not listed keep their names
df = df.rename(columns={'A': 'P', 'B': 'b'})
df

Unnamed: 0,P,b,C,D
0,1.0,,a,100
1,2.0,7.0,b,200
2,,8.0,c,300
3,4.0,9.0,d,400
4,5.0,10.0,e,500


In [None]:
# Rename row labels 0 -> 'zero' and 1 -> 'one'; labels not listed stay as they are
df = df.rename(index={0: 'zero', 1: 'one'})
df

Unnamed: 0,P,b,C,D
zero,1.0,,a,100
one,2.0,7.0,b,200
2,,8.0,c,300
3,4.0,9.0,d,400
4,5.0,10.0,e,500


#### Mapping

In [None]:
# Student grades for the mapping examples (two grades are below 50)
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Frank', 'Grace', 'Harry'],
    Age=[22, 19, 21, 20, 22, 21, 19, 20],
    Grade=[20, 95, 77, 88, 30, 76, 99, 89],
    Subject=['Math', 'Physics', 'Chemistry', 'Math', 'Physics', 'Chemistry', 'Math', 'Physics'],
)

df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,Grade,Subject
0,Alice,22,20,Math
1,Bob,19,95,Physics
2,Charlie,21,77,Chemistry
3,David,20,88,Math
4,Eva,22,30,Physics
5,Frank,21,76,Chemistry
6,Grace,19,99,Math
7,Harry,20,89,Physics


In [None]:
# Lookup table: subject name -> course code
map_dict = dict(
    Math='M102',
    Physics='P102',
    Chemistry='C102',
)

In [None]:
# Replace each subject name with its course code by looking it up in map_dict
df['Subject'] = df['Subject'].map(map_dict)
df

Unnamed: 0,Name,Age,Grade,Subject
0,Alice,22,20,M102
1,Bob,19,95,P102
2,Charlie,21,77,C102
3,David,20,88,M102
4,Eva,22,30,P102
5,Frank,21,76,C102
6,Grace,19,99,M102
7,Harry,20,89,P102


In [None]:
def check_grade(grade, pass_mark=50):
    """Classify a grade as ``'passed'`` or ``'failed'``.

    Parameters
    ----------
    grade : number
        Score to classify.
    pass_mark : number, default 50
        Threshold the grade must strictly exceed in order to pass.

    Returns
    -------
    str
        ``'passed'`` if ``grade > pass_mark``, otherwise ``'failed'``.
        A grade exactly equal to ``pass_mark`` is therefore ``'failed'``.
    """
    return 'passed' if grade > pass_mark else 'failed'

In [None]:
# map also accepts a function: check_grade is called on every Grade value
df['Status_1'] = df['Grade'].map(check_grade)
df

Unnamed: 0,Name,Age,Grade,Subject,Status_1
0,Alice,22,20,M102,failed
1,Bob,19,95,P102,passed
2,Charlie,21,77,C102,passed
3,David,20,88,M102,passed
4,Eva,22,30,P102,failed
5,Frank,21,76,C102,passed
6,Grace,19,99,M102,passed
7,Harry,20,89,P102,passed


In [None]:
# Same pass/fail rule as check_grade, expressed inline as a lambda
df['Status'] = df['Grade'].map(lambda grade: 'passed' if grade > 50 else 'failed')
df

Unnamed: 0,Name,Age,Grade,Subject,Status_1,Status
0,Alice,22,20,M102,failed,failed
1,Bob,19,95,P102,passed,passed
2,Charlie,21,77,C102,passed,passed
3,David,20,88,M102,passed,passed
4,Eva,22,30,P102,failed,failed
5,Frank,21,76,C102,passed,passed
6,Grace,19,99,M102,passed,passed
7,Harry,20,89,P102,passed,passed
