In [4]:
# Exercise 1: Identifying and Handling Missing Data
import pandas as pd

# Small demo frame with a gap in every column
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', None],
    'Age': [24, 30, None, 22, 35],
    'Salary': [48000, None, 57000, None, 60000],
}
df = pd.DataFrame(data)

# Impute the numeric gaps (mean for Age, median for Salary); both statistics
# are taken from the frame before any row is removed. Afterwards discard the
# rows that have no Name.
fill_values = {
    'Age': df['Age'].mean(),
    'Salary': df['Salary'].median(),
}
df = df.fillna(fill_values).dropna(subset=['Name'])

print('After cleaning:\n', df)

After cleaning:
       Name    Age   Salary
0    Alice  24.00  48000.0
1      Bob  30.00  57000.0
2  Charlie  27.75  57000.0
3    David  22.00  57000.0


In [3]:
# Exercise 2: Standardizing Categorical Data
# Toy frame whose Category labels differ only in letter case
data = {
    'Product': ['Laptop', 'Laptop', 'Desktop', 'Tablet', 'Tablet'],
    'Category': ['Electronics', 'electronics', 'Electronics', 'Gadgets', 'gadgets'],
}
df = pd.DataFrame(data)

# Capitalize each label (first letter upper, rest lower) so that the case
# variants collapse into a single spelling
capitalized = df['Category'].str.capitalize()
df = df.assign(Category=capitalized)
print('Standardized Data:\n', df)

Standardized Data:
    Product     Category
0   Laptop  Electronics
1   Laptop  Electronics
2  Desktop  Electronics
3   Tablet      Gadgets
4   Tablet      Gadgets


In [18]:
# Load the Titanic dataset
import pandas as pd

# Read the Titanic passenger data from the CSV file
df = pd.read_csv('Titanic-Dataset.csv')

# Preview the first rows of the raw data
print("Data Awal:")
print(df.head())

# Count the missing values in each column
print("\nJumlah missing values per kolom:")
print(df.isna().sum())

Data Awal:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN   

In [9]:
# Practice 1 Load a dataset of your choice and identify missing values.
import pandas as pd

# Read the Titanic passenger data from the CSV file
df = pd.read_csv('Titanic-Dataset.csv')

# Show only the per-column count of missing values
print(df.isna().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [20]:
# Practice 2: Implement data transformations to normalize numerical columns.
# NOTE(review): this cell expects `df` to already exist from an earlier cell.
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Fill missing values with the column median, then min-max scale each column
for col in numerical_cols:
    filled = df[col].fillna(df[col].median())
    min_val = filled.min()
    max_val = filled.max()
    value_range = max_val - min_val
    if value_range == 0:
        # Constant column: (x - min) / (max - min) would be 0 / 0 and turn
        # every value into NaN, so map the column to 0.0 instead.
        df[col] = 0.0
    else:
        df[col] = (filled - min_val) / value_range

# Show the numerical columns after normalization
print("Normalized numerical columns:\n", df[numerical_cols].head())

Normalized numerical columns:
    PassengerId  Survived  Pclass       Age  SibSp  Parch      Fare
0     0.000000       0.0     1.0  0.271174  0.125    0.0  0.014151
1     0.001124       1.0     0.0  0.472229  0.125    0.0  0.139136
2     0.002247       1.0     1.0  0.321438  0.000    0.0  0.015469
3     0.003371       1.0     0.0  0.434531  0.125    0.0  0.103644
4     0.004494       0.0     1.0  0.434531  0.000    0.0  0.015713


In [31]:
# Practice 3: Standardize categorical columns and remove duplicates.
import pandas as pd

# Select the categorical (object-dtype) columns
# NOTE(review): this cell expects `df` to already exist from an earlier cell.
cat_cols = df.select_dtypes(include=['object']).columns

if len(cat_cols) > 0:
    # Standardize: lowercase every string value. Entries that are not strings
    # (NaN, or numbers in a mixed-type column) are passed through unchanged;
    # the `.str` accessor would have replaced such entries with NaN.
    df[cat_cols] = df[cat_cols].apply(
        lambda column: column.map(
            lambda value: value.lower() if isinstance(value, str) else value
        )
    )
else:
    # Say explicitly that nothing was standardized instead of printing a blank line
    print("No categorical (object) columns found; nothing to standardize.")

# Remove duplicate rows (rebinding instead of mutating in place)
df = df.drop_duplicates()

print("Standardized categorical columns and removed duplicates.\n")
print(df.head())


Standardized categorical columns and removed duplicates.

   PassengerId  Survived  Pclass  Name  Sex       Age  SibSp  Parch  Ticket  \
0     0.000000       0.0     1.0   108    1  0.271174  0.125    0.0     523   
1     0.001124       1.0     0.0   190    0  0.472229  0.125    0.0     596   
2     0.002247       1.0     1.0   353    0  0.321438  0.000    0.0     669   
3     0.003371       1.0     0.0   272    0  0.434531  0.125    0.0      49   
4     0.004494       0.0     1.0    15    1  0.434531  0.000    0.0     472   

       Fare  Cabin  Embarked  
0  0.014151     47         2  
1  0.139136     81         0  
2  0.015469     47         2  
3  0.103644     55         2  
4  0.015713     47         2  


In [30]:
# Record the row count, drop fully duplicated rows, then report how many
# rows were removed.
before = df.shape[0]
df = df.drop_duplicates()
after = df.shape[0]

print("Jumlah baris sebelum:", before)
print("Jumlah baris sesudah:", after)
print(f"Jumlah duplikat yang dihapus: {before - after}")

Jumlah baris sebelum: 891
Jumlah baris sesudah: 891
Jumlah duplikat yang dihapus: 0
