In [13]:
# Exercise 1: Identifying and Handling Missing Data
import pandas as pd

# Sample dataset with missing values
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', None],
    'Age': [24, 30, None, 22, 35],
    'Salary': [48000, None, 57000, None, 60000]
}
df = pd.DataFrame(data)

print("data sebelum cleaning:")
print(df)

data sebelum cleaning:
      Name   Age   Salary
0    Alice  24.0  48000.0
1      Bob  30.0      NaN
2  Charlie   NaN  57000.0
3    David  22.0      NaN
4     None  35.0  60000.0


In [14]:
# Filling missing values and dropping rows
df['Age'].fillna(df['Age'].mean(), inplace=True)
df['Salary'].fillna(df['Salary'].median(), inplace=True)
df.dropna(subset=['Name'], inplace=True)

print('After cleaning:\n', df)

After cleaning:
       Name    Age   Salary
0    Alice  24.00  48000.0
1      Bob  30.00  57000.0
2  Charlie  27.75  57000.0
3    David  22.00  57000.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Salary'].fillna(df['Salary'].median(), inplace=True)


In [15]:
# Exercise 2: Standardizing Categorical Data
# Sample dataset with inconsistent categorical values
data2 = {
    'Product': ['Laptop', 'Laptop', 'Desktop', 'Tablet', 'Tablet'],
    'Category': ['Electronics', 'electronics', 'Electronics', 'Gadgets', 'gadgets']
}
df2 = pd.DataFrame(data2)
print("data sebelum standardized:")
print(df2)

data sebelum standardized:
   Product     Category
0   Laptop  Electronics
1   Laptop  electronics
2  Desktop  Electronics
3   Tablet      Gadgets
4   Tablet      gadgets


In [16]:
# Standardize category values
df2['Category'] = df2['Category'].str.capitalize()
print('Standardized Data:\n', df2)

Standardized Data:
    Product     Category
0   Laptop  Electronics
1   Laptop  Electronics
2  Desktop  Electronics
3   Tablet      Gadgets
4   Tablet      Gadgets


In [24]:
# task and homework
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

#load dataset
df = pd.read_csv("/home/salwa/Downloads/Titanic-Dataset.csv")
print("5 Baris Pertama Dataset")
print(df.head())

5 Baris Pertama Dataset
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.

In [25]:
# identifikasi missing value

print("Missing Values per Kolom:")
print(df.isnull().sum())

Missing Values per Kolom:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [26]:
# imputasi missing value

# Kolom Age & Fare: median
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Fare'] = df['Fare'].fillna(df['Fare'].median())

# Kolom Embarked: isi mode
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# Kolom Cabin: terlalu banyak missing â†’ drop
df = df.drop(columns=['Cabin'])

print("Missing value setelah imputasi")
print(df.isnull().sum())

Missing value setelah imputasi
PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64


In [27]:
# handle outliers

def handle_outliers(column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    df[column] = df[column].clip(lower, upper)

handle_outliers("Age")
handle_outliers("Fare")

In [28]:
# normalisasi numerik

scaler = MinMaxScaler()
df[['Age', 'Fare']] = scaler.fit_transform(df[['Age', 'Fare']])

print("Data Numerik Setelah Normalisasi")
print(df[['Age', 'Fare']].head())

Data Numerik Setelah Normalisasi
        Age      Fare
0  0.375000  0.110460
1  0.682692  1.000000
2  0.451923  0.120745
3  0.625000  0.809027
4  0.625000  0.122649


In [29]:
# standardisasi kategorikal

df['Sex'] = df['Sex'].str.lower()
df['Embarked'] = df['Embarked'].str.upper()
df['Pclass'] = df['Pclass'].astype(str)

# One-hot encoding
df = pd.get_dummies(df, columns=['Sex', 'Embarked', 'Pclass'], drop_first=True)

In [30]:
# remove duplikat

before = df.shape[0]
df = df.drop_duplicates()
after = df.shape[0]

print(f"\nBaris sebelum remove duplicates: {before}")
print(f"Baris setelah remove duplicates: {after}")


Baris sebelum remove duplicates: 891
Baris setelah remove duplicates: 891


In [32]:
print("NFO DATASET FINAL")
print(df.info())

print("\n5 Baris Pertama Data akhir")
print(df.head())

NFO DATASET FINAL
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Name         891 non-null    object 
 3   Age          891 non-null    float64
 4   SibSp        891 non-null    int64  
 5   Parch        891 non-null    int64  
 6   Ticket       891 non-null    object 
 7   Fare         891 non-null    float64
 8   Sex_male     891 non-null    bool   
 9   Embarked_Q   891 non-null    bool   
 10  Embarked_S   891 non-null    bool   
 11  Pclass_2     891 non-null    bool   
 12  Pclass_3     891 non-null    bool   
dtypes: bool(5), float64(2), int64(4), object(2)
memory usage: 60.2+ KB
None

5 Baris Pertama Data akhir
   PassengerId  Survived                                               Name  \
0            1         0                            Braund, Mr.