# Data Preparation and Cleaning


In [40]:
import numpy as np
import pandas as pd

In [8]:
df_car = pd.read_csv("dataset/BMW_Car_Sales_Classification.csv")

df_car.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Model                 50000 non-null  object 
 1   Year                  50000 non-null  int64  
 2   Region                50000 non-null  object 
 3   Color                 50000 non-null  object 
 4   Fuel_Type             50000 non-null  object 
 5   Transmission          50000 non-null  object 
 6   Engine_Size_L         50000 non-null  float64
 7   Mileage_KM            50000 non-null  int64  
 8   Price_USD             50000 non-null  int64  
 9   Sales_Volume          50000 non-null  int64  
 10  Sales_Classification  50000 non-null  object 
dtypes: float64(1), int64(4), object(6)
memory usage: 4.2+ MB


### Duplicated data


In [4]:
df_car.duplicated().sum()

0

In [7]:
# kalau ada, kita bisa hapus dengan mudah
df_car2 = df_car.drop_duplicates()

df_car2.info()  # data tanpa duplikat

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Model                 50000 non-null  object 
 1   Year                  50000 non-null  int64  
 2   Region                50000 non-null  object 
 3   Color                 50000 non-null  object 
 4   Fuel_Type             50000 non-null  object 
 5   Transmission          50000 non-null  object 
 6   Engine_Size_L         50000 non-null  float64
 7   Mileage_KM            50000 non-null  int64  
 8   Price_USD             50000 non-null  int64  
 9   Sales_Volume          50000 non-null  int64  
 10  Sales_Classification  50000 non-null  object 
dtypes: float64(1), int64(4), object(6)
memory usage: 4.2+ MB


In [14]:
# cek duplikat di beberapa kolom saja, bukan semua kolom

df_car.duplicated(subset=["Model", 'Year', 'Region']).sum()

49010

In [12]:
df_car['Model'].nunique()

11

### Missing values


In [15]:
df_mcd = pd.read_csv("dataset/mcdonalds_dataset.csv")

df_mcd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16671 entries, 0 to 16670
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   lat           16671 non-null  float64
 1   lon           16671 non-null  float64
 2   alt           16671 non-null  int64  
 3   is_broken     16671 non-null  bool   
 4   is_active     16671 non-null  bool   
 5   dot           16671 non-null  object 
 6   state         12725 non-null  object 
 7   city          16663 non-null  object 
 8   street        16671 non-null  object 
 9   country       16671 non-null  object 
 10  last_checked  16671 non-null  object 
dtypes: bool(2), float64(2), int64(1), object(6)
memory usage: 1.2+ MB


In [16]:
df_mcd.isnull().sum()

lat                0
lon                0
alt                0
is_broken          0
is_active          0
dot                0
state           3946
city               8
street             0
country            0
last_checked       0
dtype: int64

In [19]:
df_mcd['state'].unique()

array(['NY', 'NJ', 'CT', 'PA', 'DE', 'MA', 'MD', 'RI', 'VT', 'NH', 'DC',
       'VA', 'WV', 'ME', 'NC', 'OH', 'MI', 'KY', 'TN', 'SC', 'IN',
       'Maharastra', 'GA', 'IL', 'WI', 'AL', 'FL', 'IA', 'MO', 'MS', 'AR',
       'MN', 'KS', 'LA', 'NE', 'OK', 'SD', 'ND', 'TX', 'CO', 'MT', 'WY',
       'NM', 'UT', 'AZ', 'ID', 'NV', 'WA', 'OR', 'CA', 'AK', 'HI', nan],
      dtype=object)

In [17]:
df_mcd[df_mcd['state'].isnull()]

Unnamed: 0,lat,lon,alt,is_broken,is_active,dot,state,city,street,country,last_checked
12725,-79.381000,43.654620,0,False,True,working,,Toronto,"(Food Court) Urban Eatery, 260 Yonge Street, U...",CA,Checked 11 minutes ago
12726,-79.378450,43.650890,0,False,True,working,,Toronto,123 Yonge St,CA,Checked 11 minutes ago
12727,-79.381900,43.658210,0,False,True,working,,Toronto,356 Yonge Street,CA,Checked 11 minutes ago
12728,-79.375520,43.653110,0,False,True,working,,Toronto,127 Church Street,CA,Checked 11 minutes ago
12729,-79.378570,43.647080,0,False,True,working,,Toronto,"(Food Court) 181 Bay St, P.O. Box 112",CA,Checked 11 minutes ago
...,...,...,...,...,...,...,...,...,...,...,...
16666,13.475643,52.514265,0,False,False,inactive,,Berlin,Frankfurter Allee 117,DE,Checked 31 minutes ago
16667,13.429812,54.076239,0,False,False,inactive,,Greifswald,Anklamer Landstr. 1,DE,Checked 31 minutes ago
16668,8.787059,53.100934,0,False,False,inactive,,Bremen,Waller Heerstr. 101,DE,Checked 31 minutes ago
16669,11.409059,53.628227,0,False,False,inactive,,Schwerin,Marienplatz 5-7,DE,Checked 31 minutes ago


In [None]:
# untuk toronto kita isi statenya Ontario
df_mcd.loc[df_mcd['city'] == 'Toronto', 'state'] = df_mcd.loc[(
    df_mcd['city'] == 'Toronto', 'state')].fillna('Ontario')

df_mcd.isnull().sum()

lat                0
lon                0
alt                0
is_broken          0
is_active          0
dot                0
state           3901
city               8
street             0
country            0
last_checked       0
dtype: int64

kita gunakan.loc untuk ambil baris2 tertentu di kolom tertentu

```python
df_mcd.loc[df_mcd['city'] == 'Toronto', 'state']
```

disini khusus baris yang city adalah 'Toronto' dan kolom 'state'

setelah itu, kita gunakan .fillna() untuk isi yang value nya missing dengan 'Ontario'

```python
df_mcd.loc[(df_mcd['city'] == 'Toronto', 'state')].fillna('Ontario')
```

terakhir kita assign ke diri sendiri hingga hasil fillna tersimpan


In [99]:
df = pd.DataFrame({
    "name": ["a", "b", "c", "d"],
    "score": [50, np.nan, 80, np.nan]
})

df

Unnamed: 0,name,score
0,a,50.0
1,b,
2,c,80.0
3,d,


In [102]:
df.loc[df['name'] == "b", 'score'] = df.loc[df['name']
                                            == "b", 'score'].fillna(50)

In [103]:
df

Unnamed: 0,name,score
0,a,50.0
1,b,50.0
2,c,80.0
3,d,


### Delete missing values

dropna


In [52]:
df_new = df.dropna()

df_new

Unnamed: 0,name,score
0,a,50.0
2,c,80.0


In [51]:
df

Unnamed: 0,name,score
0,a,50.0
1,b,
2,c,80.0
3,d,


### Tipe data di Pandas DataFrame

- obj (text)
- category (text tapi beberapa jenis saja)
- int, float (numerik)
- datetime (tanggal dan jam)
- timedelta (jumlah waktu)
- bool (True False)


In [53]:
df_academic = pd.read_csv('dataset/academicStress.csv')

df_academic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121 entries, 0 to 120
Data columns (total 9 columns):
 #   Column                                                               Non-Null Count  Dtype 
---  ------                                                               --------------  ----- 
 0   Timestamp                                                            121 non-null    object
 1   Your Academic Stage                                                  121 non-null    object
 2   Peer pressure                                                        121 non-null    int64 
 3   Academic pressure from your home                                     121 non-null    int64 
 4   Study Environment                                                    121 non-null    object
 5   What coping strategy you use as a student?                           121 non-null    object
 6   Do you have any bad habits like smoking, drinking on a daily basis?  121 non-null    object
 7   What would you rat

In [58]:
# contoh, timestamp kita ubah jadi tipe datetime

df_academic['Timestamp'] = df_academic['Timestamp'].astype('datetime64[ns]')

  df_academic['Timestamp'] = df_academic['Timestamp'].astype('datetime64[ns]')


In [59]:
df_academic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121 entries, 0 to 120
Data columns (total 9 columns):
 #   Column                                                               Non-Null Count  Dtype         
---  ------                                                               --------------  -----         
 0   Timestamp                                                            121 non-null    datetime64[ns]
 1   Your Academic Stage                                                  121 non-null    object        
 2   Peer pressure                                                        121 non-null    int64         
 3   Academic pressure from your home                                     121 non-null    int64         
 4   Study Environment                                                    121 non-null    object        
 5   What coping strategy you use as a student?                           121 non-null    object        
 6   Do you have any bad habits like smoking, drinking o

In [63]:
# untuk data yang isinya ada beberapa jenis, kita bisa ubah ke tipe data category
academic_stages = list(df_academic['Your Academic Stage'].unique())
academic_stages

['undergraduate', 'high school', 'post-graduate']

In [65]:
df_academic['Your Academic Stage'] = df_academic['Your Academic Stage'].astype(
    'category')

In [66]:
df_academic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121 entries, 0 to 120
Data columns (total 9 columns):
 #   Column                                                               Non-Null Count  Dtype         
---  ------                                                               --------------  -----         
 0   Timestamp                                                            121 non-null    datetime64[ns]
 1   Your Academic Stage                                                  121 non-null    category      
 2   Peer pressure                                                        121 non-null    int64         
 3   Academic pressure from your home                                     121 non-null    int64         
 4   Study Environment                                                    121 non-null    object        
 5   What coping strategy you use as a student?                           121 non-null    object        
 6   Do you have any bad habits like smoking, drinking o

In [68]:
df_academic['Your Academic Stage'].cat.codes

0      2
1      2
2      2
3      2
4      2
      ..
116    0
117    2
118    0
119    0
120    2
Length: 121, dtype: int8

In [69]:
df

Unnamed: 0,name,score
0,a,50.0
1,b,
2,c,80.0
3,d,


In [70]:
# build new column dengan category

df['grade'] = pd.Categorical(
    ["Pass", "Fail", "Pass", "Pass"], categories=["Pass", "Fail"])

df

Unnamed: 0,name,score,grade
0,a,50.0,Pass
1,b,,Fail
2,c,80.0,Pass
3,d,,Pass


In [71]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   name    4 non-null      object  
 1   score   2 non-null      float64 
 2   grade   4 non-null      category
dtypes: category(1), float64(1), object(1)
memory usage: 324.0+ bytes


### Data tipe text

- hapus spasi berlebih dengan .str.strip()
- ubah case CASE .lower() .upper() .title()
- replace sebagian text .replace()
- split text dari pemisah tertentu .split()


In [72]:
df_obat = pd.read_csv("dataset/realistic_drug_labels_side_effects.csv")

df_obat.head()

Unnamed: 0,drug_name,manufacturer,approval_year,drug_class,indications,side_effects,dosage_mg,administration_route,contraindications,warnings,price_usd,batch_number,expiry_date,side_effect_severity,approval_status
0,Seroxetine50,AstraZeneca,1996,Antidepressant,Allergy relief,"Fatigue, Nausea",260,Rectal,Bleeding disorders,Avoid alcohol,192.43,MV388Pl,2026-11-29,Mild,Pending
1,Mecoparin93,AstraZeneca,2018,Vaccine,Allergy relief,Nausea,470,Inhalation,Allergic reaction,Take with food,397.82,UR279ZN,2027-07-14,Mild,Approved
2,Daxozole89,Merck & Co.,1997,Antipsychotic,Allergy relief,"Diarrhea, Blurred vision, Dizziness",330,Sublingual,High blood pressure,Take with food,131.69,we040kH,2028-06-02,Moderate,Pending
3,Viracillin84,Roche Holding AG,2004,Antifungal,Inflammation reduction,"Fatigue, Dry mouth",450,Oral,Kidney impairment,Do not operate machinery,372.82,hO060rh,2026-07-07,Mild,Rejected
4,Amoxstatin62,Pfizer Inc.,2003,Antidepressant,Psychosis control,"Insomnia, Dry mouth, Fatigue",430,Topical,Bleeding disorders,Do not operate machinery,281.48,Fa621Sw,2027-12-28,Moderate,Pending


In [74]:
df_obat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1436 entries, 0 to 1435
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   drug_name             1436 non-null   object 
 1   manufacturer          1436 non-null   object 
 2   approval_year         1436 non-null   int64  
 3   drug_class            1436 non-null   object 
 4   indications           1436 non-null   object 
 5   side_effects          1436 non-null   object 
 6   dosage_mg             1436 non-null   int64  
 7   administration_route  1436 non-null   object 
 8   contraindications     1436 non-null   object 
 10  price_usd             1436 non-null   float64
 11  batch_number          1436 non-null   object 
 12  expiry_date           1436 non-null   object 
 13  side_effect_severity  1436 non-null   object 
 14  approval_status       1436 non-null   object 
dtypes: float64(1), int64(2), object(12)
memory usage: 168.4+ KB


In [76]:
# hapus spasi berlebih di depan dan belakang text
df_obat['warnings'] = df_obat['warnings'].str.strip()

df_obat['warnings']

0                     Avoid alcohol
1                    Take with food
2                    Take with food
3          Do not operate machinery
4          Do not operate machinery
                   ...             
1431           May affect fertility
1432           May affect fertility
1433       Do not operate machinery
1434           May cause drowsiness
1435    Do not discontinue abruptly

In [77]:
# ubah case, misalnya side effects jadi CAPS
df_obat['side_effects'] = df_obat['side_effects'].str.upper()

df_obat.head()

Unnamed: 0,drug_name,manufacturer,approval_year,drug_class,indications,side_effects,dosage_mg,administration_route,contraindications,warnings,price_usd,batch_number,expiry_date,side_effect_severity,approval_status
0,Seroxetine50,AstraZeneca,1996,Antidepressant,Allergy relief,"FATIGUE, NAUSEA",260,Rectal,Bleeding disorders,Avoid alcohol,192.43,MV388Pl,2026-11-29,Mild,Pending
1,Mecoparin93,AstraZeneca,2018,Vaccine,Allergy relief,NAUSEA,470,Inhalation,Allergic reaction,Take with food,397.82,UR279ZN,2027-07-14,Mild,Approved
2,Daxozole89,Merck & Co.,1997,Antipsychotic,Allergy relief,"DIARRHEA, BLURRED VISION, DIZZINESS",330,Sublingual,High blood pressure,Take with food,131.69,we040kH,2028-06-02,Moderate,Pending
3,Viracillin84,Roche Holding AG,2004,Antifungal,Inflammation reduction,"FATIGUE, DRY MOUTH",450,Oral,Kidney impairment,Do not operate machinery,372.82,hO060rh,2026-07-07,Mild,Rejected
4,Amoxstatin62,Pfizer Inc.,2003,Antidepressant,Psychosis control,"INSOMNIA, DRY MOUTH, FATIGUE",430,Topical,Bleeding disorders,Do not operate machinery,281.48,Fa621Sw,2027-12-28,Moderate,Pending


In [79]:
# kita bisa replace sebagian text dengan .replace()
# misalnya, "Anti" kita ubah jadi "A*"

df_obat['drug_class'] = df_obat['drug_class'].str.replace("Anti", "A*")

df_obat.head()

Unnamed: 0,drug_name,manufacturer,approval_year,drug_class,indications,side_effects,dosage_mg,administration_route,contraindications,warnings,price_usd,batch_number,expiry_date,side_effect_severity,approval_status
0,Seroxetine50,AstraZeneca,1996,A*depressant,Allergy relief,"FATIGUE, NAUSEA",260,Rectal,Bleeding disorders,Avoid alcohol,192.43,MV388Pl,2026-11-29,Mild,Pending
1,Mecoparin93,AstraZeneca,2018,Vaccine,Allergy relief,NAUSEA,470,Inhalation,Allergic reaction,Take with food,397.82,UR279ZN,2027-07-14,Mild,Approved
2,Daxozole89,Merck & Co.,1997,A*psychotic,Allergy relief,"DIARRHEA, BLURRED VISION, DIZZINESS",330,Sublingual,High blood pressure,Take with food,131.69,we040kH,2028-06-02,Moderate,Pending
3,Viracillin84,Roche Holding AG,2004,A*fungal,Inflammation reduction,"FATIGUE, DRY MOUTH",450,Oral,Kidney impairment,Do not operate machinery,372.82,hO060rh,2026-07-07,Mild,Rejected
4,Amoxstatin62,Pfizer Inc.,2003,A*depressant,Psychosis control,"INSOMNIA, DRY MOUTH, FATIGUE",430,Topical,Bleeding disorders,Do not operate machinery,281.48,Fa621Sw,2027-12-28,Moderate,Pending


In [82]:
# kita coba split()
# kita pisahkan data side_effects menjadi satu satu

df_obat['side_effects'].str.split(", ")

0                           [FATIGUE, NAUSEA]
1                                    [NAUSEA]
2       [DIARRHEA, BLURRED VISION, DIZZINESS]
3                        [FATIGUE, DRY MOUTH]
4              [INSOMNIA, DRY MOUTH, FATIGUE]
                        ...                  
1431                              [DRY MOUTH]
1432              [NAUSEA, FATIGUE, HEADACHE]
1433                    [INSOMNIA, DIZZINESS]
1434                               [HEADACHE]
1435           [NAUSEA, DRY MOUTH, DIZZINESS]
Name: side_effects, Length: 1436, dtype: object

### Datetime

selain untuk merapikan tipe data jadi tanggal dan waktu, kita bisa:

- extract component tanggal dan waktu
- ubah format tanggal dan waktu


In [83]:
df_academic.head()

Unnamed: 0,Timestamp,Your Academic Stage,Peer pressure,Academic pressure from your home,Study Environment,What coping strategy you use as a student?,"Do you have any bad habits like smoking, drinking on a daily basis?",What would you rate the academic competition in your student life,Rate your academic stress index
0,2025-07-24 22:05:39,undergraduate,4,5,Noisy,Analyze the situation and handle it with intel...,No,3,5
1,2025-07-24 22:05:52,undergraduate,3,4,Peaceful,Analyze the situation and handle it with intel...,No,3,3
2,2025-07-24 22:06:39,undergraduate,1,1,Peaceful,"Social support (friends, family)",No,2,4
3,2025-07-24 22:06:45,undergraduate,3,2,Peaceful,Analyze the situation and handle it with intel...,No,4,3
4,2025-07-24 22:08:06,undergraduate,3,3,Peaceful,Analyze the situation and handle it with intel...,No,4,5


In [84]:
# extract tahun ke kolom baru
df_academic['year'] = df_academic['Timestamp'].dt.year

df_academic.head()

Unnamed: 0,Timestamp,Your Academic Stage,Peer pressure,Academic pressure from your home,Study Environment,What coping strategy you use as a student?,"Do you have any bad habits like smoking, drinking on a daily basis?",What would you rate the academic competition in your student life,Rate your academic stress index,year
0,2025-07-24 22:05:39,undergraduate,4,5,Noisy,Analyze the situation and handle it with intel...,No,3,5,2025
1,2025-07-24 22:05:52,undergraduate,3,4,Peaceful,Analyze the situation and handle it with intel...,No,3,3,2025
2,2025-07-24 22:06:39,undergraduate,1,1,Peaceful,"Social support (friends, family)",No,2,4,2025
3,2025-07-24 22:06:45,undergraduate,3,2,Peaceful,Analyze the situation and handle it with intel...,No,4,3,2025
4,2025-07-24 22:08:06,undergraduate,3,3,Peaceful,Analyze the situation and handle it with intel...,No,4,5,2025


In [91]:
df_academic['year'] = df_academic['Timestamp'].dt.year
df_academic['month'] = df_academic['Timestamp'].dt.month
df_academic['day'] = df_academic['Timestamp'].dt.day  # tanggal

# 0 - sunday, 1 - monday, ...
df_academic['weekday'] = df_academic['Timestamp'].dt.weekday
df_academic['hour'] = df_academic['Timestamp'].dt.hour

df_academic.head(3)

Unnamed: 0,Timestamp,Your Academic Stage,Peer pressure,Academic pressure from your home,Study Environment,What coping strategy you use as a student?,"Do you have any bad habits like smoking, drinking on a daily basis?",What would you rate the academic competition in your student life,Rate your academic stress index,year,month,date,day,hour,weekday
0,2025-07-24 22:05:39,undergraduate,4,5,Noisy,Analyze the situation and handle it with intel...,No,3,5,2025,7,2025-07-24,24,22,3
1,2025-07-24 22:05:52,undergraduate,3,4,Peaceful,Analyze the situation and handle it with intel...,No,3,3,2025,7,2025-07-24,24,22,3
2,2025-07-24 22:06:39,undergraduate,1,1,Peaceful,"Social support (friends, family)",No,2,4,2025,7,2025-07-24,24,22,3


In [93]:
df_academic['Timestamp'].dt.strftime("%A, %d-%b-%y")

0       Thursday, 24-Jul-25
1       Thursday, 24-Jul-25
2       Thursday, 24-Jul-25
3       Thursday, 24-Jul-25
4       Thursday, 24-Jul-25
               ...         
116     Saturday, 26-Jul-25
117       Sunday, 27-Jul-25
118       Sunday, 27-Jul-25
119    Wednesday, 30-Jul-25
120    Wednesday, 30-Jul-25
Name: Timestamp, Length: 121, dtype: object

tipe data datetime juga memungkinkan kita filter data menggunakan komponen waktu

misalnya, ambil data khusus hari kamis


In [95]:
kon_kamis = df_academic['Timestamp'].dt.weekday == 4

df_academic[kon_kamis].info()

<class 'pandas.core.frame.DataFrame'>
Index: 27 entries, 65 to 91
Data columns (total 15 columns):
 #   Column                                                               Non-Null Count  Dtype         
---  ------                                                               --------------  -----         
 0   Timestamp                                                            27 non-null     datetime64[ns]
 1   Your Academic Stage                                                  27 non-null     category      
 2   Peer pressure                                                        27 non-null     int64         
 3   Academic pressure from your home                                     27 non-null     int64         
 4   Study Environment                                                    27 non-null     object        
 5   What coping strategy you use as a student?                           27 non-null     object        
 6   Do you have any bad habits like smoking, drinking on a d

## Latihan

1. Cek duplicate values
2. Handle Missing values (di drop atau di fill)
3. Merapikan tipe data
4. Mencoba mem-format tanggal

Untuk dataset movies, kita coba step 1-3 https://www.kaggle.com/datasets/bharatnatrayn/movies-dataset-for-feature-extracion-prediction?select=movies.csv

kita rapikan datanya dengan tujuan menaikkan kualitas dataset ini
