In [3]:
import pandas as pd
import numpy as np

In [8]:
# 1. Load the data
# Read the raw CSV (path is relative to the working directory) into `df`,
# then preview the first five rows.
df = pd.read_csv(filepath_or_buffer='Walmart.csv')
df.head(n=5)

Unnamed: 0,invoice_id,Branch,City,category,unit_price,quantity,date,time,payment_method,rating,profit_margin
0,1,WALM003,San Antonio,Health and beauty,$74.69,7.0,05/01/19,13:08:00,Ewallet,9.1,0.48
1,2,WALM048,Harlingen,Electronic accessories,$15.28,5.0,08/03/19,10:29:00,Cash,9.6,0.48
2,3,WALM067,Haltom City,Home and lifestyle,$46.33,7.0,03/03/19,13:23:00,Credit card,7.4,0.33
3,4,WALM064,Bedford,Health and beauty,$58.22,8.0,27/01/19,20:33:00,Ewallet,8.4,0.33
4,5,WALM013,Irving,Sports and travel,$86.31,7.0,08/02/19,10:37:00,Ewallet,5.3,0.48


In [7]:
# 2. Initial data inspection
print("=== Informasi Data ===")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print("\n=== Statistik Deskriptif ===")
# include='all' summarizes the non-numeric columns as well as the numeric ones.
print(df.describe(include='all'))
print("\n=== Nilai Kosong ===")
# Number of missing values in each column.
print(df.isnull().sum())

=== Informasi Data ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10051 entries, 0 to 10050
Data columns (total 11 columns):
invoice_id        10051 non-null int64
Branch            10051 non-null object
City              10051 non-null object
category          10051 non-null object
unit_price        10020 non-null object
quantity          10020 non-null float64
date              10051 non-null object
time              10051 non-null object
payment_method    10051 non-null object
rating            10051 non-null float64
profit_margin     10051 non-null float64
dtypes: float64(3), int64(1), object(7)
memory usage: 589.0+ KB
None

=== Statistik Deskriptif ===
          invoice_id   Branch     City             category unit_price  \
count   10051.000000    10051    10051                10051      10020   
unique           NaN      100       98                    6       1008   
top              NaN  WALM058  Weslaco  Fashion accessories        $63   
freq             NaN      240  

In [10]:
# 3. Clean the data
# Discard, in place, every row that has a missing value in any column.
df.dropna(axis=0, how='any', inplace=True)
# Per-column count of missing values still present after the drop.
df.isnull().sum()

invoice_id        0
Branch            0
City              0
category          0
unit_price        0
quantity          0
date              0
time              0
payment_method    0
rating            0
profit_margin     0
dtype: int64

In [12]:
# Clean the unit_price column: remove the literal '$' characters and cast the
# remaining text to float.
# The conversion is skipped when the column is already numeric, so re-running
# this cell does not fail on the .str accessor (which only works on text).
if not pd.api.types.is_numeric_dtype(df['unit_price']):
    df['unit_price'] = (
        df['unit_price']
        .str.replace('$', '', regex=False)  # regex=False: '$' is a literal, not an end-of-string anchor
        .astype(float)
    )

In [16]:
# Build a single timestamp column from the separate date and time strings.
# The two texts are joined with one space and parsed with an explicit format:
# day/month/two-digit year, then 24-hour time.
df['datetime'] = pd.to_datetime(
    df['date'].str.cat(df['time'], sep=' '),
    format='%d/%m/%y %H:%M:%S',
)

In [18]:
# Store the branch, city, category and payment-method columns as pandas
# categoricals. The names use the file's original capitalisation because the
# column labels have not been lowercased yet at this point.
categorical_cols = ['Branch', 'City', 'category', 'payment_method']
for column_name in categorical_cols:
    # Converted one column at a time, so each column has its own categories.
    df[column_name] = df[column_name].astype('category')

In [19]:
# Keep invoice_id as text (an identifier) instead of an integer.
df['invoice_id'] = df['invoice_id'].astype('str')

In [24]:
# 4. Handle duplicates
# Report how many rows are exact repeats of an earlier row, then remove them
# in place, keeping the first occurrence of each.
duplicate_rows = df.duplicated().sum()
print("\nJumlah duplikat sebelumnya : ", duplicate_rows)
df.drop_duplicates(keep='first', inplace=True)

In [26]:
# 5. Data transformation
# Add a total_sales column: number of units multiplied by the price per unit,
# computed row by row.
df['total_sales'] = df['quantity'] * df['unit_price']

In [27]:
# Normalize every column label to lowercase, then display the resulting labels.
df.columns = [label.lower() for label in df.columns]
df.columns

Index(['invoice_id', 'branch', 'city', 'category', 'unit_price', 'quantity',
       'date', 'time', 'payment_method', 'rating', 'profit_margin', 'datetime',
       'total_sales'],
      dtype='object')

In [28]:
# Derive time-based features from the combined timestamp column.
timestamp_parts = df['datetime'].dt
df['hour'] = timestamp_parts.hour                # hour of day as an integer
df['day_of_week'] = timestamp_parts.day_name()   # weekday name, e.g. 'Saturday'
df['month'] = timestamp_parts.month_name()       # month name, e.g. 'January'
df['year'] = timestamp_parts.year                # four-digit year

In [29]:
# 6. Data validation
# Neither column may hold a negative value: the cell stops with an
# AssertionError as soon as one column has any entry failing `>= 0`.
for checked_column, failure_message in (
    ('quantity', "Ada quantity negatif"),
    ('unit_price', "Ada unit_price negatif"),
):
    assert (df[checked_column] >= 0).all(), failure_message

In [30]:
# Preview the first five rows of the cleaned and enriched frame.
df.head(n=5)

Unnamed: 0,invoice_id,branch,city,category,unit_price,quantity,date,time,payment_method,rating,profit_margin,datetime,total_sales,hour,day_of_week,month,year
0,1,WALM003,San Antonio,Health and beauty,74.69,7.0,05/01/19,13:08:00,Ewallet,9.1,0.48,2019-01-05 13:08:00,522.83,13,Saturday,January,2019
1,2,WALM048,Harlingen,Electronic accessories,15.28,5.0,08/03/19,10:29:00,Cash,9.6,0.48,2019-03-08 10:29:00,76.4,10,Friday,March,2019
2,3,WALM067,Haltom City,Home and lifestyle,46.33,7.0,03/03/19,13:23:00,Credit card,7.4,0.33,2019-03-03 13:23:00,324.31,13,Sunday,March,2019
3,4,WALM064,Bedford,Health and beauty,58.22,8.0,27/01/19,20:33:00,Ewallet,8.4,0.33,2019-01-27 20:33:00,465.76,20,Sunday,January,2019
4,5,WALM013,Irving,Sports and travel,86.31,7.0,08/02/19,10:37:00,Ewallet,5.3,0.48,2019-02-08 10:37:00,604.17,10,Friday,February,2019


In [31]:
# 7. Save the cleaned data
# Write the frame to a CSV in the working directory; index=False leaves the
# row index out of the file.
df.to_csv(
    'walmart_clean_data.csv',
    index=False,
)