# **1.Import Library**

In [4]:
import io

import pandas as pd
from google.colab import files

# 2.Upload Dataset ke Colab

In [5]:
# Ask the user to pick the dataset file through google.colab's upload helper;
# the return value is kept in `uploaded`.
# NOTE(review): in the recorded run the upload was saved as
# "retail_sales_dataset (1).csv", i.e. a file with the original name already
# existed in the working directory.
uploaded = files.upload()

Saving retail_sales_dataset.csv to retail_sales_dataset (1).csv


# 3.Membaca Dataset

In [6]:
# Load the dataset from the bytes returned by the upload step rather than from
# a hard-coded path: when a file with the same name already exists, Colab saves
# the new upload under a different name (e.g. "retail_sales_dataset (1).csv"),
# so reading "retail_sales_dataset.csv" from disk would pick up the older file.
DATASET_FILENAME = "retail_sales_dataset.csv"
if uploaded:
    # Prefer the entry keyed by the expected name; otherwise use the first upload.
    csv_bytes = uploaded.get(DATASET_FILENAME)
    if csv_bytes is None:
        csv_bytes = next(iter(uploaded.values()))
    df = pd.read_csv(io.BytesIO(csv_bytes))
else:
    # Nothing was uploaded in this session: fall back to the file on disk.
    df = pd.read_csv(DATASET_FILENAME)

# Show the first rows as a quick sanity check of the parsed columns.
print("Data Awal:")
print(df.head())

Data Awal:
   Transaction ID        Date Customer ID  Gender  Age Product Category  \
0               1  2023-11-24     CUST001    Male   34           Beauty   
1               2  2023-02-27     CUST002  Female   26         Clothing   
2               3  2023-01-13     CUST003    Male   50      Electronics   
3               4  2023-05-21     CUST004    Male   37         Clothing   
4               5  2023-05-06     CUST005    Male   30           Beauty   

   Quantity  Price per Unit  Total Amount  
0         3              50           150  
1         2             500          1000  
2         1              30            30  
3         1             500           500  
4         2              50           100  


# 4.Mengecek Informasi Dataset

In [7]:
# Column dtypes and non-null counts.
print("\nInfo Dataset:")
# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print() (that would add a stray "None" line to the output).
df.info()

# Number of missing values per column.
print("\nMissing Values:")
print(df.isnull().sum())


Info Dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Transaction ID    1000 non-null   int64 
 1   Date              1000 non-null   object
 2   Customer ID       1000 non-null   object
 3   Gender            1000 non-null   object
 4   Age               1000 non-null   int64 
 5   Product Category  1000 non-null   object
 6   Quantity          1000 non-null   int64 
 7   Price per Unit    1000 non-null   int64 
 8   Total Amount      1000 non-null   int64 
dtypes: int64(5), object(4)
memory usage: 70.4+ KB
None

Missing Values:
Transaction ID      0
Date                0
Customer ID         0
Gender              0
Age                 0
Product Category    0
Quantity            0
Price per Unit      0
Total Amount        0
dtype: int64


# 5.Proses Data Cleansing

In [8]:
# Convert the Date column to datetime; values that cannot be parsed become NaT
# because of errors='coerce'.
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# Columns that must not contain negative values.
numeric_cols = ['Age', 'Quantity', 'Price per Unit', 'Total Amount']

# Remove duplicate rows, then keep only the rows where every numeric column is
# non-negative (single vectorized mask instead of one filter pass per column).
deduplicated = df.drop_duplicates()
non_negative_rows = (deduplicated[numeric_cols] >= 0).all(axis=1)

# Take an explicit copy: the column assignments below would otherwise be made
# on a filtered slice of another frame and trigger SettingWithCopyWarning as
# soon as the filter actually drops a row.
df_cleaned = deduplicated[non_negative_rows].copy()

# Standardize Gender capitalization (first letter upper-case, rest lower-case).
df_cleaned['Gender'] = df_cleaned['Gender'].str.capitalize()

# Validate Total Amount: recompute it from Quantity * Price per Unit and flag
# whether the stored value matches.
df_cleaned['Calculated_Total'] = df_cleaned['Quantity'] * df_cleaned['Price per Unit']
df_cleaned['Total_Correct'] = df_cleaned['Calculated_Total'] == df_cleaned['Total Amount']


# 6.Mengecek Data yang Tidak Konsisten

In [9]:
# Select the rows whose stored Total Amount did not match the recomputed total
# (Total_Correct is False) and report how many there are.
mismatch_mask = ~df_cleaned['Total_Correct']
inconsistent_data = df_cleaned.loc[mismatch_mask]
print("Jumlah data tidak konsisten:", inconsistent_data.shape[0])

Jumlah data tidak konsisten: 0


# 7.Simpan Dataset yang Sudah Dibersihkan

In [10]:
# Write the cleaned frame to CSV without the index column.
# NOTE(review): at this point df_cleaned still carries the helper columns
# Calculated_Total and Total_Correct added during validation, so they are
# written to the file as well — confirm that is intended.
df_cleaned.to_csv("retail_sales_cleaned.csv", index=False)


# 8.Download File Hasil Cleansing

In [11]:
# Send the saved CSV to the user's browser via google.colab's download helper.
files.download("retail_sales_cleaned.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# 9.Enrichment

In [12]:
# Enrichment: derive an Age Group bracket from Age.
# With right=False the intervals are left-closed / right-open:
# [0, 18), [18, 30), [30, 45), [45, 60), [60, 100) — an age of 100 or more
# falls outside the last bin and is left as NaN.
bins = [0, 18, 30, 45, 60, 100]
labels = ['Teen', 'Young Adult', 'Adult', 'Middle Age', 'Senior']
age_group = pd.cut(
    df_cleaned['Age'],
    bins=bins,
    labels=labels,
    right=False,
)
df_cleaned['Age Group'] = age_group

# Preview the first ten customers with their assigned bracket.
preview_columns = ['Customer ID', 'Age', 'Age Group']
print("\nContoh data dengan Age Group:")
print(df_cleaned.loc[:, preview_columns].head(10))



Contoh data dengan Age Group:
  Customer ID  Age    Age Group
0     CUST001   34        Adult
1     CUST002   26  Young Adult
2     CUST003   50   Middle Age
3     CUST004   37        Adult
4     CUST005   30        Adult
5     CUST006   45   Middle Age
6     CUST007   46   Middle Age
7     CUST008   30        Adult
8     CUST009   63       Senior
9     CUST010   52   Middle Age
