# Proyek Mini Modul 2: Analisis Data Penjualan Ritel

In [2]:
import pandas as pd

In [3]:
# Step 1: Load the data
# Source: https://www.kaggle.com/datasets/rohitsahoo/sales-forecasting
# The CSV is looked up relative to the notebook's working directory instead of
# one machine's absolute path, so the notebook can run on any checkout. This
# also matches the hint printed below when the file cannot be found.
DATA_PATH = 'Dataset Mini Project Modul 2.csv'

try:
    df = pd.read_csv(DATA_PATH, encoding='latin1')
except FileNotFoundError:
    # Best-effort fallback: report the problem and keep `df` defined (empty).
    print("File tidak ditemukan. Pastikan 'Dataset Mini Project Modul 2.csv' ada di folder yang sama.")
    df = pd.DataFrame()
except Exception as e:
    # Any other load failure (parsing, decoding, ...) is reported the same way.
    print(f"Gagal memuat data. Error: {e}")
    df = pd.DataFrame()

In [4]:
# Step 2: Initial data inspection
# Each preview is printed under its banner and followed by a blank spacer:
# first the five leading rows, then the describe() summary statistics.
# The bound methods are called inside the loop so each preview is only
# computed right before it is printed.
for banner, make_preview in (
    ("--- 5 Baris Data Pertama ---", df.head),
    ("--- Statistik Deskriptif (untuk kolom numerik) ---", df.describe),
):
    print(banner)
    print(make_preview())
    print("\n")

--- 5 Baris Data Pertama ---
   Row ID        Order ID  Order Date   Ship Date       Ship Mode Customer ID  \
0       1  CA-2017-152156  08/11/2017  11/11/2017    Second Class    CG-12520   
1       2  CA-2017-152156  08/11/2017  11/11/2017    Second Class    CG-12520   
2       3  CA-2017-138688  12/06/2017  16/06/2017    Second Class    DV-13045   
3       4  US-2016-108966  11/10/2016  18/10/2016  Standard Class    SO-20335   
4       5  US-2016-108966  11/10/2016  18/10/2016  Standard Class    SO-20335   

     Customer Name    Segment        Country             City       State  \
0      Claire Gute   Consumer  United States        Henderson    Kentucky   
1      Claire Gute   Consumer  United States        Henderson    Kentucky   
2  Darrin Van Huff  Corporate  United States      Los Angeles  California   
3   Sean O'Donnell   Consumer  United States  Fort Lauderdale     Florida   
4   Sean O'Donnell   Consumer  United States  Fort Lauderdale     Florida   

   Postal Code Region

In [5]:
# Step 3: Data cleaning
# Report how many missing values each column has before cleaning.
print("--- Jumlah Missing Values per Kolom ---")
missing_per_column = df.isna().sum()
print(missing_per_column)
print("\n")

# Drop every row that has a missing value in any column, then print the
# per-column counts again to confirm nothing is left.
df = df.dropna(axis=0, how='any')
print(df.isna().sum())

--- Jumlah Missing Values per Kolom ---
Row ID            0
Order ID          0
Order Date        0
Ship Date         0
Ship Mode         0
Customer ID       0
Customer Name     0
Segment           0
Country           0
City              0
State             0
Postal Code      11
Region            0
Product ID        0
Category          0
Sub-Category      0
Product Name      0
Sales             0
dtype: int64


Row ID           0
Order ID         0
Order Date       0
Ship Date        0
Ship Mode        0
Customer ID      0
Customer Name    0
Segment          0
Country          0
City             0
State            0
Postal Code      0
Region           0
Product ID       0
Category         0
Sub-Category     0
Product Name     0
Sales            0
dtype: int64


In [6]:
# Duplicate rows: count the rows flagged by DataFrame.duplicated() and report
# the total. This cell only reports the count; it does not remove any rows.
duplicate_count = df.duplicated().sum()
print(f"\n--- Jumlah Baris Duplikat: {duplicate_count} ---")


--- Jumlah Baris Duplikat: 0 ---


In [7]:
# Convert data types
# 'Order Date' is loaded as text (object); parse it into datetime. The dates
# are written day-first (e.g. 16/06/2017), hence dayfirst=True.
df['Order Date'] = pd.to_datetime(df['Order Date'], dayfirst=True)

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9789 entries, 0 to 9799
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Row ID         9789 non-null   int64         
 1   Order ID       9789 non-null   object        
 2   Order Date     9789 non-null   datetime64[ns]
 3   Ship Date      9789 non-null   object        
 4   Ship Mode      9789 non-null   object        
 5   Customer ID    9789 non-null   object        
 6   Customer Name  9789 non-null   object        
 7   Segment        9789 non-null   object        
 8   Country        9789 non-null   object        
 9   City           9789 non-null   object        
 10  State          9789 non-null   object        
 11  Postal Code    9789 non-null   float64       
 12  Region         9789 non-null   object        
 13  Product ID     9789 non-null   object        
 14  Category       9789 non-null   object        
 15  Sub-Category   9789 non-nu

In [8]:
# Step 4: Analysis

# 1. Which product category ('Category') has the highest total sales ('Sales')?
# Sum Sales within each category, then order the totals from largest to smallest.
sales_per_category = (
    df.groupby('Category')['Sales']
    .agg('sum')
    .sort_values(ascending=False)
)
print("\n--- Total Penjualan per Kategori Produk ---")
print(sales_per_category)


--- Total Penjualan per Kategori Produk ---
Category
Technology         825856.1130
Furniture          723538.4757
Office Supplies    703212.8240
Name: Sales, dtype: float64


In [13]:
# 2. Which region has the highest total sales ('Sales')?
# Sum Sales within each region, largest total first.
sales_per_region = df.groupby('Region')['Sales'].sum().sort_values(ascending=False)
# The figures are sales totals, so the banner says "Penjualan" (sales). It
# previously said "Keuntungan" (profit), which is not what is summed here.
print("\n--- Total Penjualan per Region ---")
print(sales_per_region)


--- Total Keuntungan per Region ---
Region
West       710219.6845
East       660589.3560
Central    492646.9132
South      389151.4590
Name: Sales, dtype: float64


In [10]:
# 3. How do sales ('Sales') develop from year to year?
# The yearly breakdown needs 'Order Date' to exist and to hold datetimes.
# The column check comes first so the dtype test never touches a missing column.
order_date_is_datetime = (
    'Order Date' in df.columns
    and pd.api.types.is_datetime64_any_dtype(df['Order Date'])
)

if not order_date_is_datetime:
    print("\nKolom 'Order Date' belum diubah menjadi datetime. Lewati analisis tren tahunan.")
else:
    # Store the order year as a new 'Year' column, then total Sales per year.
    df['Year'] = df['Order Date'].dt.year
    sales_per_year = df.groupby('Year')['Sales'].sum()
    print("\n--- Tren Penjualan Tahunan ---")
    print(sales_per_year)

print("\n\n--- Analisis Selesai ---")


--- Tren Penjualan Tahunan ---
Year
2015    479856.2081
2016    454315.9054
2017    597225.4900
2018    721209.8092
Name: Sales, dtype: float64


--- Analisis Selesai ---


Tambahan

In [None]:
# Output locations, all relative to the notebook's working directory.
cleaned_csv = '(Cleaned)Dataset Mini Project Modul 2.csv'
category_csv = 'hasil_sales_per_category.csv'
region_csv = 'hasil_sales_per_region.csv'
year_csv = 'hasil_sales_per_year.csv'

# 1. Save the cleaned DataFrame, leaving the row index out of the file.
df.to_csv(cleaned_csv, index=False)

# 2. Save the aggregated results that will be used for visualization.
#    These keep their index, which holds the group labels.
sales_per_category.to_csv(category_csv)
sales_per_region.to_csv(region_csv)
sales_per_year.to_csv(year_csv)

print("\n--- Hasil analisis berhasil disimpan ke file CSV! ---")


--- Hasil analisis berhasil disimpan ke file CSV! ---
