In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
#import matplotlib.pyplot as plt


# Build a small synthetic transactions dataset. It contains negative, zero and
# extremely large purchase amounts as well as missing (NaN) entries, which the
# later cells clean up.
data = {
    'TransactionID': range(1, 21),
    # Customers 101-110, each appearing twice in the same order.
    'CustomerID': list(range(101, 111)) * 2,
    'PurchaseAmount': [
        250, -50, 3000000, 450, 0, 300, 200, 150, -10, 800,
        50, 75, 400, np.nan, 600, 1000, 20, 5000, 150, 80,
    ],
    # Twenty consecutive calendar days starting on 2024-01-01.
    'PurchaseDate': pd.date_range(start='2024-01-01', periods=20).tolist(),
    'ProductCategory': [
        'Electronics', 'Clothing', 'Electronics', 'Home', 'Electronics',
        'Home', 'Clothing', 'Home', 'Clothing', 'Electronics',
        'Electronics', 'Home', 'Clothing', 'Electronics', 'Home',
        'Home', 'Clothing', 'Electronics', 'Home', 'Electronics',
    ],
    'CustomerAge': [
        25, 35, 45, np.nan, 22, 29, 33, 41, 27, 36,
        28, 34, 42, 39, 24, 30, 32, 40, 38, 26,
    ],
    'CustomerGender': [
        'Male', 'Female', 'Female', 'Male', 'Female',
        'Male', 'Female', np.nan, 'Male', 'Female',
        'Male', 'Female', 'Male', 'Female', 'Male',
        'Female', 'Male', 'Female', 'Male', 'Female',
    ],
    'ReviewScore': [
        5, np.nan, 4, 3, 2, 5, 3, 4, 1, 2,
        np.nan, 4, 5, 3, 4, np.nan, 1, 5, 2, 4,
    ],
}

# Materialise the dict as a DataFrame and display it as the cell output.
df = pd.DataFrame(data)
df

Unnamed: 0,TransactionID,CustomerID,PurchaseAmount,PurchaseDate,ProductCategory,CustomerAge,CustomerGender,ReviewScore
0,1,101,250.0,2024-01-01,Electronics,25.0,Male,5.0
1,2,102,-50.0,2024-01-02,Clothing,35.0,Female,
2,3,103,3000000.0,2024-01-03,Electronics,45.0,Female,4.0
3,4,104,450.0,2024-01-04,Home,,Male,3.0
4,5,105,0.0,2024-01-05,Electronics,22.0,Female,2.0
5,6,106,300.0,2024-01-06,Home,29.0,Male,5.0
6,7,107,200.0,2024-01-07,Clothing,33.0,Female,3.0
7,8,108,150.0,2024-01-08,Home,41.0,,4.0
8,9,109,-10.0,2024-01-09,Clothing,27.0,Male,1.0
9,10,110,800.0,2024-01-10,Electronics,36.0,Female,2.0


In [14]:
# Count the missing values in each column.
df.isnull().sum()

TransactionID      0
CustomerID         0
PurchaseAmount     1
PurchaseDate       0
ProductCategory    0
CustomerAge        1
CustomerGender     1
ReviewScore        3
dtype: int64

In [15]:
# Drop every row that has at least one missing value. In this dataset the gaps
# are in PurchaseAmount, CustomerAge, CustomerGender and ReviewScore.
df = df.dropna(axis=0, how='any')
print(df)

    TransactionID  CustomerID  PurchaseAmount PurchaseDate ProductCategory  \
0               1         101           250.0   2024-01-01     Electronics   
2               3         103       3000000.0   2024-01-03     Electronics   
4               5         105             0.0   2024-01-05     Electronics   
5               6         106           300.0   2024-01-06            Home   
6               7         107           200.0   2024-01-07        Clothing   
8               9         109           -10.0   2024-01-09        Clothing   
9              10         110           800.0   2024-01-10     Electronics   
11             12         102            75.0   2024-01-12            Home   
12             13         103           400.0   2024-01-13        Clothing   
14             15         105           600.0   2024-01-15            Home   
16             17         107            20.0   2024-01-17        Clothing   
17             18         108          5000.0   2024-01-18     E

In [18]:
# Exclusive upper bound: purchase amounts at or above this value are treated
# as abnormal outliers and discarded.
MAX_PURCHASE_AMOUNT = 1000000

# Keep rows whose PurchaseAmount is non-negative and below the outlier bound,
# then drop rows that repeat a TransactionID.
# The explicit .copy() makes the result an independent DataFrame, so the
# column assignments below operate on a real frame rather than on a slice of
# the previous one (avoids pandas' SettingWithCopyWarning).
amount = df['PurchaseAmount']
df = (
    df[(amount >= 0) & (amount < MAX_PURCHASE_AMOUNT)]
    .drop_duplicates(subset='TransactionID')
    .copy()
)

# Convert the PurchaseDate column to datetime dtype.
df['PurchaseDate'] = pd.to_datetime(df['PurchaseDate'])

# Min-max normalise PurchaseAmount.
scaler = MinMaxScaler()
df['PurchaseAmount'] = scaler.fit_transform(df[['PurchaseAmount']])

# Label-encode ProductCategory and CustomerGender.
# Each column gets its own encoder: re-fitting a single LabelEncoder would
# overwrite the ProductCategory mapping, making it impossible to decode later.
category_encoder = LabelEncoder()
df['ProductCategory'] = category_encoder.fit_transform(df['ProductCategory'])
gender_encoder = LabelEncoder()
df['CustomerGender'] = gender_encoder.fit_transform(df['CustomerGender'])
# Backward-compatible alias: `encoder` previously ended up fitted on
# CustomerGender, so keep that binding for any later cell that uses it.
encoder = gender_encoder

# Draw a reproducible random sample of 5 rows.
sample_df = df.sample(n=5, random_state=42)
df

Unnamed: 0,TransactionID,CustomerID,PurchaseAmount,PurchaseDate,ProductCategory,CustomerAge,CustomerGender,ReviewScore
0,1,101,0.05,2024-01-01,1,25.0,1,5.0
4,5,105,0.0,2024-01-05,1,22.0,0,2.0
5,6,106,0.06,2024-01-06,2,29.0,1,5.0
6,7,107,0.04,2024-01-07,0,33.0,0,3.0
9,10,110,0.16,2024-01-10,1,36.0,0,2.0
11,12,102,0.015,2024-01-12,2,34.0,0,4.0
12,13,103,0.08,2024-01-13,0,42.0,1,5.0
14,15,105,0.12,2024-01-15,2,24.0,1,4.0
16,17,107,0.004,2024-01-17,0,32.0,1,1.0
17,18,108,1.0,2024-01-18,1,40.0,0,5.0
