## Membaca file dengan menggunakan pandas

In [4]:
import pandas as pd

# Location of the shopping dataset (CSV hosted by DQLab).
shopping_csv_url = "https://dqlab-dataset.s3-ap-southeast-1.amazonaws.com/shopping_data.csv"

# Load the CSV into a DataFrame and print it.
csv_data = pd.read_csv(shopping_csv_url)
print(csv_data)

     CustomerID   Genre  Age  Annual Income (k$)  Spending Score (1-100)
0             1    Male   19                  15                      39
1             2    Male   21                  15                      81
2             3  Female   20                  16                       6
3             4  Female   23                  16                      77
4             5  Female   31                  17                      40
..          ...     ...  ...                 ...                     ...
195         196  Female   35                 120                      79
196         197  Female   45                 126                      28
197         198    Male   32                 126                      74
198         199    Male   32                 137                      18
199         200    Male   30                 137                      83

[200 rows x 5 columns]


## Menampilkan lima baris pertama data dengan head()

In [6]:
# Preview only the first five rows (the default for head()) instead of the whole table.
first_rows = csv_data.head()
print(first_rows)

   CustomerID   Genre  Age  Annual Income (k$)  Spending Score (1-100)
0           1    Male   19                  15                      39
1           2    Male   21                  15                      81
2           3  Female   20                  16                       6
3           4  Female   23                  16                      77
4           5  Female   31                  17                      40


## Melakukan akses data kolom

In [10]:
# List the column labels of the DataFrame.
column_labels = csv_data.columns
print(column_labels)

Index(['CustomerID', 'Genre', 'Age', 'Annual Income (k$)',
       'Spending Score (1-100)'],
      dtype='object')


In [13]:
# Select a single column by its label; the result is a pandas Series.
age_column = csv_data['Age']
print(age_column)

0      19
1      21
2      20
3      23
4      31
       ..
195    35
196    45
197    32
198    32
199    30
Name: Age, Length: 200, dtype: int64


## Melakukan akses data melalui baris

In [14]:
# Fetch one row by integer position; iloc is zero-based, so 5 is the sixth row.
sixth_row = csv_data.iloc[5]
print(sixth_row)

CustomerID                     6
Genre                     Female
Age                           22
Annual Income (k$)            17
Spending Score (1-100)        76
Name: 5, dtype: object


## Menampilkan suatu data dari baris dan kolom tertentu

In [15]:
# Read one value: column 'Age' at positional index 1, i.e. the SECOND row
# (iloc is zero-based, so index 0 would be the first row).
second_row_age = csv_data['Age'].iloc[1]
print(second_row_age)

# Show the first rows of the dataset for comparison.
print("Cuplikan Dataset")
dataset_preview = csv_data.head()
print(dataset_preview)

21
Cuplikan Dataset
   CustomerID   Genre  Age  Annual Income (k$)  Spending Score (1-100)
0           1    Male   19                  15                      39
1           2    Male   21                  15                      81
2           3  Female   20                  16                       6
3           4  Female   23                  16                      77
4           5  Female   31                  17                      40


## Menampilkan data dalam range tertentu

In [17]:
# Slice rows by position: start at index 5 and stop before index 10 (end is exclusive).
print("Menampilkan data ke 5 sampai dari kurang 10")
rows_5_to_9 = csv_data.iloc[5:10]
print(rows_5_to_9)

Menampilkan data ke 5 sampai dari kurang 10
   CustomerID   Genre  Age  Annual Income (k$)  Spending Score (1-100)
5           6  Female   22                  17                      76
6           7  Female   35                  18                       6
7           8  Female   23                  18                      94
8           9    Male   64                  19                       3
9          10  Female   30                  19                      72


## Menampilkan informasi statistik dengan Pandas

In [22]:
# exclude=['O'] tells describe() to skip object-dtype (non-numerical) columns,
# so only the numeric columns are summarised.
stats_summary = csv_data.describe(exclude=['O'])
print(stats_summary)

       CustomerID         Age  Annual Income (k$)  Spending Score (1-100)
count  200.000000  200.000000          200.000000              200.000000
mean   100.500000   38.850000           60.560000               50.200000
std     57.879185   13.969007           26.264721               25.823522
min      1.000000   18.000000           15.000000                1.000000
25%     50.750000   28.750000           41.500000               34.750000
50%    100.500000   36.000000           61.500000               50.000000
75%    150.250000   49.000000           78.000000               73.000000
max    200.000000   70.000000          137.000000               99.000000


## Melakukan pengecekan untuk nilai NULL yang ada

In [27]:
# True if at least one cell in the DataFrame is missing.
# Prints False here because this dataset contains no NULL values.
has_missing = csv_data.isnull().values.any()
print(has_missing)

False


## Mengisi dengan mean (data missing)

In [14]:
import pandas as pd
# Load the variant of the shopping dataset that contains missing values.
csv_data = pd.read_csv("https://dqlab-dataset.s3-ap-southeast-1.amazonaws.com/shopping_data_missingvalue.csv")

# Per-column mean (NaN entries are skipped by default).
# numeric_only=True leaves out the text column 'Genre'; without it pandas >= 2.0
# raises TypeError because it tries to average strings.
print(csv_data.mean(numeric_only=True))

CustomerID                100.500000
Age                        38.939698
Annual Income (k$)         61.005051
Spending Score (1-100)     50.489899
dtype: float64


In [15]:
# Show the first ten rows so the NaN entries are visible before imputation.
print("Dataset yang masih terdapat nilai kosong!")
first_ten_rows = csv_data.head(10)
print(first_ten_rows)

Dataset yang masih terdapat nilai kosong!
   CustomerID   Genre   Age  Annual Income (k$)  Spending Score (1-100)
0           1    Male  19.0                15.0                    39.0
1           2    Male   NaN                15.0                    81.0
2           3  Female  20.0                 NaN                     6.0
3           4  Female  23.0                16.0                    77.0
4           5  Female  31.0                17.0                     NaN
5           6  Female  22.0                 NaN                    76.0
6           7  Female  35.0                18.0                     6.0
7           8  Female  23.0                18.0                    94.0
8           9    Male  64.0                19.0                     NaN
9          10  Female  30.0                19.0                    72.0


In [13]:
# Replace every NaN with the mean of its own column.
# numeric_only=True restricts the means to numeric columns; without it pandas >= 2.0
# raises TypeError on the text column 'Genre'. Columns absent from the resulting
# Series (here 'Genre') are simply left untouched by fillna.
csv_data = csv_data.fillna(csv_data.mean(numeric_only=True))
print("Dataset yang sdh diproses Handling Missing Values dengan Mean")
print(csv_data.head(10))

Dataset yang sdh diproses Handling Missing Values dengan Mean
   CustomerID   Genre        Age  Annual Income (k$)  Spending Score (1-100)
0           1    Male  19.000000           15.000000               39.000000
1           2    Male  38.939698           15.000000               81.000000
2           3  Female  20.000000           61.005051                6.000000
3           4  Female  23.000000           16.000000               77.000000
4           5  Female  31.000000           17.000000               50.489899
5           6  Female  22.000000           61.005051               76.000000
6           7  Female  35.000000           18.000000                6.000000
7           8  Female  23.000000           18.000000               94.000000
8           9    Male  64.000000           19.000000               50.489899
9          10  Female  30.000000           19.000000               72.000000


## Mengisi dengan median (data missing)

In [17]:
# Reload the raw dataset: csv_data was overwritten with mean-imputed values earlier
# in the notebook, so on a top-to-bottom run there would be no NaN left for the
# median demonstration below. A fresh read keeps this section reproducible.
csv_data = pd.read_csv("https://dqlab-dataset.s3-ap-southeast-1.amazonaws.com/shopping_data_missingvalue.csv")

# Per-column median (NaN entries are skipped by default).
# numeric_only=True leaves out the text column 'Genre'; without it pandas >= 2.0
# raises TypeError.
print(csv_data.median(numeric_only=True))

CustomerID                100.5
Age                        36.0
Annual Income (k$)         62.0
Spending Score (1-100)     50.0
dtype: float64


In [19]:
# Show the first ten rows before the median imputation is applied.
print("dataset yang masih terdapat nilai kosong")
first_ten_rows = csv_data.head(10)
print(first_ten_rows)

dataset yang masih terdapat nilai kosong
   CustomerID   Genre   Age  Annual Income (k$)  Spending Score (1-100)
0           1    Male  19.0                15.0                    39.0
1           2    Male   NaN                15.0                    81.0
2           3  Female  20.0                 NaN                     6.0
3           4  Female  23.0                16.0                    77.0
4           5  Female  31.0                17.0                     NaN
5           6  Female  22.0                 NaN                    76.0
6           7  Female  35.0                18.0                     6.0
7           8  Female  23.0                18.0                    94.0
8           9    Male  64.0                19.0                     NaN
9          10  Female  30.0                19.0                    72.0


In [20]:
# Replace every NaN with the median of its own column.
# numeric_only=True restricts the medians to numeric columns; without it pandas >= 2.0
# raises TypeError on the text column 'Genre'. Columns absent from the resulting
# Series (here 'Genre') are left untouched by fillna.
csv_data = csv_data.fillna(csv_data.median(numeric_only=True))
print("dataset yang sudah di Handling Missing Values dengan Median")
print(csv_data.head(10))

dataset yang sudah di Handling Missing Values dengan Median
   CustomerID   Genre   Age  Annual Income (k$)  Spending Score (1-100)
0           1    Male  19.0                15.0                    39.0
1           2    Male  36.0                15.0                    81.0
2           3  Female  20.0                62.0                     6.0
3           4  Female  23.0                16.0                    77.0
4           5  Female  31.0                17.0                    50.0
5           6  Female  22.0                62.0                    76.0
6           7  Female  35.0                18.0                     6.0
7           8  Female  23.0                18.0                    94.0
8           9    Male  64.0                19.0                    50.0
9          10  Female  30.0                19.0                    72.0


## Normalisasi Data