# Import Package

In [2]:
import os
import random
import re
from pathlib import Path

import numpy as np
import pandas as pd
from faker import Faker
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

fake = Faker()

# Baca Data 

In [65]:
# Load the student records; the path is relative to the working directory.
csv_path = 'data.csv'
data = pd.read_csv(csv_path)

In [66]:
# Preview the freshly loaded frame.
data

Unnamed: 0,no,nim,nama,alamat,nilai,tanggal_lahir
0,1,2020TI1,Lisa Watson,Kombeli,60.0,10/22/2002
1,2,2020TI2,Amy Weaver,Dongkala,70.0,12/12/1999
2,3,2020TI3,Julie Murray,Kombeli,59.0,4/7/2003
3,4,2020TI4,RUDI,Takimpo,95.0,2/16/2001
4,5,2020TI5,Kevin Hampton,Dongkala,87.0,3/24/2002
...,...,...,...,...,...,...
105,106,2020TI106,,,68.0,9/25/2001
106,107,2020TI107,Jenna Delacruz,Kombeli,77.0,8/30/1999
107,108,2020TI108,,Dongkala,87.0,9/8/2000
108,109,2020TI109,,Takimpo,60.0,3/24/1999


# Latihan

#### 1. Menghapus Data Null (Kolom Nama, Nilai)

In [67]:
# Drop every row that lacks a student name or a score. The frame is modified
# in place, so surviving rows keep their original index labels.
required_columns = ['nama', 'nilai']
data.dropna(subset=required_columns, inplace=True)

In [68]:
# Re-display the frame after rows with a missing 'nama' or 'nilai' were dropped.
data

Unnamed: 0,no,nim,nama,alamat,nilai,tanggal_lahir
0,1,2020TI1,Lisa Watson,Kombeli,60.0,10/22/2002
1,2,2020TI2,Amy Weaver,Dongkala,70.0,12/12/1999
2,3,2020TI3,Julie Murray,Kombeli,59.0,4/7/2003
3,4,2020TI4,RUDI,Takimpo,95.0,2/16/2001
4,5,2020TI5,Kevin Hampton,Dongkala,87.0,3/24/2002
...,...,...,...,...,...,...
99,100,2020TI100,Shannon Guzman,,70.0,5/28/2004
100,101,2020TI101,Robert Rojas,Takimpo,98.0,12/28/2003
101,102,2020TI102,Andrew Anderson,Takimpo,73.0,9/1/2001
103,104,2020TI104,Daniel Smith,Takimpo,81.0,8/20/2003


In [69]:
# Inspect the dtype of 'nilai' before converting it.
data['nilai'].dtype

dtype('float64')

#### 2. Mengubah Tipe Data

In [70]:
# Cast the scores from float to int. Rows with a missing 'nilai' were removed
# by the earlier dropna, so no NaN reaches the cast (NaN cannot be converted
# to an integer and would raise).
# NOTE(review): astype(int) truncates any fractional part rather than rounding
# — confirm the scores are always whole numbers.
data['nilai'] = data['nilai'].astype(int)

In [71]:
# Confirm the dtype of 'nilai' after the conversion.
data['nilai'].dtype

dtype('int64')

In [72]:
# Display the frame with 'nilai' now stored as integers.
data

Unnamed: 0,no,nim,nama,alamat,nilai,tanggal_lahir
0,1,2020TI1,Lisa Watson,Kombeli,60,10/22/2002
1,2,2020TI2,Amy Weaver,Dongkala,70,12/12/1999
2,3,2020TI3,Julie Murray,Kombeli,59,4/7/2003
3,4,2020TI4,RUDI,Takimpo,95,2/16/2001
4,5,2020TI5,Kevin Hampton,Dongkala,87,3/24/2002
...,...,...,...,...,...,...
99,100,2020TI100,Shannon Guzman,,70,5/28/2004
100,101,2020TI101,Robert Rojas,Takimpo,98,12/28/2003
101,102,2020TI102,Andrew Anderson,Takimpo,73,9/1/2001
103,104,2020TI104,Daniel Smith,Takimpo,81,8/20/2003


#### 3. Mengganti Data Null (Kolom Alamat)

In [73]:
# Replace every missing address with a fixed placeholder value.
default_address = 'Pasarwajo'
data['alamat'] = data['alamat'].fillna(default_address)

In [74]:
# Display the frame after the missing 'alamat' values were filled.
data

Unnamed: 0,no,nim,nama,alamat,nilai,tanggal_lahir
0,1,2020TI1,Lisa Watson,Kombeli,60,10/22/2002
1,2,2020TI2,Amy Weaver,Dongkala,70,12/12/1999
2,3,2020TI3,Julie Murray,Kombeli,59,4/7/2003
3,4,2020TI4,RUDI,Takimpo,95,2/16/2001
4,5,2020TI5,Kevin Hampton,Dongkala,87,3/24/2002
...,...,...,...,...,...,...
99,100,2020TI100,Shannon Guzman,Pasarwajo,70,5/28/2004
100,101,2020TI101,Robert Rojas,Takimpo,98,12/28/2003
101,102,2020TI102,Andrew Anderson,Takimpo,73,9/1/2001
103,104,2020TI104,Daniel Smith,Takimpo,81,8/20/2003


# Tugas

#### 1. Data Encoding

In [77]:
# Label-encode 'alamat': each distinct address is mapped to an integer code.
# LabelEncoder numbers the classes in sorted order, so the codes carry no
# meaning beyond identifying the address. (LabelEncoder is imported in the
# notebook's top import cell.)
# NOTE(review): scikit-learn documents LabelEncoder as a target encoder; for
# input features OrdinalEncoder is the documented choice — confirm intent.
encoder = LabelEncoder()
data['alamat_encoded'] = encoder.fit_transform(data['alamat'])

# Show the original address next to its integer code.
data[['alamat', 'alamat_encoded']].head()

Unnamed: 0,alamat,alamat_encoded
0,Kombeli,1
1,Dongkala,0
2,Kombeli,1
3,Takimpo,3
4,Dongkala,0


In [79]:
# One-hot encode 'alamat' into one indicator column per distinct address.
# Every other column of `data` is carried over unchanged, including the
# label-encoded 'alamat_encoded' column when it is present.
data_encoded = pd.get_dummies(
    data,
    prefix='alamat',
    columns=['alamat'],
)

# Preview the first rows of the encoded frame.
data_encoded.head()

Unnamed: 0,no,nim,nama,nilai,tanggal_lahir,alamat_encoded,alamat_Dongkala,alamat_Kombeli,alamat_Pasarwajo,alamat_Takimpo
0,1,2020TI1,Lisa Watson,60,10/22/2002,1,False,True,False,False
1,2,2020TI2,Amy Weaver,70,12/12/1999,0,True,False,False,False
2,3,2020TI3,Julie Murray,59,4/7/2003,1,False,True,False,False
3,4,2020TI4,RUDI,95,2/16/2001,3,False,False,False,True
4,5,2020TI5,Kevin Hampton,87,3/24/2002,0,True,False,False,False


In [81]:
# Target (mean) encoding: give each row the mean 'nilai' of its 'alamat' group.
# NOTE(review): the means are computed over the whole frame, before the
# train/test split further down; if 'alamat_target' is ever used as a model
# feature, test-set scores leak into training — confirm the intended use.
score_by_address = data.groupby('alamat')['nilai']
data['alamat_target'] = score_by_address.transform('mean')

# Preview each address next to its encoded value.
data[['alamat', 'alamat_target']].head()


Unnamed: 0,alamat,alamat_target
0,Kombeli,74.4
1,Dongkala,75.285714
2,Kombeli,74.4
3,Takimpo,73.645161
4,Dongkala,75.285714


#### 2. Splitting Data (Pembagian Data)

In [82]:
# Hold out 20% of the rows as a test set (an 80/20 split). The fixed
# random_state makes the shuffle reproducible across runs.
# (train_test_split is imported in the notebook's top import cell.)
train, test = train_test_split(data, test_size=0.2, random_state=42)

# Every row must land in exactly one of the two partitions.
assert len(train) + len(test) == len(data)

print("Train set:")
print(train.head())

print("\nTest set:")
print(test.head())

Train set:
    no       nim              nama    alamat  nilai tanggal_lahir  \
92  93  2020TI93    Jeffrey Santos   Kombeli     56     5/14/2003   
27  28  2020TI28       Shannon Orr  Dongkala     71     8/14/2000   
44  45  2020TI45      Sandra Craig   Takimpo     81     6/13/2003   
73  74  2020TI74       Katie Allen   Kombeli     89      1/1/2004   
15  16  2020TI16  Pamela Mccormick  Dongkala     77    12/11/1999   

    alamat_encoded  alamat_target  
92               1      74.400000  
27               0      75.285714  
44               3      73.645161  
73               1      74.400000  
15               0      75.285714  

Test set:
    no       nim               nama   alamat  nilai tanggal_lahir  \
87  88  2020TI88     Cheryl Jackson  Takimpo     70     2/21/2000   
58  59  2020TI59     Caroline Davis  Kombeli     87     5/20/1999   
69  70  2020TI70         Luke Cline  Kombeli     61    12/31/1999   
70  71  2020TI71  Christopher Payne  Kombeli     83     9/25/2001   
47

In [83]:
# Minimum score that counts as a pass. Defined once so that both filters
# below always use the same cut-off.
PASSING_SCORE = 60

# Rows with a score at or above the cut-off (passed).
data_lulus = data[data['nilai'] >= PASSING_SCORE]

# Rows with a score below the cut-off (did not pass).
data_tidak_lulus = data[data['nilai'] < PASSING_SCORE]

print("Data Lulus:")
print(data_lulus)

print("\nData Tidak Lulus:")
print(data_tidak_lulus)


Data Lulus:
      no        nim             nama     alamat  nilai tanggal_lahir  \
0      1    2020TI1      Lisa Watson    Kombeli     60    10/22/2002   
1      2    2020TI2       Amy Weaver   Dongkala     70    12/12/1999   
3      4    2020TI4             RUDI    Takimpo     95     2/16/2001   
4      5    2020TI5    Kevin Hampton   Dongkala     87     3/24/2002   
7      8    2020TI8             RUDI    Takimpo     75     5/18/2004   
..   ...        ...              ...        ...    ...           ...   
99   100  2020TI100   Shannon Guzman  Pasarwajo     70     5/28/2004   
100  101  2020TI101     Robert Rojas    Takimpo     98    12/28/2003   
101  102  2020TI102  Andrew Anderson    Takimpo     73      9/1/2001   
103  104  2020TI104     Daniel Smith    Takimpo     81     8/20/2003   
106  107  2020TI107   Jenna Delacruz    Kombeli     77     8/30/1999   

     alamat_encoded  alamat_target  
0                 1      74.400000  
1                 0      75.285714  
3           

In [84]:
# Positional 50/50 split: the first half of the rows goes to part 1 and the
# remainder (one extra row when the row count is odd) goes to part 2.
mid_index = len(data) // 2
data_part1, data_part2 = data.iloc[:mid_index], data.iloc[mid_index:]

# Each label is printed on its own line, followed by the frame.
print("Data Part 1:", data_part1, sep="\n")
print("\nData Part 2:", data_part2, sep="\n")


Data Part 1:
    no       nim                nama    alamat  nilai tanggal_lahir  \
0    1   2020TI1         Lisa Watson   Kombeli     60    10/22/2002   
1    2   2020TI2          Amy Weaver  Dongkala     70    12/12/1999   
2    3   2020TI3        Julie Murray   Kombeli     59      4/7/2003   
3    4   2020TI4                RUDI   Takimpo     95     2/16/2001   
4    5   2020TI5       Kevin Hampton  Dongkala     87     3/24/2002   
5    6   2020TI6      Michael Kramer   Kombeli     54    10/29/1999   
6    7   2020TI7      Rebecca Porter  Dongkala     59      7/5/2000   
7    8   2020TI8                RUDI   Takimpo     75     5/18/2004   
8    9   2020TI9       Rachel Bailey   Kombeli     91     5/20/2001   
9   10  2020TI10         Rodney Yang   Takimpo     56     10/6/2000   
10  11  2020TI11       Nathan Phelps   Kombeli     66    10/27/2003   
11  12  2020TI12       Jeremy Gordon  Dongkala     55     2/11/2000   
12  13  2020TI13           John Pham  Dongkala     50     11/6/1