# Jurnal Kelompok 2 PBF RC

## Import library

In [1]:
import pandas as pd
import numpy as np

## Import dataset

In [2]:
# Location of the raw income CSV. This is a machine-specific absolute path;
# adjust it when running the notebook on another computer.
csv_path = "C:/Users/rafif/Downloads/income.csv"
data = pd.read_csv(csv_path)
data

Unnamed: 0,no,age,income,gender
0,1,29.0,12247035.0,m
1,2,35.0,11561761.0,m
2,3,26.0,5009745.0,f
3,4,25.0,10168417.0,m
4,5,31.0,5315979.0,m
...,...,...,...,...
101,102,32.0,2904282.0,f
102,103,21.0,,m
103,104,22.0,2609364.0,m
104,105,22.0,16221406.0,f


## Cek tipe data

In [3]:
# Print each column's dtype and non-null count to spot missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106 entries, 0 to 105
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   no       106 non-null    int64  
 1   age      103 non-null    float64
 2   income   101 non-null    float64
 3   gender   106 non-null    object 
dtypes: float64(2), int64(1), object(1)
memory usage: 3.4+ KB


## Menghitung jumlah NaN dalam dataset
Setelah dicek, ternyata masih ada data yang kosong (NaN): 3 pada kolom age dan 5 pada kolom income.

In [4]:
# Count the missing (NaN) entries in every column.
data.isna().sum()

no         0
age        3
income     5
gender     0
dtype: int64

## Data Preprocessing
Melakukan data cleaning: menghapus missing values, membuang data duplikat, mengonversi tipe data, serta membersihkan nilai yang tidak valid dan tidak konsisten.

In [5]:
# Clean data
def clean_data(data):
    """Return a cleaned copy of the income DataFrame.

    The steps are applied in this order:

    1. Drop every row that contains a missing value.
    2. Normalise ``gender``: lower-case it and expand the codes ``'m'`` and
       ``'f'`` to ``'male'`` and ``'female'``.
    3. Drop fully duplicated rows (the first occurrence is kept).
    4. Cast the ``'age '`` and ``'income '`` columns to ``int``.
    5. Keep only rows whose age and income are both strictly positive.

    The column labels ``'age '`` and ``'income '`` include a trailing space;
    they are used here exactly as the rest of the notebook uses them.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with at least the columns ``'age '``, ``'income '`` and
        ``'gender'``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``data`` itself is not modified. Surviving rows keep
        their original index labels.
    """
    gender_labels = {'m': 'male', 'f': 'female'}

    # Normalise gender *before* de-duplicating so that rows differing only in
    # letter case (e.g. 'F' vs 'f') are recognised as duplicates. Building the
    # result with a method chain also avoids assigning into filtered frames.
    cleaned = (
        data.dropna()
        .assign(gender=lambda frame: frame['gender'].str.lower().replace(gender_labels))
        .drop_duplicates()
        .astype({'age ': int, 'income ': int})
    )

    # Discard invalid (non-positive) ages and incomes.
    is_valid = (cleaned['age '] > 0) & (cleaned['income '] > 0)
    return cleaned.loc[is_valid].copy()

In [6]:
# Clean the dataset
cleaned_data = clean_data(data)
cleaned_data

Unnamed: 0,no,age,income,gender
0,1,29,12247035,male
1,2,35,11561761,male
2,3,26,5009745,female
3,4,25,10168417,male
4,5,31,5315979,male
...,...,...,...,...
100,101,20,8413164,male
101,102,32,2904282,female
103,104,22,2609364,male
104,105,22,16221406,female


## Cek kembali data NaN setelah dilakukan data preprocessing 
Setelah dilakukan data preprocessing, terlihat bahwa tidak ada lagi data kosong (NaN).

In [7]:
# Count the missing (NaN) entries per column in the cleaned frame.
cleaned_data.isna().sum()

no         0
age        0
income     0
gender     0
dtype: int64

## Menggunakan fungsi reduce untuk menghitung jumlah pendapatan
Menghitung total keseluruhan nilai pada kolom income; didapatkan hasil Rp. 883.912.167.

In [8]:
from functools import reduce

# Fold the income column into one running total. The explicit initial value 0
# keeps the reduction well-defined (total 0) when the column is empty, instead
# of raising TypeError.
total_income = reduce(
    lambda running_total, amount: running_total + amount,
    cleaned_data['income '],
    0,
)
total_income

883912167

## Menggunakan fungsi filter untuk memfilter data yang sesuai dengan kriteria tertentu
### Memfilter income > 15 juta
Ternyata, dalam rentang umur 17-40 tahun pada dataset ini, yang memiliki income di atas 15 juta sebanyak 13 orang.

In [9]:
# Keep only the incomes strictly above 15 million.
income_floor = 15000000
high_income_data = list(
    filter(lambda amount: amount > income_floor, cleaned_data['income '])
)
high_income_data

[15568213,
 18509612,
 17916690,
 15195135,
 15283401,
 15523178,
 19702728,
 19233076,
 17374661,
 15692471,
 16501100,
 16221406,
 16066811]

## Ubah Data ke list
Mengubah data ke list untuk proses selanjutnya, yakni melakukan perhitungan.

In [10]:
# Plain Python list of the cleaned incomes, used by the calculations below.
income_column = cleaned_data['income ']
income = income_column.to_numpy().tolist()
print(income)

[12247035, 11561761, 5009745, 10168417, 5315979, 13215435, 8545991, 10601138, 7487848, 8278305, 7238529, 3861080, 6175578, 12457163, 3128005, 10775909, 7867990, 9223945, 12522466, 1746155, 6295588, 4300675, 2010293, 12943481, 12384326, 13402913, 8341348, 7303021, 5546868, 4849015, 1202219, 9493833, 4586386, 10684558, 2619574, 5491073, 3590726, 12635308, 14098844, 10183878, 12058362, 3157030, 7990682, 2060174, 5272290, 7729875, 7635726, 12171291, 11960693, 15568213, 2200623, 14204711, 4221789, 12466875, 2283945, 14744519, 18509612, 13692887, 17916690, 14307083, 15195135, 4288340, 4461255, 2046312, 2953968, 6770471, 7138789, 6409342, 10579788, 5759617, 6957119, 8516020, 7394177, 4280829, 15283401, 15523178, 5979150, 12868350, 19702728, 7658262, 19233076, 17374661, 15692471, 14071992, 11606073, 13609001, 4990346, 14110117, 7440779, 8801208, 2422014, 2532600, 16501100, 8413164, 2904282, 2609364, 16221406, 16066811]


In [11]:
# Plain Python list of the cleaned ages, used by the calculations below.
age_column = cleaned_data['age ']
age = age_column.to_numpy().tolist()
print(age)

[29, 35, 26, 25, 31, 21, 21, 22, 38, 31, 34, 33, 40, 35, 36, 28, 21, 28, 36, 40, 36, 23, 38, 24, 34, 40, 33, 32, 36, 21, 22, 31, 26, 33, 23, 28, 40, 21, 30, 34, 29, 34, 20, 38, 20, 21, 30, 18, 19, 24, 23, 22, 40, 37, 37, 36, 37, 36, 29, 23, 23, 30, 20, 17, 23, 26, 19, 32, 29, 24, 36, 31, 39, 37, 28, 31, 21, 31, 35, 35, 24, 21, 21, 31, 23, 39, 37, 39, 39, 27, 22, 37, 22, 20, 32, 22, 22, 33]


## Menghitung rata-rata menggunakan map() dan fungsi lambda
Didapatkan rata-rata income pada dataset ini sebesar Rp. 9.019.511,91 dengan rata-rata umur 29,2 tahun.

In [12]:
# Mean income: convert each entry to float with map(), sum, divide by the count.
income_as_float = map(lambda amount: float(amount), income)
avarage_inc = sum(income_as_float) / len(income)
print(avarage_inc)

9019511.908163264


In [13]:
# Mean age: convert each entry to float with map(), sum, divide by the count.
age_as_float = map(lambda years: float(years), age)
avarage_age = sum(age_as_float) / len(age)
print(avarage_age)

29.244897959183675


## Menghitung Median dengan User Defined Function

In [14]:
def median(values):
    """Return the median of ``values``.

    The values are sorted first. With an odd number of elements the middle
    element is returned unchanged; with an even number the result is the sum
    of the two middle elements divided by 2 (true division).
    """
    ordered = sorted(values)
    count = len(ordered)
    mid = count // 2

    # Odd count: a single middle element exists.
    if count % 2:
        return ordered[mid]

    # Even count: average the two elements around the midpoint.
    return (ordered[mid] + ordered[mid - 1]) / 2

# Median of each numeric column, computed with the user-defined median().
median_age, median_income = (
    median(cleaned_data[column]) for column in ('age ', 'income ')
)

print("Median Age : ", median_age)
print("Median Income : ", median_income)

Median Age :  30.0
Median Income :  8309826.5


## Built-in Function

Income terbesar pada dataset ini ialah Rp. 19.702.728 <br>
Income terkecil pada dataset ini ialah Rp. 1.202.219 <br>
Rentang umur yang ada pada dataset ini ialah 17 - 40 tahun

In [15]:
# Largest income in the cleaned data.
max(income)

19702728

In [16]:
# Smallest income in the cleaned data.
min(income)

1202219

In [17]:
# Oldest age in the cleaned data.
max(age)

40

In [18]:
# Youngest age in the cleaned data.
min(age)

17