# LIBRARY

In [64]:
import pandas as pd
import numpy as np
from sklearn import ensemble
from sklearn import metrics
from sklearn.model_selection import train_test_split
import sklearn.preprocessing as prep

# FUNCTION

In [37]:
def encoder(df, columns):
    for x in columns :
        labelEncoder = prep.LabelEncoder()
        labelEncoder.fit(df[x])
        df[(x + "_code")] = labelEncoder.transform(df[x])
        df = df.drop([x],axis='columns')
    return df

# DATA

In [3]:
df1 = pd.read_csv('american_income_1994.csv')
df1.shape

(48842, 15)

In [4]:
df1.describe()

Unnamed: 0,age,fnlwft,education_num,capital_gain,capital_loss,hours-per-week
count,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0
mean,38.643585,189664.1,10.078089,1079.067626,87.502314,40.422382
std,13.71051,105604.0,2.570973,7452.019058,403.004552,12.391444
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117550.5,9.0,0.0,0.0,40.0
50%,37.0,178144.5,10.0,0.0,0.0,40.0
75%,48.0,237642.0,12.0,0.0,0.0,45.0
max,90.0,1490400.0,16.0,99999.0,4356.0,99.0


In [5]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             48842 non-null  int64 
 1   workclass       48842 non-null  object
 2   fnlwft          48842 non-null  int64 
 3   education       48842 non-null  object
 4   education_num   48842 non-null  int64 
 5   marital_status  48842 non-null  object
 6   occupation      48842 non-null  object
 7   relationship    48842 non-null  object
 8   race            48842 non-null  object
 9   sex             48842 non-null  object
 10  capital_gain    48842 non-null  int64 
 11  capital_loss    48842 non-null  int64 
 12  hours-per-week  48842 non-null  int64 
 13  native_country  48842 non-null  object
 14  income          48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


# PRE-PROCESSING DATA


## DATA CLEANING

### Duplicated Data Handling

In [6]:
df1.duplicated().sum()

29

In [8]:
df2 = df1.drop_duplicates(keep='first')

### Missing Value Handling


In [10]:
df2.isnull().sum()

age               0
workclass         0
fnlwft            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours-per-week    0
native_country    0
income            0
dtype: int64

### Encoding

In [28]:
df3 = df2.copy()

In [12]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
Index: 48813 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             48813 non-null  int64 
 1   workclass       48813 non-null  object
 2   fnlwft          48813 non-null  int64 
 3   education       48813 non-null  object
 4   education_num   48813 non-null  int64 
 5   marital_status  48813 non-null  object
 6   occupation      48813 non-null  object
 7   relationship    48813 non-null  object
 8   race            48813 non-null  object
 9   sex             48813 non-null  object
 10  capital_gain    48813 non-null  int64 
 11  capital_loss    48813 non-null  int64 
 12  hours-per-week  48813 non-null  int64 
 13  native_country  48813 non-null  object
 14  income          48813 non-null  object
dtypes: int64(6), object(9)
memory usage: 6.0+ MB


Dapat dilihat terdapat beberapa kolom yang nilainya bukan numerik.

1. Workclass
2. education
3. martial_status
4. occupation
5. relationship
6. race
7. sex
8. native_country
9. income

Semuanya akan diubah menjadi numerik dengan LabelEncoder,
ini tidak masalah karena masalah klasifikasi sehingga nilai numeriknya tidak memiliki arti 

In [51]:
df4 = encoder(df3,['workclass','education','marital_status','occupation','relationship','race','sex','native_country'])
df4

Unnamed: 0,age,fnlwft,education_num,capital_gain,capital_loss,hours-per-week,income,workclass_code,education_code,marital_status_code,occupation_code,relationship_code,race_code,sex_code,native_country_code,income_code
0,39,77516,13,2174,0,40,<=50K,7,9,4,1,1,4,1,39,0
1,50,83311,13,0,0,13,<=50K,6,9,2,4,0,4,1,39,0
2,38,215646,9,0,0,40,<=50K,4,11,0,6,1,4,1,39,0
3,53,234721,7,0,0,40,<=50K,4,1,2,6,0,2,1,39,0
4,28,338409,13,0,0,40,<=50K,4,9,2,10,5,2,0,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,39,215419,13,0,0,36,<=50K.,4,9,0,10,1,4,0,39,1
48838,64,321403,9,0,0,40,<=50K.,0,11,6,0,2,2,1,39,1
48839,38,374983,13,0,0,50,<=50K.,4,9,2,10,0,4,1,39,1
48840,44,83891,13,5455,0,40,<=50K.,4,9,0,1,3,1,1,39,1


In [52]:
df = df4.copy()

# EDA 

Kita Lewati dulu karena fokusnya adalah belajar algoritma

## MODELING

In [54]:
X = df.drop(['income', 'fnlwft'], axis='columns')
y = df['income']

In [69]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=90)

In [70]:
model = ensemble.RandomForestClassifier(n_estimators=100)
model.fit(X_train, y_train)

In [75]:
y_pred = model.predict(X_test)
y_pred

array([' <=50K', ' <=50K.', ' <=50K', ..., ' >50K.', ' <=50K', ' <=50K.'],
      dtype=object)

In [72]:
accuracy = metrics.accuracy_score(y_test, y_pred)
report = metrics.classification_report(y_test, y_pred)

print(accuracy)
print(report)
print(model.feature_importances_)

1.0
              precision    recall  f1-score   support

       <=50K       1.00      1.00      1.00      4878
      <=50K.       1.00      1.00      1.00      2546
        >50K       1.00      1.00      1.00      1580
       >50K.       1.00      1.00      1.00       759

    accuracy                           1.00      9763
   macro avg       1.00      1.00      1.00      9763
weighted avg       1.00      1.00      1.00      9763

[0.01877513 0.02076699 0.02216593 0.0061225  0.01300594 0.00476648
 0.00646219 0.02133275 0.00837458 0.03203586 0.00195654 0.00241841
 0.0026838  0.83913291]
