목표 설정
- 어떤 사람의 연간 개인 소득이 $50K를 초과하는지 예측해 보자.

In [449]:
import numpy as np
import pandas as pd
import matplotlib as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [450]:
# Load the training data from the working directory.
train = pd.read_csv('./train_null.csv')
# Bare expression in the middle of the cell: only the printed shape appears in the cell output.
train

# Report the (rows, columns) shape of the training frame.
print(train.shape)

(29305, 16)


In [451]:
# Load the test data from the working directory.
test = pd.read_csv('./test_null.csv')
# Bare expression in the middle of the cell: only the printed shape appears in the cell output.
test
# Report the (rows, columns) shape of the test frame.
print(test.shape)

(19537, 15)


데이터 탐색

In [452]:
# 'age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation' ,'relationship', 'race' ,'sex' ,'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income'
# (the loaded files additionally carry a leading 'no' column)

# age : age
# workclass : type of employment
# fnlwgt : weight indicating how many people the row represents (short for "final weight")
# education : education level
# education-num : education level as a number
# marital-status : marital status
# occupation : occupation
# relationship : family relationship
# race : race
# sex : sex
# capital-gain : capital gains
# capital-loss : capital losses
# hours-per-week : hours worked per week
# native-country : country of origin
# income : income (the value to predict)


In [453]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29305 entries, 0 to 29304
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   no              29305 non-null  int64 
 1   age             29305 non-null  int64 
 2   workclass       27642 non-null  object
 3   fnlwgt          29305 non-null  int64 
 4   education       29305 non-null  object
 5   education-num   29305 non-null  int64 
 6   marital-status  29305 non-null  object
 7   occupation      27637 non-null  object
 8   relationship    29305 non-null  object
 9   race            29305 non-null  object
 10  sex             29305 non-null  object
 11  capital-gain    29305 non-null  int64 
 12  capital-loss    29305 non-null  int64 
 13  hours-per-week  29305 non-null  int64 
 14  native-country  28810 non-null  object
 15  income          29305 non-null  int64 
dtypes: int64(8), object(8)
memory usage: 3.6+ MB


In [454]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19537 entries, 0 to 19536
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   no              19537 non-null  int64 
 1   age             19537 non-null  int64 
 2   workclass       18401 non-null  object
 3   fnlwgt          19537 non-null  int64 
 4   education       19537 non-null  object
 5   education-num   19537 non-null  int64 
 6   marital-status  19537 non-null  object
 7   occupation      18396 non-null  object
 8   relationship    19537 non-null  object
 9   race            19537 non-null  object
 10  sex             19537 non-null  object
 11  capital-gain    19537 non-null  int64 
 12  capital-loss    19537 non-null  int64 
 13  hours-per-week  19537 non-null  int64 
 14  native-country  19175 non-null  object
dtypes: int64(7), object(8)
memory usage: 2.2+ MB


In [455]:
# Count the missing values in each column of the test set.
# NOTE(review): the earlier remark here said the missing values were stored as strings
# and could not be found; the counts in this cell's output are non-zero for workclass,
# occupation and native-country, so isnull() does detect them in this file.
test.isnull().sum()


no                   0
age                  0
workclass         1136
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        1141
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     362
dtype: int64

In [456]:
train.isnull().sum()

no                   0
age                  0
workclass         1663
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        1668
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     495
income               0
dtype: int64

결측치 채워주기

In [457]:
# Training-set columns that contain at least one missing value.
null_col = train.isna().any(axis=0)
# Show every column name first, then only the columns flagged above.
print(train.columns, train.columns[null_col], sep='\n')

Index(['no', 'age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income'],
      dtype='object')
Index(['workclass', 'occupation', 'native-country'], dtype='object')


In [458]:
# Test-set columns that contain at least one missing value.
null_col = test.isna().any(axis=0)
# Show every column name first, then only the columns flagged above.
print(test.columns, test.columns[null_col], sep='\n')

Index(['no', 'age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country'],
      dtype='object')
Index(['workclass', 'occupation', 'native-country'], dtype='object')


In [459]:
# 결측값을 가진 열 
# 'workclass', 'occupation', 'native-country'

workclass 수정해보기

In [460]:
# Rows whose workclass is missing (not displayed: only the last expression of the cell is shown).
train[train['workclass'].isnull()]
# Frequency of each workclass category, used to choose the fill value.
train['workclass'].value_counts()

Private             20410
Self-emp-not-inc     2305
Local-gov            1868
State-gov            1201
Self-emp-inc          987
Federal-gov           854
Without-pay            12
Never-worked            5
Name: workclass, dtype: int64

workclass ==> 결측치가 존재하는 모든 행을 최빈값인 'Private'으로 채우기
- 가장 많이 등장하는 값이므로

In [461]:
# Impute missing workclass with 'Private', the most frequent category in the training counts.
train['workclass'] = train['workclass'].fillna('Private')

In [462]:
train[train['workclass'].isnull()]

Unnamed: 0,no,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income


In [463]:
# Fill missing workclass in the test set with the fill value chosen from the training counts.
test['workclass'] = test['workclass'].fillna('Private')

occupation 결측값 채우기

In [464]:
train['occupation'].value_counts()

Prof-specialty       3724
Craft-repair         3632
Exec-managerial      3609
Adm-clerical         3440
Sales                3292
Other-service        2975
Machine-op-inspct    1811
Transport-moving     1393
Handlers-cleaners    1249
Farming-fishing       888
Tech-support          865
Protective-serv       603
Priv-house-serv       146
Armed-Forces           10
Name: occupation, dtype: int64

In [465]:
train[train['occupation'].isnull()]

Unnamed: 0,no,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
19,20,18,Private,220168,Some-college,10,Never-married,,Own-child,White,Male,0,0,16,United-States,0
20,21,23,Private,194096,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States,0
24,25,19,Private,50626,Some-college,10,Never-married,,Own-child,Black,Female,0,0,20,United-States,0
37,38,24,Private,152719,Some-college,10,Never-married,,Own-child,Black,Female,0,0,15,Haiti,0
47,48,23,Private,62507,Some-college,10,Never-married,,Not-in-family,White,Female,0,0,12,United-States,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29244,29245,28,Private,196630,Assoc-voc,11,Separated,,Unmarried,White,Female,0,0,40,Mexico,0
29247,29248,50,Private,23780,Masters,14,Married-spouse-absent,,Other-relative,White,Male,0,0,40,United-States,0
29280,29281,22,Private,110622,Bachelors,13,Never-married,,Own-child,Asian-Pac-Islander,Female,0,0,15,Taiwan,0
29286,29287,21,Private,143995,Some-college,10,Never-married,,Own-child,Black,Male,0,0,20,United-States,0


In [466]:
# Impute missing occupation with 'Prof-specialty', the top category in the training counts.
# NOTE(review): the missing-occupation rows displayed in the preceding cell output all have
# income 0; a dedicated 'Unknown' category may represent them better than the mode — confirm.
train['occupation']= train['occupation'].fillna('Prof-specialty')

In [467]:
train[train['occupation'].isnull()]

Unnamed: 0,no,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income


In [468]:
# Fill missing occupation in the test set with the fill value chosen from the training counts.
test['occupation']= test['occupation'].fillna('Prof-specialty')

In [469]:
test['occupation']

0         Prof-specialty
1          Other-service
2           Tech-support
3        Exec-managerial
4        Exec-managerial
              ...       
19532       Craft-repair
19533    Exec-managerial
19534     Prof-specialty
19535              Sales
19536       Tech-support
Name: occupation, Length: 19537, dtype: object

native-country

In [470]:
train[train['native-country'].isnull()]

Unnamed: 0,no,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
25,26,45,Private,179048,12th,8,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,0,35,,1
296,297,47,Private,174525,HS-grad,9,Married-civ-spouse,Prof-specialty,Husband,White,Male,3942,0,40,,0
305,306,25,Private,124111,Bachelors,13,Never-married,Sales,Own-child,White,Female,0,0,36,,0
421,422,35,Private,103710,Bachelors,13,Divorced,Prof-specialty,Unmarried,White,Female,0,0,16,,0
432,433,29,Private,112403,5th-6th,3,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28885,28886,48,Private,117310,Bachelors,13,Divorced,Adm-clerical,Unmarried,White,Female,0,0,60,,0
28953,28954,33,Self-emp-not-inc,182926,Bachelors,13,Never-married,Transport-moving,Not-in-family,White,Male,0,0,40,,0
29067,29068,45,Private,199058,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,48,,0
29230,29231,42,Private,157367,HS-grad,9,Never-married,Adm-clerical,Unmarried,Black,Female,0,0,35,,0


In [471]:
train['native-country'].value_counts()

United-States                 26301
Mexico                          579
Philippines                     178
Germany                         124
Puerto-Rico                     117
Canada                          111
El-Salvador                      96
Cuba                             94
India                            90
England                          75
South                            74
Jamaica                          68
China                            68
Dominican-Republic               61
Italy                            60
Japan                            54
Columbia                         53
Poland                           53
Guatemala                        51
Haiti                            51
Vietnam                          50
Portugal                         43
Taiwan                           39
Iran                             38
Nicaragua                        28
Peru                             25
Greece                           25
Ireland                     

In [472]:
# Impute missing native-country with 'United-States', by far the most frequent value in the training counts.
train['native-country'] = train['native-country'].fillna('United-States')

In [473]:
train['native-country']

0        United-States
1        United-States
2                Haiti
3        United-States
4        United-States
             ...      
29300      Puerto-Rico
29301    United-States
29302    United-States
29303    United-States
29304    United-States
Name: native-country, Length: 29305, dtype: object

In [486]:
# Fill missing native-country in the test set with the fill value chosen from the training counts.
test['native-country'] = test['native-country'].fillna('United-States')

In [487]:
# Separate predictors and target by column name.
# 'income' is the last of the 16 training columns, so this yields the same frames as a
# positional split at index 15.
# NOTE(review): the 'no' column stays among the predictors; it looks like a row id —
# confirm it is meant to be a feature.
X_train = train.drop(columns=['income'])
# Double brackets keep the target as a one-column DataFrame (shape (n, 1)).
y_train = train[['income']]
# No copy is made: X_test is another name for the `test` frame.
X_test = test

# Shapes of the three frames (labels are Korean for "x train", "y train", "test").
print('x훈련 : ',X_train.shape)
print('y훈련 : ',y_train.shape)
print('테스트 : ',X_test.shape)

x훈련 :  (29305, 15)
y훈련 :  (29305, 1)
테스트 :  (19537, 15)


In [488]:
# This cell performs the same split as the preceding cell; re-running it rebuilds
# X_train / y_train / X_test from `train` and `test`.
# Predictors: the first 15 columns by position.
X_train = train.iloc[  : , 0:15 ]
# Target: everything from position 15 onward, kept as a one-column DataFrame.
y_train = train.iloc[ : , 15 :]
# No copy is made: X_test is another name for the `test` frame.
X_test = test

# Shapes of the three frames (labels are Korean for "x train", "y train", "test").
print('x훈련 : ',X_train.shape)
print('y훈련 : ',y_train.shape)
print('테스트 : ',X_test.shape)

x훈련 :  (29305, 15)
y훈련 :  (29305, 1)
테스트 :  (19537, 15)


In [489]:
# A column counts as categorical when its dtype is neither int64 nor float64.
cat_filter = ~((X_train.dtypes == 'int64') | (X_train.dtypes == 'float64'))
# Names of the columns selected by the mask.
cat_choice = X_train.columns[cat_filter]
cat_choice

In [499]:
# One-hot encode every categorical column of X_train in a single call.
# With columns=..., get_dummies drops each listed column and appends its indicator
# columns (prefixed "<column>_") after the untouched columns, in the order listed.
X_train = pd.get_dummies(X_train, columns=list(cat_choice))

In [501]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29305 entries, 0 to 29304
Columns: 205 entries, no to native-country_Yugoslavia
dtypes: int64(7), uint8(198)
memory usage: 7.1 MB


In [495]:
# X_test 원핫 인코딩
# One-hot encode every categorical column of X_test.
# With columns=..., get_dummies replaces each listed column by its indicator columns
# (prefixed "<column>_"). The earlier loop appended the indicators but never dropped the
# raw string columns, unlike the encoding applied to X_train, so X_test kept
# non-numeric columns alongside the dummies.
X_test = pd.get_dummies(X_test, columns=list(cat_choice))

In [496]:
pd.get_dummies(X_train).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29305 entries, 0 to 29304
Columns: 205 entries, no to native-country_Yugoslavia
dtypes: int64(7), uint8(198)
memory usage: 7.1 MB


In [481]:
# Peek at the native-country column of X_train.
# NOTE(review): this cell's execution counter (In [481]) is lower than the encoding cell's;
# once one-hot encoding has replaced 'native-country' with indicator columns, this lookup
# raises KeyError when the notebook is run top to bottom.
X_train['native-country'] 

0        United-States
1        United-States
2                Haiti
3        United-States
4        United-States
             ...      
29300      Puerto-Rico
29301    United-States
29302    United-States
29303    United-States
29304    United-States
Name: native-country, Length: 29305, dtype: object

In [482]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19537 entries, 0 to 19536
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   no              19537 non-null  int64 
 1   age             19537 non-null  int64 
 2   workclass       19537 non-null  object
 3   fnlwgt          19537 non-null  int64 
 4   education       19537 non-null  object
 5   education-num   19537 non-null  int64 
 6   marital-status  19537 non-null  object
 7   occupation      19537 non-null  object
 8   relationship    19537 non-null  object
 9   race            19537 non-null  object
 10  sex             19537 non-null  object
 11  capital-gain    19537 non-null  int64 
 12  capital-loss    19537 non-null  int64 
 13  hours-per-week  19537 non-null  int64 
 14  native-country  19175 non-null  object
dtypes: int64(7), object(8)
memory usage: 2.2+ MB


In [502]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

In [503]:
# Fix the seed so repeated runs grow the same tree and produce the same submission.
tree_model = DecisionTreeClassifier(random_state=42)


In [513]:
# Fit the decision tree on the training features and the income labels.
# NOTE(review): y_train was built as a one-column DataFrame rather than a 1-D Series —
# confirm the estimator accepts that shape without warnings.
tree_model.fit(X_train,y_train)

In [511]:
# Add an all-zero indicator column for 'Holand-Netherlands' to X_test.
# presumably this category occurs in the training data but not in the test data, so the
# test frame would otherwise lack the column — verify.
X_test['native-country_Holand-Netherlands'] = 0

In [512]:
# Give X_test exactly the training feature columns, in the same order.
# reindex() creates any column that exists only in X_train (a category absent from the
# test data) filled with 0, where plain column selection would raise KeyError, and it
# drops columns that X_train does not have.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
X_test.columns

Index(['no', 'age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',
       'hours-per-week', 'workclass_Federal-gov', 'workclass_Local-gov',
       'workclass_Never-worked',
       ...
       'native-country_Portugal', 'native-country_Puerto-Rico',
       'native-country_Scotland', 'native-country_South',
       'native-country_Taiwan', 'native-country_Thailand',
       'native-country_Trinadad&Tobago', 'native-country_United-States',
       'native-country_Vietnam', 'native-country_Yugoslavia'],
      dtype='object', length=205)

In [514]:
# Predict the income label for every row of X_test with the fitted tree.
pre  = tree_model.predict(X_test)

In [None]:
# Evaluation: the test file has no income column, so the predictions are scored by
# uploading them to Kaggle.
# Start from the sample submission and set its 'income' column to the predictions.
result = pd.read_csv('./sample_submission.csv').assign(income=pre)

# Save as CSV without writing the index.
result.to_csv('test.csv', index=False)