In [1]:
!pip install pycaret

In [1]:
import pandas as pd
import numpy as np

In [1]:
# Read the competition's train and test splits from the Kaggle input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/kakr-4th-competition/train.csv',
        '../input/kakr-4th-competition/test.csv',
    )
)

In [1]:
# Display the raw training frame as loaded from train.csv.
train

In [1]:
# Binarise the target: 1 where income equals '>50K', 0 otherwise.
train['income'] = train['income'].eq('>50K').astype('int')

In [1]:
# Stack train on top of test so each feature transformation is applied to both.
total = pd.concat(objs=[train, test], axis=0)

In [1]:
# Display the combined train + test frame.
total

In [1]:
# Remove the columns that are not used as model features.
total = total.drop(columns=['id', 'fnlwgt', 'education'])

# Age

In [1]:
# Bin edges run 15, 18, ..., 69 and then a final edge at 90, giving 19 intervals
# that are labelled 0..18.
age_bands = list(range(15, 72, 3)) + [90]
age_labels = list(range(19))
total['age_band'] = pd.cut(total['age'], bins=age_bands, labels=age_labels)

In [1]:
# The raw age is replaced by age_band, so drop it.
total = total.drop(columns='age')

# Workclass

In [1]:
# Collapse the 'Never-worked' and 'Without-pay' workclasses into a single 'others' level.
is_unpaid_workclass = total['workclass'].isin(['Never-worked', 'Without-pay'])
total.loc[is_unpaid_workclass, 'workclass'] = 'others'

In [1]:
# Display the combined frame to inspect the transformations so far.
total

In [1]:
# Inspect the distinct workclass values.
total['workclass'].unique()

# Education Num

In [1]:
# Hand-picked bin edges for education_num; each interval is labelled with its position.
edu_bands = [0, 1, 5, 8, 10, 12, 13, 14, 16]
edu_labels = list(range(len(edu_bands) - 1))
total['edu_band'] = pd.cut(total['education_num'], bins=edu_bands, labels=edu_labels)

In [1]:
# The raw education_num is replaced by edu_band, so drop it.
total = total.drop(columns='education_num')

# Capital Gain & Loss

In [1]:
# Log-transform capital_gain, mapping a gain of 0 to exactly 0.
# The previous per-element np.log(x, where=(x != 0)) had no `out=` buffer, so for
# x == 0 NumPy returned uninitialised memory instead of a defined value.
# Zeros are swapped for 1 before the log (log(1) == 0); missing values stay NaN.
total["log_capital_gain"] = np.log(
    total["capital_gain"].mask(total["capital_gain"] == 0, 1)
)

In [1]:
# Log-transform capital_loss, mapping a loss of 0 to exactly 0.
# The previous per-element np.log(x, where=(x != 0)) had no `out=` buffer, so for
# x == 0 NumPy returned uninitialised memory instead of a defined value.
# Zeros are swapped for 1 before the log (log(1) == 0); missing values stay NaN.
total["log_capital_loss"] = np.log(
    total["capital_loss"].mask(total["capital_loss"] == 0, 1)
)

In [1]:
# The raw capital columns are replaced by their log versions, so drop them.
total = total.drop(columns=['capital_gain', 'capital_loss'])

# Hours Per Week

In [1]:
# Bin edges every 5 hours from 0 to 60, then two wider edges at 80 and 100.
workhours_bands = list(range(0, 65, 5)) + [80, 100]
workhours_labels = list(range(len(workhours_bands) - 1))
total['workhours_band'] = pd.cut(
    total['hours_per_week'],
    bins=workhours_bands,
    labels=workhours_labels,
)

In [1]:
# The raw hours_per_week is replaced by workhours_band, so drop it.
total = total.drop(columns=['hours_per_week'])

# 7. Marital Status & Relationship

In [1]:
# Collapse relationship into two levels: spouses ('Normal') versus everything else listed.
relationship_groups = {
    'Normal': ['Husband', 'Wife'],
    'Abnormal': ['Own-child', 'Not-in-family', 'Unmarried', 'Other-relative'],
}
for relationship_group, relationship_members in relationship_groups.items():
    in_group = total['relationship'].isin(relationship_members)
    total.loc[in_group, 'relationship'] = relationship_group

In [1]:
# Collapse marital_status into two levels based on the listed original values.
marital_groups = {
    'Married_OK': ['Married-AF-spouse', 'Married-civ-spouse'],
    'Married_Fail': ['Married-spouse-absent', 'Divorced', 'Never-married', 'Separated', 'Widowed'],
}
for marital_group, marital_members in marital_groups.items():
    in_group = total['marital_status'].isin(marital_members)
    total.loc[in_group, 'marital_status'] = marital_group

# 8. Occupations

In [1]:
# Fold the 'Armed-Forces' occupation into 'Priv-house-serv'.
is_merged_occupation = total['occupation'].isin(['Armed-Forces', 'Priv-house-serv'])
total.loc[is_merged_occupation, 'occupation'] = 'Priv-house-serv'

# 9. Race

In [1]:
# Fold the 'Amer-Indian-Eskimo' race into 'Other'.
is_other_race = total['race'].isin(['Amer-Indian-Eskimo', 'Other'])
total.loc[is_other_race, 'race'] = 'Other'

# 10. Native Country

In [1]:
# Native-country groups, income_01 .. income_07, consumed by convert_country().
# NOTE(review): the group names suggest countries were bucketed by income level —
# the ranking criterion is not shown in this notebook; confirm.
income_01 = [
    'Outlying-US(Guam-USVI-etc)', 'Honduras', 'Columbia', 'Dominican-Republic',
    'Mexico', 'Guatemala', 'Portugal', 'Trinadad&Tobago',
    'Nicaragua', 'Peru', 'Vietnam', 'El-Salvador',
]

income_02 = [
    'Jamaica', 'Haiti', 'Puerto-Rico', 'Laos', 'Thailand',
    'Ecuador', 'Poland', 'South', 'Ireland', 'China',
]

# '?' is the only member of this group.
income_03 = ['?']

income_04 = ['United-States']

income_05 = ['Greece', 'Scotland', 'Cuba', 'Hungary', 'Hong']

income_06 = ['Philippines', 'Canada', 'England', 'Germany', 'Italy']

income_07 = ['India', 'Japan', 'France', 'Yugoslavia', 'Cambodia', 'Taiwan', 'Iran']

In [1]:
def convert_country(x, default='income_03'):
    """Map a native-country value to the name of the income group that lists it.

    Parameters
    ----------
    x : object
        A single native-country value.
    default : str, optional
        Group name returned when ``x`` is not a member of any of the lists
        ``income_01`` .. ``income_07``. Defaults to ``'income_03'``, the group
        that holds the unknown marker ``'?'``.

    Returns
    -------
    str
        ``'income_01'`` .. ``'income_07'`` for a listed country, otherwise
        ``default``.

    Notes
    -----
    Previously an unlisted country fell off the end of the if/elif chain and
    the function implicitly returned ``None``, silently injecting missing
    values into ``country_bin``. Unlisted values now share the bucket used for
    unknown countries.
    """
    # Checked in the same order as the original if/elif chain.
    country_groups = (
        ('income_01', income_01),
        ('income_02', income_02),
        ('income_03', income_03),
        ('income_04', income_04),
        ('income_05', income_05),
        ('income_06', income_06),
        ('income_07', income_07),
    )
    for group_name, countries in country_groups:
        if x in countries:
            return group_name
    return default

In [1]:
# Translate every native_country value into its income-group name.
total['country_bin'] = total['native_country'].map(convert_country)

In [1]:
# The raw native_country is replaced by country_bin, so drop it.
total = total.drop(columns=['native_country'])

# Features to Model

In [1]:
# List the columns that remain after feature engineering.
total.columns

In [1]:
# Model inputs: the categorical columns first, then the binned / log-transformed ones.
features = [
    'workclass',
    'marital_status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'age_band',
    'edu_band',
    'log_capital_gain',
    'log_capital_loss',
    'workhours_band',
    'country_bin',
]
# Target column, kept as a list so it can be concatenated with `features`.
label = ['income']

In [1]:
# Import only the PyCaret functions this notebook calls, instead of `import *`,
# so it is clear where each name comes from and nothing else is shadowed.
from pycaret.classification import (
    blend_models,
    compare_models,
    finalize_model,
    plot_model,
    predict_model,
    save_model,
    setup,
)

In [1]:
# Keep only the model inputs and the target. The explicit .copy() makes this an
# independent frame, so the column assignments in later cells do not operate on
# a slice of `total` (which triggers SettingWithCopyWarning).
total_carret = total[features + label].copy()

In [1]:
# Preview the first rows of the modelling frame.
total_carret.head()

In [1]:
# Cast the binned and log-transformed columns to float in one step.
# astype() with a mapping returns a new frame, so this avoids assigning column by
# column into what may be a slice of `total` (SettingWithCopyWarning).
total_carret = total_carret.astype({
    'age_band': 'float',
    'edu_band': 'float',
    'workhours_band': 'float',
    'log_capital_gain': 'float',
    'log_capital_loss': 'float',
})

In [1]:
# Split the combined frame back into its train and test parts by position
# (train rows were stacked first). Each part is an explicit copy so that later
# assignments to train_clean do not write into a slice of total_carret.
train_clean = total_carret.iloc[:len(train)].copy()
test_clean = total_carret.iloc[len(train):].copy()

In [1]:
# Cast the target to int. astype() with a mapping builds a new frame rather than
# assigning into train_clean, which may be a slice of total_carret.
train_clean = train_clean.astype({'income': 'int'})

In [1]:
# Preview the first rows of the cleaned training frame.
train_clean.head()

In [1]:
# Initialise the PyCaret experiment on the training rows with a fixed session id.
setup(
    data=train_clean,
    target='income',
    session_id=1234,
    silent=True,
)

In [1]:
# Rank the candidate models by F1 and keep the three best.
compared_model = compare_models(
    sort='F1',
    n_select=3,
)

In [1]:
# Blend the selected models with soft voting, evaluated over 5 folds.
blended_model = blend_models(
    estimator_list=compared_model,
    fold=5,
    method='soft',
)

In [1]:
# Finalise the blended model, then predict on the test rows.
final_model = finalize_model(estimator=blended_model)
ensemble_prediction = predict_model(estimator=final_model, data=test_clean)

In [1]:
# Persist the finalised model under the name 'final201209'.
save_model(model=final_model, model_name='final201209')

In [1]:
# Draw the AUC plot followed by the confusion matrix for the final model.
for plot_kind in ('auc', 'confusion_matrix'):
    plot_model(estimator=final_model, plot=plot_kind)

In [1]:
# Take the ids from the test file itself instead of fabricating 0..n-1 with
# np.arange, so the submission stays aligned even if the test ids are not a
# zero-based consecutive range. A length mismatch raises instead of passing silently.
ensemble_prediction['id'] = test['id'].to_numpy()
# Expose the predicted class under the column name used in the submission.
ensemble_prediction['prediction'] = ensemble_prediction['Label']

In [1]:
# Keep only the two columns written to the submission file.
ensemble_prediction = ensemble_prediction.loc[:, ['id', 'prediction']]

In [1]:
# Write the submission file without the DataFrame index.
ensemble_prediction.to_csv(path_or_buf='submission.csv', index=False)