In [1]:
import pandas as pd
import numpy as np
pd.options.mode.chained_assignment = None

## Load Dataset

In [2]:
train=pd.read_csv('Train.csv')
test=pd.read_csv('Test.csv')

In [3]:
train.shape,test.shape

((209499, 43), (89786, 42))

## Combine train and test data

In [4]:
df=pd.concat([train,test]).reset_index().drop(['index','ID'],axis=1)

In [5]:
df.replace(' ?',np.nan,inplace=True)

In [6]:
df.isnull().sum()[df.isnull().sum()>0]

class                             150324
education_institute               280367
unemployment_reason               290085
is_labor_union                    270742
occupation_code_main              150967
under_18_family                   216626
veterans_admin_questionnaire      296274
country_of_birth_own                5157
country_of_birth_father            10142
country_of_birth_mother             9191
migration_code_change_in_msa      151881
migration_prev_sunbelt            275818
migration_code_move_within_reg    151881
migration_code_change_in_reg      151881
residence_1_year_ago              151881
old_residence_reg                 275818
old_residence_state               276856
income_above_limit                 89786
dtype: int64

### Note:drop greater than 75% null value features(or columns)

In [7]:
drop=(df.isnull().sum()/len(df))[(df.isnull().sum()/len(df)).gt(0.75)].index.to_list()

In [8]:
drop

['education_institute',
 'unemployment_reason',
 'is_labor_union',
 'veterans_admin_questionnaire',
 'migration_prev_sunbelt',
 'old_residence_reg',
 'old_residence_state']

In [9]:
df.drop(drop,axis=1,inplace=True)

In [10]:
df.iloc[:,30:]

Unnamed: 0,migration_code_move_within_reg,migration_code_change_in_reg,residence_1_year_ago,importance_of_record,income_above_limit
0,,,,1779.74,Below limit
1,unchanged,unchanged,Same,2366.75,Below limit
2,unchanged,unchanged,Same,1693.42,Below limit
3,unchanged,unchanged,Same,1380.27,Below limit
4,,,,1580.79,Below limit
...,...,...,...,...,...
299280,,,,938.83,
299281,,,,1985.66,
299282,unchanged,unchanged,Same,552.74,
299283,,,,1917.71,


In [11]:
df.iloc[:,:10].employment_stat.unique()

array([0, 2, 1], dtype=int64)

## Handle catgorical columns

In [12]:
from sklearn.preprocessing import LabelEncoder,OrdinalEncoder

In [13]:
categorical_col=df.iloc[:,:-1].select_dtypes('object').columns

In [14]:
numerical_col=df.select_dtypes(['int64','float64']).columns

In [15]:
oe=OrdinalEncoder()

In [16]:
df[categorical_col]=oe.fit_transform(df[categorical_col])

In [17]:
df.income_above_limit=df.income_above_limit.map({'Below limit':0,'Above limit':1})

In [18]:
df.head()

Unnamed: 0,age,gender,education,class,marital_status,race,is_hispanic,employment_commitment,employment_stat,wage_per_hour,...,mig_year,country_of_birth_own,country_of_birth_father,country_of_birth_mother,migration_code_change_in_msa,migration_code_move_within_reg,migration_code_change_in_reg,residence_1_year_ago,importance_of_record,income_above_limit
0,79,0.0,12.0,,6.0,4.0,0.0,2.0,0,0,...,95,41.0,41.0,41.0,,,,,1779.74,0.0
1,65,0.0,12.0,,6.0,4.0,0.0,0.0,0,0,...,94,41.0,41.0,41.0,7.0,7.0,6.0,1.0,2366.75,0.0
2,21,1.0,2.0,0.0,4.0,2.0,0.0,0.0,0,500,...,94,41.0,41.0,41.0,7.0,7.0,6.0,1.0,1693.42,0.0
3,2,0.0,10.0,,4.0,1.0,0.0,0.0,0,0,...,94,41.0,18.0,18.0,7.0,7.0,6.0,1.0,1380.27,0.0
4,70,1.0,12.0,,2.0,4.0,0.0,2.0,0,0,...,95,41.0,41.0,41.0,,,,,1580.79,0.0


## Handle Missing value

In [19]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer,KNNImputer,SimpleImputer

In [20]:
df[numerical_col].isnull().sum()

age                      0
employment_stat          0
wage_per_hour            0
working_week_per_year    0
industry_code            0
occupation_code          0
total_employed           0
vet_benefit              0
gains                    0
losses                   0
stocks_status            0
mig_year                 0
importance_of_record     0
dtype: int64

In [21]:
df[categorical_col].isnull().sum()

gender                                 0
education                              0
class                             150324
marital_status                         0
race                                   0
is_hispanic                            0
employment_commitment                  0
industry_code_main                     0
occupation_code_main              150967
household_stat                         0
household_summary                      0
under_18_family                   216626
tax_status                             0
citizenship                            0
country_of_birth_own                5157
country_of_birth_father            10142
country_of_birth_mother             9191
migration_code_change_in_msa      151881
migration_code_move_within_reg    151881
migration_code_change_in_reg      151881
residence_1_year_ago              151881
dtype: int64

In [22]:
col=(df[categorical_col].isnull().sum()/len(df[categorical_col]))[(df[categorical_col].isnull().sum()/len(df[categorical_col])).gt(0)].index.to_list()
col

['class',
 'occupation_code_main',
 'under_18_family',
 'country_of_birth_own',
 'country_of_birth_father',
 'country_of_birth_mother',
 'migration_code_change_in_msa',
 'migration_code_move_within_reg',
 'migration_code_change_in_reg',
 'residence_1_year_ago']

In [23]:
for i in col:
    df[i]=df[i].fillna(df[i].mode()[0])

In [24]:
df

Unnamed: 0,age,gender,education,class,marital_status,race,is_hispanic,employment_commitment,employment_stat,wage_per_hour,...,mig_year,country_of_birth_own,country_of_birth_father,country_of_birth_mother,migration_code_change_in_msa,migration_code_move_within_reg,migration_code_change_in_reg,residence_1_year_ago,importance_of_record,income_above_limit
0,79,0.0,12.0,3.0,6.0,4.0,0.0,2.0,0,0,...,95,41.0,41.0,41.0,7.0,7.0,6.0,1.0,1779.74,0.0
1,65,0.0,12.0,3.0,6.0,4.0,0.0,0.0,0,0,...,94,41.0,41.0,41.0,7.0,7.0,6.0,1.0,2366.75,0.0
2,21,1.0,2.0,0.0,4.0,2.0,0.0,0.0,0,500,...,94,41.0,41.0,41.0,7.0,7.0,6.0,1.0,1693.42,0.0
3,2,0.0,10.0,3.0,4.0,1.0,0.0,0.0,0,0,...,94,41.0,18.0,18.0,7.0,7.0,6.0,1.0,1380.27,0.0
4,70,1.0,12.0,3.0,2.0,4.0,0.0,2.0,0,0,...,95,41.0,41.0,41.0,7.0,7.0,6.0,1.0,1580.79,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299280,5,1.0,10.0,3.0,4.0,4.0,0.0,0.0,0,0,...,95,41.0,41.0,41.0,7.0,7.0,6.0,1.0,938.83,
299281,81,0.0,5.0,3.0,2.0,4.0,0.0,2.0,0,0,...,95,41.0,41.0,41.0,7.0,7.0,6.0,1.0,1985.66,
299282,65,0.0,12.0,0.0,2.0,4.0,8.0,0.0,0,0,...,94,41.0,41.0,41.0,7.0,7.0,6.0,1.0,552.74,
299283,38,0.0,12.0,3.0,1.0,4.0,0.0,1.0,2,0,...,95,41.0,41.0,41.0,7.0,7.0,6.0,1.0,1917.71,


In [25]:
df[categorical_col].isnull().sum()

gender                            0
education                         0
class                             0
marital_status                    0
race                              0
is_hispanic                       0
employment_commitment             0
industry_code_main                0
occupation_code_main              0
household_stat                    0
household_summary                 0
under_18_family                   0
tax_status                        0
citizenship                       0
country_of_birth_own              0
country_of_birth_father           0
country_of_birth_mother           0
migration_code_change_in_msa      0
migration_code_move_within_reg    0
migration_code_change_in_reg      0
residence_1_year_ago              0
dtype: int64

## Make cluster [5,8,11,15]

In [26]:
df_clust=df.copy()

In [27]:
from sklearn.cluster import KMeans
for i in [5,8,11,15]:
    clust=KMeans(n_clusters=i,max_iter=1000,random_state=899)
    df[f'clust_{i}']=clust.fit_predict(df_clust.iloc[:,:-1])

In [28]:
df

Unnamed: 0,age,gender,education,class,marital_status,race,is_hispanic,employment_commitment,employment_stat,wage_per_hour,...,migration_code_change_in_msa,migration_code_move_within_reg,migration_code_change_in_reg,residence_1_year_ago,importance_of_record,income_above_limit,clust_5,clust_8,clust_11,clust_15
0,79,0.0,12.0,3.0,6.0,4.0,0.0,2.0,0,0,...,7.0,7.0,6.0,1.0,1779.74,0.0,0,4,0,3
1,65,0.0,12.0,3.0,6.0,4.0,0.0,0.0,0,0,...,7.0,7.0,6.0,1.0,2366.75,0.0,0,0,0,3
2,21,1.0,2.0,0.0,4.0,2.0,0.0,0.0,0,500,...,7.0,7.0,6.0,1.0,1693.42,0.0,0,4,0,3
3,2,0.0,10.0,3.0,4.0,1.0,0.0,0.0,0,0,...,7.0,7.0,6.0,1.0,1380.27,0.0,0,4,5,0
4,70,1.0,12.0,3.0,2.0,4.0,0.0,2.0,0,0,...,7.0,7.0,6.0,1.0,1580.79,0.0,0,4,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299280,5,1.0,10.0,3.0,4.0,4.0,0.0,0.0,0,0,...,7.0,7.0,6.0,1.0,938.83,,0,4,5,0
299281,81,0.0,5.0,3.0,2.0,4.0,0.0,2.0,0,0,...,7.0,7.0,6.0,1.0,1985.66,,0,4,0,3
299282,65,0.0,12.0,0.0,2.0,4.0,8.0,0.0,0,0,...,7.0,7.0,6.0,1.0,552.74,,0,4,5,0
299283,38,0.0,12.0,3.0,1.0,4.0,0.0,1.0,2,0,...,7.0,7.0,6.0,1.0,1917.71,,0,4,0,3


## Split data in train and test

In [29]:
train=df[~df.income_above_limit.isnull()]
test=df[df.income_above_limit.isnull()].dropna(axis=1)

In [30]:
train.income_above_limit=train.income_above_limit.astype('int64')

In [31]:
train.head()

Unnamed: 0,age,gender,education,class,marital_status,race,is_hispanic,employment_commitment,employment_stat,wage_per_hour,...,migration_code_change_in_msa,migration_code_move_within_reg,migration_code_change_in_reg,residence_1_year_ago,importance_of_record,income_above_limit,clust_5,clust_8,clust_11,clust_15
0,79,0.0,12.0,3.0,6.0,4.0,0.0,2.0,0,0,...,7.0,7.0,6.0,1.0,1779.74,0,0,4,0,3
1,65,0.0,12.0,3.0,6.0,4.0,0.0,0.0,0,0,...,7.0,7.0,6.0,1.0,2366.75,0,0,0,0,3
2,21,1.0,2.0,0.0,4.0,2.0,0.0,0.0,0,500,...,7.0,7.0,6.0,1.0,1693.42,0,0,4,0,3
3,2,0.0,10.0,3.0,4.0,1.0,0.0,0.0,0,0,...,7.0,7.0,6.0,1.0,1380.27,0,0,4,5,0
4,70,1.0,12.0,3.0,2.0,4.0,0.0,2.0,0,0,...,7.0,7.0,6.0,1.0,1580.79,0,0,4,0,3


In [32]:
test.head()

Unnamed: 0,age,gender,education,class,marital_status,race,is_hispanic,employment_commitment,employment_stat,wage_per_hour,...,country_of_birth_mother,migration_code_change_in_msa,migration_code_move_within_reg,migration_code_change_in_reg,residence_1_year_ago,importance_of_record,clust_5,clust_8,clust_11,clust_15
209499,54,1.0,12.0,3.0,2.0,4.0,0.0,0.0,0,600,...,41.0,7.0,7.0,6.0,1.0,3388.96,0,0,7,14
209500,53,1.0,4.0,3.0,2.0,4.0,1.0,1.0,0,0,...,7.0,7.0,7.0,6.0,1.0,1177.55,0,4,5,0
209501,42,1.0,9.0,3.0,2.0,4.0,0.0,1.0,1,0,...,41.0,7.0,7.0,6.0,1.0,4898.55,3,3,2,7
209502,16,0.0,6.0,3.0,4.0,4.0,0.0,0.0,0,0,...,41.0,7.0,7.0,6.0,1.0,1391.44,0,4,5,0
209503,16,1.0,6.0,3.0,4.0,4.0,0.0,2.0,0,0,...,41.0,7.0,7.0,6.0,1.0,1933.18,0,4,0,3


In [33]:
train.income_above_limit.value_counts()

0    196501
1     12998
Name: income_above_limit, dtype: int64

## Model train

In [34]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from catboost import CatBoostClassifier

#### one of the best 0.620441242( public score on 0.30 boundary ) with 100% data

### 80% data analysis

In [35]:
X_train,X_test,y_train,y_test=train_test_split(train.drop('income_above_limit',axis=1),train.income_above_limit,train_size=0.8,random_state=0,stratify=train.income_above_limit)
params={
 'iterations': 5000,
 'random_state':0,
 'learning_rate': 0.02307399921119213
}
model=CatBoostClassifier(**params)
model.fit(X_train,y_train,verbose=False)
model.score(X_train,y_train),model.score(X_test,y_test),sum(model.predict(test))

(0.9704234512139094, 0.9583054892601433, 3542)

In [36]:
print(classification_report(y_test,model.predict(X_test))) #80% data

              precision    recall  f1-score   support

           0       0.97      0.99      0.98     39300
           1       0.75      0.49      0.59      2600

    accuracy                           0.96     41900
   macro avg       0.86      0.74      0.79     41900
weighted avg       0.95      0.96      0.95     41900



In [37]:
# Public score at 0.30 boudary: 0.61065943 (iterations:1000)

### 100% data anaysis

In [38]:
X,y=train.drop('income_above_limit',axis=1),train.income_above_limit
params={
 'iterations': 5000,
 'random_state':0,
 'learning_rate': 0.02307399921119213
}
model=CatBoostClassifier(**params)
model.fit(X,y,verbose=False) 
model.score(X_train,y_train),model.score(X_test,y_test),sum(model.predict(test))

(0.9688959957994976, 0.9678997613365156, 3545)

In [39]:
print(classification_report(y_test,model.predict(X_test))) #100% data

              precision    recall  f1-score   support

           0       0.97      0.99      0.98     39300
           1       0.86      0.57      0.69      2600

    accuracy                           0.97     41900
   macro avg       0.92      0.78      0.84     41900
weighted avg       0.97      0.97      0.96     41900



In [40]:
# Public score at 0.30 boudary: 0.620441242 (iterations:5000)

In [41]:
pred=np.where(model.predict_proba(test)[:,1]>0.30,1,0)
sum(pred)

5715

In [42]:
new_test=pd.read_csv('Test.csv')

In [43]:
submit=pd.DataFrame(new_test.ID)

In [44]:
submit['income_above_limit']=pred

In [45]:
submit

Unnamed: 0,ID,income_above_limit
0,ID_TZ209499,0
1,ID_TZ209500,0
2,ID_TZ209501,1
3,ID_TZ209502,0
4,ID_TZ209503,0
...,...,...
89781,ID_TZ299280,0
89782,ID_TZ299281,0
89783,ID_TZ299282,0
89784,ID_TZ299283,0


In [46]:
# submit.to_csv('submit.csv',index=False)

# -------------------------------------------