In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
train =pd.read_csv('train.csv')

In [3]:
train.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income_>50K
0,67,Private,366425,Doctorate,16,Divorced,Exec-managerial,Not-in-family,White,Male,99999,0,60,United-States,1
1,17,Private,244602,12th,8,Never-married,Other-service,Own-child,White,Male,0,0,15,United-States,0
2,31,Private,174201,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,1
3,58,State-gov,110199,7th-8th,4,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,40,United-States,0
4,25,State-gov,149248,Some-college,10,Never-married,Other-service,Not-in-family,Black,Male,0,0,40,United-States,0


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43957 entries, 0 to 43956
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              43957 non-null  int64 
 1   workclass        41459 non-null  object
 2   fnlwgt           43957 non-null  int64 
 3   education        43957 non-null  object
 4   educational-num  43957 non-null  int64 
 5   marital-status   43957 non-null  object
 6   occupation       41451 non-null  object
 7   relationship     43957 non-null  object
 8   race             43957 non-null  object
 9   gender           43957 non-null  object
 10  capital-gain     43957 non-null  int64 
 11  capital-loss     43957 non-null  int64 
 12  hours-per-week   43957 non-null  int64 
 13  native-country   43194 non-null  object
 14  income_>50K      43957 non-null  int64 
dtypes: int64(7), object(8)
memory usage: 5.0+ MB


In [5]:
train.isna().sum()

age                   0
workclass          2498
fnlwgt                0
education             0
educational-num       0
marital-status        0
occupation         2506
relationship          0
race                  0
gender                0
capital-gain          0
capital-loss          0
hours-per-week        0
native-country      763
income_>50K           0
dtype: int64

In [6]:
train = train.fillna(-1)

In [7]:
train.isna().sum()

age                0
workclass          0
fnlwgt             0
education          0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income_>50K        0
dtype: int64

In [8]:

cat_feats = train.select_dtypes(include=[np.object]).columns

for cat_feat in cat_feats:
    train['{0}_cat'.format(cat_feat)] = pd.factorize(train[cat_feat])[0]
 

In [9]:
train.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,...,native-country,income_>50K,workclass_cat,education_cat,marital-status_cat,occupation_cat,relationship_cat,race_cat,gender_cat,native-country_cat
0,67,Private,366425,Doctorate,16,Divorced,Exec-managerial,Not-in-family,White,Male,...,United-States,1,0,0,0,0,0,0,0,0
1,17,Private,244602,12th,8,Never-married,Other-service,Own-child,White,Male,...,United-States,0,0,1,1,1,1,0,0,0
2,31,Private,174201,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,...,United-States,1,0,2,2,0,2,0,0,0
3,58,State-gov,110199,7th-8th,4,Married-civ-spouse,Transport-moving,Husband,White,Male,...,United-States,0,1,3,2,2,2,0,0,0
4,25,State-gov,149248,Some-college,10,Never-married,Other-service,Not-in-family,Black,Male,...,United-States,0,1,4,1,1,0,1,0,0


In [10]:

def middle_age_man(row):
    good_age = range(35,60)
    return int(good_age.count(row['age']) and row['gender'] == True)

def working_vs_ocuppation(row):
    if (row['working_less_or_more_4'] in [3,4]) and (row['occupation_cat'] in [0,1,3,5,10]):
        return row['working_less_or_more_4'] + row['occupation_cat']
    else:
        return 0

def working_more_4(row):
    if row['hours-per-week'] < 35:
        more = 0
    elif row['hours-per-week'] >=35 and  row['hours-per-week'] <= 42:
        more = 1
    elif row['hours-per-week'] > 42 and row['hours-per-week'] <= 73:
        more = 3 
    elif row['hours-per-week'] > 73:
        more = 2
    else:
        more = 'to_check'
    return more

def occupation_2(row): #new
    return int(row['occupation'] in ['Prof-specialty','Exec-managerial'] and row['gender'] == True)
        
train['Education_is_high'] = (train.education.isin(['Masters', 'Doctorate', 'Prof-school']))
train['middle_age_man'] = train.apply(middle_age_man, axis=1)
train['is_US'] = (train['native-country'].isin(['United-States']))
train['working_less_or_more_4'] = train.apply(working_more_4, axis = 1)
train['working_vs_ocuppation_test'] = train.apply(working_vs_ocuppation, axis = 1)
train['husband_wife'] = train['relationship'].map(lambda x: int((x =='Husband') | (x == 'Wife') )) #new
train['occupation_2'] = train.apply(occupation_2, axis=1) #new


In [11]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43957 entries, 0 to 43956
Data columns (total 30 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   age                         43957 non-null  int64 
 1   workclass                   43957 non-null  object
 2   fnlwgt                      43957 non-null  int64 
 3   education                   43957 non-null  object
 4   educational-num             43957 non-null  int64 
 5   marital-status              43957 non-null  object
 6   occupation                  43957 non-null  object
 7   relationship                43957 non-null  object
 8   race                        43957 non-null  object
 9   gender                      43957 non-null  object
 10  capital-gain                43957 non-null  int64 
 11  capital-loss                43957 non-null  int64 
 12  hours-per-week              43957 non-null  int64 
 13  native-country              43957 non-null  ob

In [12]:
train.drop('gender_cat', axis=1, inplace=True)

In [13]:
features = train.select_dtypes(include=['int8', 'int16', 'int64', 'bool']).columns.values

In [14]:
features

array(['age', 'fnlwgt', 'educational-num', 'capital-gain', 'capital-loss',
       'hours-per-week', 'income_>50K', 'workclass_cat', 'education_cat',
       'marital-status_cat', 'occupation_cat', 'relationship_cat',
       'race_cat', 'native-country_cat', 'Education_is_high',
       'middle_age_man', 'is_US', 'working_less_or_more_4',
       'working_vs_ocuppation_test', 'husband_wife', 'occupation_2'],
      dtype=object)

In [15]:
features = features[features!=['income_>50K']]

In [16]:
features

array(['age', 'fnlwgt', 'educational-num', 'capital-gain', 'capital-loss',
       'hours-per-week', 'workclass_cat', 'education_cat',
       'marital-status_cat', 'occupation_cat', 'relationship_cat',
       'race_cat', 'native-country_cat', 'Education_is_high',
       'middle_age_man', 'is_US', 'working_less_or_more_4',
       'working_vs_ocuppation_test', 'husband_wife', 'occupation_2'],
      dtype=object)

In [17]:
X = train[ features ].values
y = train['income_>50K'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2019)


In [18]:
print("Train:", X_train.shape, y_train.shape)
print("Test:", X_test.shape, y_test.shape)

Train: (30769, 20) (30769,)
Test: (13188, 20) (13188,)


In [19]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

from sklearn.metrics import precision_score, recall_score

In [20]:
def train_and_predict_model(X_train, X_test, y_train, y_test, model, success_metric=accuracy_score):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    print("Distribution:")
    print( pd.Series(y_pred).value_counts() )
    
    return success_metric(y_test, y_pred)

#model = DecisionTreeClassifier(max_depth=10, random_state=2019)
model = RandomForestClassifier(max_depth = 9, n_estimators = 40, random_state=2019)

train_and_predict_model(X_train, X_test, y_train, y_test, model)

Distribution:
0    10995
1     2193
dtype: int64


0.860858356081286

In [21]:
y_pred = pd.DataFrame(model.predict(X_test))

In [22]:
y_pred

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0
...,...
13183,0
13184,0
13185,0
13186,1


In [23]:
test =pd.read_csv('test.csv')

In [24]:
test.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country
0,39,Self-emp-not-inc,327120,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,Portugal
1,32,Private,123253,Assoc-acdm,12,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,42,United-States
2,47,Private,232628,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,Black,Male,0,0,40,United-States
3,19,Private,374262,12th,8,Never-married,Handlers-cleaners,Own-child,White,Male,0,0,20,United-States
4,46,Self-emp-not-inc,311231,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,40,United-States


In [25]:
test.isna().sum()

age                0
workclass          0
fnlwgt             0
education          0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
dtype: int64

In [26]:
cat_feats = test.select_dtypes(include=[np.object]).columns

for cat_feat in cat_feats:
    test['{0}_cat'.format(cat_feat)] = pd.factorize(test[cat_feat])[0]
 

In [27]:
test.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,...,hours-per-week,native-country,workclass_cat,education_cat,marital-status_cat,occupation_cat,relationship_cat,race_cat,gender_cat,native-country_cat
0,39,Self-emp-not-inc,327120,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,...,40,Portugal,0,0,0,0,0,0,0,0
1,32,Private,123253,Assoc-acdm,12,Married-civ-spouse,Craft-repair,Husband,White,Male,...,42,United-States,1,1,0,0,0,0,0,1
2,47,Private,232628,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,Black,Male,...,40,United-States,1,0,0,0,0,1,0,1
3,19,Private,374262,12th,8,Never-married,Handlers-cleaners,Own-child,White,Male,...,20,United-States,1,2,1,1,1,0,0,1
4,46,Self-emp-not-inc,311231,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,...,40,United-States,0,0,0,2,0,0,0,1


In [28]:

def middle_age_man(row):
    good_age = range(35,60)
    return int(good_age.count(row['age']) and row['gender'] == True)

def working_vs_ocuppation(row):
    if (row['working_less_or_more_4'] in [3,4]) and (row['occupation_cat'] in [0,1,3,5,10]):
        return row['working_less_or_more_4'] + row['occupation_cat']
    else:
        return 0

def working_more_4(row):
    if row['hours-per-week'] < 35:
        more = 0
    elif row['hours-per-week'] >=35 and  row['hours-per-week'] <= 42:
        more = 1
    elif row['hours-per-week'] > 42 and row['hours-per-week'] <= 73:
        more = 3 
    elif row['hours-per-week'] > 73:
        more = 2
    else:
        more = 'to_check'
    return more

def occupation_2(row): #new
    return int(row['occupation'] in ['Prof-specialty','Exec-managerial'] and row['gender'] == True)
        
test['Education_is_high'] = (test.education.isin(['Masters', 'Doctorate', 'Prof-school']))
test['middle_age_man'] = test.apply(middle_age_man, axis=1)
test['is_US'] = (test['native-country'].isin(['United-States']))
test['working_less_or_more_4'] = test.apply(working_more_4, axis = 1)
test['working_vs_ocuppation_test'] = test.apply(working_vs_ocuppation, axis = 1)
test['husband_wife'] = test['relationship'].map(lambda x: int((x =='Husband') | (x == 'Wife') )) #new
test['occupation_2'] = test.apply(occupation_2, axis=1) #new


In [29]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 899 entries, 0 to 898
Data columns (total 29 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   age                         899 non-null    int64 
 1   workclass                   899 non-null    object
 2   fnlwgt                      899 non-null    int64 
 3   education                   899 non-null    object
 4   educational-num             899 non-null    int64 
 5   marital-status              899 non-null    object
 6   occupation                  899 non-null    object
 7   relationship                899 non-null    object
 8   race                        899 non-null    object
 9   gender                      899 non-null    object
 10  capital-gain                899 non-null    int64 
 11  capital-loss                899 non-null    int64 
 12  hours-per-week              899 non-null    int64 
 13  native-country              899 non-null    object

In [30]:
test.drop('gender_cat', axis=1, inplace=True)

In [31]:
featuresss = test.select_dtypes(include=['int8', 'int16', 'int64', 'bool']).columns.values

In [32]:
featuresss

array(['age', 'fnlwgt', 'educational-num', 'capital-gain', 'capital-loss',
       'hours-per-week', 'workclass_cat', 'education_cat',
       'marital-status_cat', 'occupation_cat', 'relationship_cat',
       'race_cat', 'native-country_cat', 'Education_is_high',
       'middle_age_man', 'is_US', 'working_less_or_more_4',
       'working_vs_ocuppation_test', 'husband_wife', 'occupation_2'],
      dtype=object)

In [33]:
featuresss = featuresss[featuresss!=['income_>50K']]

In [34]:
X_new = test[ featuresss ].values

In [35]:
test['y_new_pred']= pd.DataFrame(model.predict(X_new))

In [36]:
y_new_pred= pd.DataFrame(model.predict(X_new))

In [37]:
test.to_csv('output1.csv')