In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, accuracy_score, confusion_matrix
import warnings
warnings.simplefilter('ignore')

file_path = '../input/income-classification/income_evaluation.csv'

# skipinitialspace drops blanks that follow a delimiter; the column names are
# additionally stripped so they can be addressed without stray whitespace.
# Chained (non-inplace) calls keep this cell safe to re-run.
df = (
    pd.read_csv(file_path, skipinitialspace=True)
    .drop_duplicates()
    .rename(columns=lambda x: x.strip())
)

print(df.columns)

# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line.
df.info()

# Put the head and tail previews on separate lines instead of running them together.
print(df.head(3), df.tail(3), sep='\n')

df.describe()

In [None]:
# Columns fed to the models as-is ('sex' is encoded to 0/1 further down the notebook).
num_features = [
    'sex',
    'age',
    'fnlwgt',
    'education-num',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
]

# Columns that are one-hot encoded before modelling.
cat_features = [
    'workclass',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'native-country',
]




## Analysis of the Target

In [None]:
# Per-class row counts, read from the first column of the grouped counts
# (first group -> x_0, second group -> x_1).
income_counts = df.groupby(['income']).count()
x_0, x_1 = income_counts.iloc[0, 0], income_counts.iloc[1, 0]

labels = ['income <= 50k', 'income > 50k']
pie, ax = plt.subplots(figsize=[10, 6])
ax.pie([x_0, x_1], labels=labels, autopct="%.1f%%", pctdistance=0.5)
ax.set_title("Income", fontsize=14)
pie.savefig("income.png")

## Numerical Features

In [None]:
# Encode sex as 0/1. Values that are not keys of the mapping (in particular the
# already-encoded 0/1) pass through unchanged, so re-running this cell no longer
# turns the whole column into NaN the way a plain dict `.map` does on its
# second execution.
sex_codes = {"Female": 0, "Male": 1}
df['sex'] = df['sex'].map(lambda value: sex_codes.get(value, value))

# Pie chart of the row share per encoded sex value (slice labels are the codes).
sex_counts = df.groupby(['sex']).count()
pie, ax = plt.subplots(figsize=[10,6])
labels = sex_counts.index.values
plt.pie(x = sex_counts['age'].values, autopct="%.1f%%", labels = labels, pctdistance=0.5)
plt.title("Gender", fontsize=14);
pie.savefig("gender.png")

In [None]:
# Distribution of age in 15 equal-width bins.
hist, ax = plt.subplots(figsize=[10, 6])
ax.hist(df['age'].values, bins=15)
ax.set_title("Age", fontsize=14)
hist.savefig("age.png")

In [None]:
sns.boxplot(x=df['fnlwgt'].values)

# Rows that are not strictly below the 600k cut-off are reported and then dropped.
below_cutoff = df['fnlwgt'] < 600000
n_rows = df['fnlwgt'].size
n_below = int(below_cutoff.sum())
print('Percentage of rows with fnlwgt greater than 600k : %.2f' % ((n_rows - n_below) / n_rows * 100))

df = df[below_cutoff]

# Histogram of the remaining values.
hist, ax = plt.subplots(figsize=[10, 6])
ax.hist(df['fnlwgt'].values, bins=15)
ax.set_title("fnlwgt (truncated)", fontsize=14)
hist.savefig("fnlwgt.png")

In [None]:
# Box plot followed by a 15-bin histogram of education-num.
education_num = df['education-num'].values
sns.boxplot(x=education_num)

hist, ax = plt.subplots(figsize=[10, 6])
ax.hist(education_num, bins=15)
ax.set_title("education-num", fontsize=14)
hist.savefig("education-num.png")

In [None]:
sns.boxplot(x=df['capital-gain'].values)

# Share of rows whose capital-gain is not exactly zero.
n_rows = df['capital-gain'].size
n_zero = int((df['capital-gain'] == 0).sum())
print('Percentage of rows with capital-gain greater than 0 : %.2f' % ((n_rows - n_zero) / n_rows * 100))

# Raw distribution; this figure is only displayed, not written to disk.
hist, ax = plt.subplots(figsize=[10, 6])
ax.hist(df['capital-gain'].values, bins=15)
ax.set_title("capital-gain", fontsize=14)

# Collapse the column to an indicator: every positive amount becomes 1.
has_gain = df['capital-gain'] > 0
df.loc[has_gain, 'capital-gain'] = 1

# Pie chart of the resulting groups.
gain_counts = df.groupby(['capital-gain']).count()['age'].values
labels = ['capital-gain = 0', 'capital-gain > 0']
pie, ax = plt.subplots(figsize=[10, 6])
ax.pie(gain_counts, labels=labels, autopct="%.1f%%", pctdistance=0.5)
ax.set_title("capital-gain", fontsize=14)
pie.savefig("capital-gain.png")

In [None]:
# Distribution of weekly working hours in 15 equal-width bins.
hist, ax = plt.subplots(figsize=[10, 6])
ax.hist(df['hours-per-week'].values, bins=15)
ax.set_title("hours-per-week", fontsize=14)
hist.savefig("hours-per-week.png")

In [None]:
sns.boxplot(x = df['capital-loss'].values)

# Share of rows whose capital-loss is not exactly zero. The denominator is the
# capital-loss column itself (it previously used the capital-gain column, a
# copy-paste leftover from the capital-gain cell).
print('Percentage of rows with capital-loss greater than 0 : %.2f' % ((df['capital-loss'].size - df['capital-loss'][df['capital-loss'] == 0].size)/df['capital-loss'].size *100))

# Raw distribution; this figure is only displayed, not written to disk.
hist, ax = plt.subplots(figsize=[10,6])
plt.hist(x = df['capital-loss'].values, bins = 15, )
plt.title("capital-loss", fontsize=14);

# Collapse the column to an indicator: every positive amount becomes 1.
df.loc[(df['capital-loss'] > 0),'capital-loss'] = 1

# Pie chart of the resulting groups.
pie, ax = plt.subplots(figsize=[10,6])
labels = ['capital-loss = 0', 'capital-loss > 0']
plt.pie(x = df.groupby(['capital-loss']).count()['age'].values, autopct="%.1f%%", labels = labels, pctdistance=0.5)
plt.title("capital-loss", fontsize=14);
pie.savefig("capital-loss.png")

## Categorical Features

In [None]:
# Remove the two workclass levels that are excluded from the analysis.
df = df[~df['workclass'].isin(['Never-worked', 'Without-pay'])]

# Pie chart of the remaining workclass levels.
workclass_counts = df.groupby(['workclass']).count()
labels = workclass_counts.index.values
pie, ax = plt.subplots(figsize=[10, 6])
ax.pie(workclass_counts['education'].values, labels=labels, autopct="%.1f%%", pctdistance=0.7)
ax.set_title("workclasses", fontsize=14)
pie.savefig("workclass.png")


In [None]:
pie, ax = plt.subplots(figsize=[10,6])
labels = df.groupby(['education']).count().index.values
plt.pie(x = df.groupby(['education']).count()['age'].values, autopct="%.1f%%", labels = labels, pctdistance=0.7)
plt.title("education", fontsize=14);
pie.savefig("education.png")

df.education.unique()

In [None]:
# Bar chart of row counts per education level (grouping computed once and reused).
education_sizes = df.groupby(['education']).count()['age']
sns.barplot(x=education_sizes.index.values, y=education_sizes.values)

In [None]:
# Pie chart of the row share per marital status (before the levels are merged).
marital_counts = df.groupby(['marital-status']).count()
pie, ax = plt.subplots(figsize=[10,6])
labels = marital_counts.index.values
plt.pie(x = marital_counts['age'].values, autopct="%.1f%%", labels = labels, pctdistance=0.7)
plt.title("marital-status", fontsize=14);
pie.savefig("marital-status.png")

# Merge the listed levels into a single 'others' level.
# The result is assigned back to the column: `df[col].replace(..., inplace=True)`
# operates on an intermediate Series and does not update `df` under pandas
# Copy-on-Write.
df['marital-status'] = df['marital-status'].replace({'Married-AF-spouse':'others', 'Widowed':'others','Separated':'others','Married-spouse-absent':'others'})


In [None]:
pie, ax = plt.subplots(figsize=[10,6])
labels = df.groupby(['occupation']).count().index.values
plt.pie(x = df.groupby(['occupation']).count()['age'].values, autopct="%.1f%%", labels = labels, pctdistance=0.7)
plt.title("occupation", fontsize=14);
pie.savefig("occupation.png")
df = df[df.occupation != 'Armed-Forces']
df = df[df.occupation != 'Priv-house-serv']

In [None]:
# Pie chart of the row share per relationship value (before the levels are merged).
relationship_counts = df.groupby(['relationship']).count()
pie, ax = plt.subplots(figsize=[10,6])
labels = relationship_counts.index.values
plt.pie(x = relationship_counts['age'].values, autopct="%.1f%%", labels = labels, pctdistance=0.7)
plt.title("relationship", fontsize=14);
pie.savefig("relationship.png")

# Merge 'Husband' and 'Wife' into one level.
# The result is assigned back to the column: `df[col].replace(..., inplace=True)`
# operates on an intermediate Series and does not update `df` under pandas
# Copy-on-Write.
df['relationship'] = df['relationship'].replace({'Husband':'married-no-child','Wife':'married-no-child'})

In [None]:
# Pie chart of the row share per race value (before the levels are merged).
race_counts = df.groupby(['race']).count()
pie, ax = plt.subplots(figsize=[10,6])
labels = race_counts.index.values
plt.pie(x = race_counts['age'].values, autopct="%.1f%%", labels = labels, pctdistance=0.7)
plt.title("race", fontsize=14);
pie.savefig("race.png")

# Fold the two listed levels into the existing 'Other' level.
# The result is assigned back to the column: `df[col].replace(..., inplace=True)`
# operates on an intermediate Series and does not update `df` under pandas
# Copy-on-Write.
df['race'] = df['race'].replace({'Asian-Pac-Islander':'Other', 'Amer-Indian-Eskimo':'Other'})

In [None]:
# Pie chart of the row share per native country (all levels still present).
country_counts = df.groupby(['native-country']).count()
labels = country_counts.index.values
pie, ax = plt.subplots(figsize=[10, 6])
ax.pie(country_counts['age'].values, labels=labels, autopct="%.1f%%", pctdistance=0.7)
ax.set_title("native-country", fontsize=14)
pie.savefig("native-country.png")

# Every value other than 'United-States' is pooled into a single 'others' level.
not_united_states = df['native-country'] != 'United-States'
df.loc[not_united_states, 'native-country'] = 'others'

In [None]:
# Inspect the last five rows after all the cleaning steps above.
df.tail(n=5)

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Apply one-hot encoder to each column with categorical data.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and later removed the old
# keyword, so try the new name first and fall back for older releases.
try:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

OH_values = OH_encoder.fit_transform(df[cat_features])

# Give the encoded columns string names. Leaving the default integer labels next
# to the string-named numeric columns produces mixed-type column names, which
# newer scikit-learn estimators reject when fitting on a DataFrame.
if hasattr(OH_encoder, 'get_feature_names_out'):
    OH_columns = OH_encoder.get_feature_names_out(cat_features)
else:
    OH_columns = OH_encoder.get_feature_names(cat_features)

# The encoder returns a bare array, so the original row index is restored here
# to keep the concat below aligned row by row.
OH_df_cat = pd.DataFrame(OH_values, columns=OH_columns, index=df.index)

df_cat_num = pd.concat([df[num_features], OH_df_cat], axis=1)

In [None]:
from sklearn.model_selection import train_test_split

# Binary target: 0 for "<=50K", 1 for ">50K". 70/30 split with a fixed seed.
income_codes = {"<=50K": 0, ">50K": 1}
X_train, X_test, y_train, y_test = train_test_split(
    df_cat_num,
    df['income'].map(income_codes),
    train_size=0.7,
    test_size=0.3,
    random_state=0,
)

# Show the distinct raw income labels as the cell output.
df['income'].unique()

In [None]:
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import confusion_matrix

def evaluation(model):
    """Fit ``model`` on the training split and print its test-set metrics.

    Uses the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    The model's predictions are turned into 0/1 labels, then the mean absolute
    error and the confusion matrix against ``y_test`` are printed.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``; ``predict`` is
        expected to return a NumPy array.

    Returns
    -------
    None. The metrics are printed, not returned.
    """
    model.fit(X_train, y_train)
    # The models used here are regressors, so their raw output is not confined
    # to [0, 1]. Clip before rounding so that no prediction becomes a label
    # other than 0 or 1, which would add extra rows and columns to the
    # confusion matrix.
    y_pred = model.predict(X_test).clip(0, 1).round().astype(int)
    # scikit-learn metrics take (y_true, y_pred) in that order.
    print(mean_absolute_error(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

# Evaluate a random forest (fixed seed) and then a default XGBoost regressor;
# `model` ends up bound to the last one, as before.
for model in (RandomForestRegressor(random_state=1), XGBRegressor()):
    evaluation(model)


