In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input directory,
# then print them one per line.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_file_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used
# below, so outdated arguments fail silently until they are removed — consider
# narrowing the filter.
warnings.filterwarnings('ignore')

In [None]:
# Load the income-evaluation CSV from the Kaggle input directory into a DataFrame.
data=pd.read_csv("/kaggle/input/income-classification/income_evaluation.csv")

In [None]:
# Column names, dtypes and non-null counts of the freshly loaded frame.
data.info()

In [None]:
# Preview the first rows of the frame.
data.head()

In [None]:
# Remove leading/trailing whitespace from every column name.
data.columns = [column_name.strip() for column_name in data.columns]

In [None]:
# Inspect the distinct raw labels (including any stray whitespace) before encoding them.
# .unique() lists every spelling once, whereas the repr of the full value array is
# truncated and may hide some of them.
data['income'].unique()

In [None]:
# Encode the target as 1 (income above 50K) or 0 (anything else).
# Whitespace is stripped before comparing, so the match no longer depends on the
# exact leading space in the raw label (' >50K').
# The dtype guard keeps the cell idempotent: re-running it on the already encoded
# 0/1 column would otherwise turn every row into 0.
if not pd.api.types.is_numeric_dtype(data['income']):
    data['income'] = data['income'].str.strip().eq('>50K').astype(int)

In [None]:
# Number of rows in each income class.
sns.countplot(x='income', data=data)

In [None]:
# Re-check the column dtypes after the 'income' column has been re-assigned.
data.info()

In [None]:
# Pairwise scatter plots / histograms of the numeric columns.
# `height` is the current name of the per-facet size argument; the old `size`
# keyword was renamed in seaborn and is not accepted by recent releases.
sns.pairplot(data, height=2.5)
plt.show()

In [None]:
# Missing data check

# 1) Real NaN values per column.
print(data.isnull().sum())

# 2) isnull() cannot see placeholder tokens: a '?' entry is an ordinary string.
#    Count them separately in the non-numeric columns so that "no NaN" is not
#    mistaken for "no missing data".
# NOTE(review): '?' is assumed to be the placeholder used by this file — confirm
# against the raw CSV.
text_columns = data.select_dtypes(exclude='number')
print(text_columns.apply(lambda column: column.astype(str).str.strip().eq('?').sum()))

In [None]:
# Names of the numeric columns
num_col = data.select_dtypes(include='number').columns
num_col

In [None]:
# Names of the object-dtype (categorical) columns
cat_col = data.select_dtypes(include='object').columns
cat_col

In [None]:
# One bar chart per numeric feature, comparing the two income classes.
# A single loop replaces five copy-pasted cells, so every chart gets the same
# figure size and a title; a fresh figure per feature keeps the charts from
# being drawn on top of each other.
for feature in ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']:
    fig, ax = plt.subplots(figsize=(5, 3))
    sns.barplot(x='income', y=feature, data=data, ax=ax)
    ax.set_title('{} by income class'.format(feature))
    plt.show()

In [None]:
from sklearn.preprocessing import LabelEncoder

# A single encoder instance is re-fitted on each categorical column in turn;
# every column is overwritten in place with its integer codes.
label_encoder = LabelEncoder()
for col in cat_col:
    encoded_values = label_encoder.fit_transform(data[col])
    data[col] = encoded_values
data.head()

In [None]:
# Target vector first, then the feature matrix with the target column removed.
y = data['income']
X = data.drop(columns=['income'])

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Baseline random forest with 200 trees.
# random_state is fixed (same seed as the train/test split) so the fitted model,
# its predictions and every metric reported below are reproducible on
# Restart & Run All.
rfc = RandomForestClassifier(n_estimators=200, random_state=42)
rfc.fit(X_train, y_train)
pred_rfc = rfc.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score

# Share of test rows whose predicted class matches the true label.
test_accuracy = accuracy_score(y_true=y_test, y_pred=pred_rfc)
print(test_accuracy)

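The Random Forest model reaches an accuracy of about 85.7% on the held-out test set (the exact figure can vary slightly between runs unless the model's `random_state` is fixed).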

In [None]:
#Let's see how our model performed
# Text summary of the per-class metrics, comparing the test labels with the predictions.
print(classification_report(y_test, pred_rfc))

In [None]:
#Confusion matrix for the random forest classification
# Rows are the true classes and columns the predicted classes (scikit-learn convention).
print(confusion_matrix(y_test, pred_rfc))

In [None]:
# 1. MDI (Mean Decrease in Impurity) importance, as exposed by the fitted forest,
#    labelled with the training feature names
importances = pd.Series(rfc.feature_importances_, index=X_train.columns)

# Ascending order, so the largest bar ends up at the top of the horizontal chart
importances_sorted = importances.sort_values()

# Horizontal bar chart of the sorted importances
importances_sorted.plot.barh(color='lightgreen')
plt.title('Features Importances')
plt.show()

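According to the MDI ranking above, `fnlwgt` is the most important feature for predicting income. MDI tends to favour continuous, high-cardinality columns, so this ranking should be cross-checked against the permutation importance computed below.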

In [None]:
#2. Permutation Importance
from sklearn.inspection import permutation_importance

# NOTE(review): the importances are measured on the training split; evaluating
# on X_test / y_test instead would show which features matter for generalisation.
result = permutation_importance(rfc, X_train, y_train, n_repeats=10,
                                random_state=42)

# Label both panels with the columns the model was actually trained on.
# (Indexing data.columns only worked because the dropped target is the last column.)
feature_names = X_train.columns

perm_sorted_idx = result.importances_mean.argsort()

tree_importance_sorted_idx = np.argsort(rfc.feature_importances_)
tree_indices = np.arange(0, len(rfc.feature_importances_)) + 0.5

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))

# Left panel: impurity-based (MDI) importances, smallest at the bottom.
ax1.barh(tree_indices,
         rfc.feature_importances_[tree_importance_sorted_idx], height=0.7)
# Tick positions must be fixed before the labels are attached to them.
ax1.set_yticks(tree_indices)
ax1.set_yticklabels(feature_names[tree_importance_sorted_idx])
ax1.set_ylim((0, len(rfc.feature_importances_)))
ax1.set_title('MDI importance')

# Right panel: distribution of the permutation importances over the repeats.
# Labels are attached after drawing, which avoids the boxplot label keyword
# whose name differs between matplotlib versions.
ax2.boxplot(result.importances[perm_sorted_idx].T, vert=False)
ax2.set_yticklabels(feature_names[perm_sorted_idx])
ax2.set_title('Permutation importance (training set)')

fig.tight_layout()
plt.show()

# GridSearchCV

In [None]:
# Base estimator and hyper-parameter grid for the grid search.
# Note: this rebinds `rfc` to a new, unfitted forest (the grid search fits its own
# clones). random_state is fixed so the search results are reproducible.
rfc = RandomForestClassifier(n_estimators=200, random_state=42)

# 'auto' is intentionally not part of max_features: for classifiers it was an
# alias of 'sqrt' (so it only duplicated work) and recent scikit-learn releases
# reject it outright.
params_rf = {'n_estimators':[100, 350, 500],
             'max_features':['log2', 'sqrt'],
             'min_samples_leaf':[2, 10, 30]
            }

In [None]:
# Import GridSearchCV
from sklearn.model_selection import GridSearchCV

# Search over params_rf, scored by accuracy with 3-fold cross-validation
search_settings = {
    'scoring': 'accuracy',
    'cv': 3,
    'verbose': 1,
    'n_jobs': -1,
}
grid_rf = GridSearchCV(estimator=rfc, param_grid=params_rf, **search_settings)

In [None]:
# Fit the grid search on the training split (this cell does not predict anything)

grid_rf.fit(X_train, y_train)

In [None]:
# Report the best cross-validated score and the parameter combination that produced it
summary_lines = (
    ("The best score is {}", grid_rf.best_score_),
    ("The best hyper parameter setting is {}", grid_rf.best_params_),
)
for template, value in summary_lines:
    print(template.format(value))