# 1.Problem Statement

### This is a binary classification problem where the target variable is whether a person's income is lower than or equal to 50K (<=50K) or higher than 50K (>50K). 

### The models that will be used include LogisticRegression, DecisionTreeClassifier, RandomForestClassifier and GradientBoostingClassifier. The accuracy for each of the models will be evaluated and the best performing model will be selected as the final model.

# 2.Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import confusion_matrix,classification_report,plot_confusion_matrix

from imblearn.over_sampling import SVMSMOTE
from imblearn.under_sampling import RandomUnderSampler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier

from yellowbrick.model_selection import FeatureImportances

#### Ignore warnings

In [None]:
import warnings
warnings.filterwarnings('ignore')

# 3.Import Dataset

In [None]:
df = pd.read_csv('/kaggle/input/income-classification/income_evaluation.csv')

# 4.Overview of the dataset

In [None]:
df.head()

## Column names

In [None]:
df.columns

#### As the output above shows, the column names contain unnecessary whitespace, which may lead to errors when indexing the data in later steps. Below, the whitespace is removed from the column names.

#### Then, the hyphens in the column names have been replaced with underscores.

In [None]:
# Normalise the header from the file itself rather than overwriting it with a
# hard-coded, position-dependent list: strip the whitespace around each name,
# then swap hyphens for underscores. Every label stays attached to its own column.
df.columns = df.columns.str.strip().str.replace('-', '_', regex=False)

## Dataset with the formatted column names

In [None]:
df.head()

## Shape of the dataset

In [None]:
df.shape

## General information about the features present in the dataset

In [None]:
df.info()

## Overview of the numerical columns

In [None]:
df.describe().T

#### Creating a list of the categorical variables for easy indexing

In [None]:
# Names of the columns whose dtype is ``object``, in their original order.
categorical = [name for name, kind in df.dtypes.items() if kind == 'O']

print(categorical)

## Overview of the categorical variables

In [None]:
df[categorical].describe()

## Checking the dataset for any missing values

In [None]:
df.isnull().sum()

#### As seen in the output above, the dataset does not seem to have any missing data.

#### Creating a copy of the dataset for further processing

In [None]:
data = df.copy()

# 4.Exploratory Data Analysis

#### In the following steps, each feature will be studied individually to get a better understanding about the dataset.

#### The general trend of the exploration will be:
1. Distplot for numerical data
2. Checking unique categories in categorical data followed by a countplot 

In [None]:
# Pass the column through ``x=``: seaborn >= 0.12 treats the first positional
# argument as ``data``, not as the x variable.
sns.countplot(x=data['income'])

#### As seen in the plot above, there is a moderate imbalance in the two classes of the target variable.

In [None]:
# ``distplot`` is deprecated in seaborn. ``histplot`` on a density scale with a
# KDE overlay draws the same kind of figure.
sns.histplot(x=data['age'], kde=True, stat='density')

#### The distplot does not give much information about the age variable. To gain some more insight, bins of age groups are created manually. 

In [None]:
# Ten-year age bins. The labels are derived from the bin edges so the two cannot
# drift apart. ``pd.cut`` closes intervals on the right by default, so the
# label '20-30' means 20 < age <= 30.
bins = list(range(10, 91, 10))
labels = ['{}-{}'.format(low, high) for low, high in zip(bins[:-1], bins[1:])]
age_group = pd.cut(data['age'], bins=bins, labels=labels)

# observed=False keeps empty bins in the table and pins the behaviour across
# pandas versions, where the default for categorical groupers is changing.
freq_df = data.groupby(age_group, observed=False).size()
freq_df = freq_df.reset_index(name='count')

In [None]:
freq_df

In [None]:
plt.bar(freq_df['age'],freq_df['count'])

#### This plot is more informative than the previous distribution plot.
#### As seen from the plot, most of the subjects are concentrated between the ages 20 to 60.

In [None]:
data['workclass'].value_counts()

#### When checked previously, there were no missing values but '?' is present as observations in the data.

#### Replace '?' with 'Unknown'

In [None]:
# '?' stands in for a missing value in this column. regex=False forces a literal
# match: '?' is a regex quantifier and the ``regex`` default differs between
# pandas versions.
data['workclass'] = data['workclass'].str.replace('?', 'Unknown', regex=False)

In [None]:
# ``x=`` is required: seaborn >= 0.12 reads a positional argument as ``data``.
sns.countplot(x=data['workclass'])
# Rotate the tick labels so the category names do not overlap.
plt.xticks(rotation=90)

### As seen in the data, majority of the 'workclass' is 'Private'. Some of the categories have very few observations and lead to an unnecessary increase in the cardinality of the variable. The problem of cardinality will be addressed later.

In [None]:
# ``distplot`` is deprecated in seaborn. ``histplot`` on a density scale with a
# KDE overlay draws the same kind of figure.
sns.histplot(x=data['fnlwgt'], kde=True, stat='density')

#### I could not find more information about this variable anywhere so I don't exactly know what it represents.

In [None]:
data['education'].value_counts()

In [None]:
# ``x=`` is required: seaborn >= 0.12 reads a positional argument as ``data``.
sns.countplot(x=data['education'])
# Rotate the tick labels so the category names do not overlap.
plt.xticks(rotation=90)

### The variable 'education' also has a lot of categories with most of the data being concentrated in specific categories. This variable also has high cardinality.

In [None]:
data['education_num'].value_counts()

In [None]:
# ``x=`` is required: seaborn >= 0.12 reads a positional argument as ``data``.
sns.countplot(x=data['education_num'])

### The same trend is seen in the 'education_num' variable as most of the data is concentrated in specific categories.

In [None]:
data['marital_status'].value_counts()

In [None]:
# ``x=`` is required: seaborn >= 0.12 reads a positional argument as ``data``.
sns.countplot(x=data['marital_status'])
# Rotate the tick labels so the category names do not overlap.
plt.xticks(rotation=90)

### The 'marital_status' variable also exhibits the same trend. 

In [None]:
data['occupation'].value_counts()

#### This variable also contains '?' as observations.

#### Replacing '?' with unknown

In [None]:
# '?' stands in for a missing value in this column. regex=False forces a literal
# match: '?' is a regex quantifier and the ``regex`` default differs between
# pandas versions.
data['occupation'] = data['occupation'].str.replace('?', 'Unknown', regex=False)

In [None]:
# ``x=`` is required: seaborn >= 0.12 reads a positional argument as ``data``.
sns.countplot(x=data['occupation'])
# Rotate the tick labels so the category names do not overlap.
plt.xticks(rotation=90)

### The variable 'occupation' also exhibits high cardinality.

In [None]:
data['relationship'].value_counts()

In [None]:
# ``x=`` is required: seaborn >= 0.12 reads a positional argument as ``data``.
sns.countplot(x=data['relationship'])
# Rotate the tick labels so the category names do not overlap.
plt.xticks(rotation=90)

In [None]:
data['race'].value_counts()

In [None]:
# ``x=`` is required: seaborn >= 0.12 reads a positional argument as ``data``.
sns.countplot(x=data['race'])
# Rotate the tick labels so the category names do not overlap.
plt.xticks(rotation=90)

#### Most of the subjects are white.

In [None]:
data['sex'].value_counts()

In [None]:
# ``x=`` is required: seaborn >= 0.12 reads a positional argument as ``data``.
sns.countplot(x=data['sex'])

#### Male subjects are more than female subjects.

In [None]:
data['capital_loss'].value_counts().nlargest(15)

### Out of the 32561 observations, 31042 are 0.

In [None]:
data['capital_gain'].value_counts().nlargest(15)

### Out of the 32561 observations, 29849 are 0.

In [None]:
data['native_country'].value_counts()

In [None]:
# '?' stands in for a missing value in this column. regex=False forces a literal
# match: '?' is a regex quantifier and the ``regex`` default differs between
# pandas versions.
data['native_country'] = data['native_country'].str.replace('?', 'Unknown', regex=False)

In [None]:
# ``x=`` is required: seaborn >= 0.12 reads a positional argument as ``data``.
sns.countplot(x=data['native_country'])
# Rotate the tick labels so the category names do not overlap.
plt.xticks(rotation=90)

### The 'native_country' variable has 42 categories and most of the data is concentrated in the 'United States' category. This is a very high number and will affect the accuracy of the model. 

# 5.Baseline Models

### Models will be fit on the raw data to get a baseline for each model. This will help in understanding whether the models improve after performing some feature engineering.

#### Separating the data into dependent and independent variables

In [None]:
x = data.drop('income',axis = 1)
y = data['income']

### As some of the independent variables are categorical, they have to be converted into numerical data, because the models require numeric input for fitting. There are multiple ways to do this; here the 'pd.get_dummies()' function is used.

In [None]:
x_dummy = pd.get_dummies(x)

### Train-Test Split

In [None]:
x_train,x_test,y_train,y_test = train_test_split(x_dummy,y,test_size = 0.2,random_state = 0)

## Defining a function to evaluate the models

#### This is not a necessary step, but it makes the process easier because it avoids writing the same lines of code for every model.

In [None]:
def fit_model(model, x, y, x_eval=None, y_eval=None):
    """Fit ``model`` on the training data and print its evaluation metrics.

    Parameters
    ----------
    model
        Estimator exposing ``fit``, ``predict`` and ``score``.
    x, y
        Training features and training labels, passed to ``model.fit``.
    x_eval, y_eval
        Optional evaluation features and labels. When both are omitted, the
        module-level ``x_test`` / ``y_test`` are used, so existing
        three-argument calls behave exactly as before.

    Raises
    ------
    ValueError
        If only one of ``x_eval`` / ``y_eval`` is supplied; scoring features
        against labels from a different split would be meaningless.

    Returns
    -------
    None
        The accuracy, classification report and confusion matrix are printed.
    """
    if (x_eval is None) != (y_eval is None):
        raise ValueError("x_eval and y_eval must be provided together")
    if x_eval is None:
        # Backward-compatible default: evaluate on the notebook's current split.
        x_eval, y_eval = x_test, y_test

    divider = "------------------------------"

    model.fit(x, y)
    y_pred = model.predict(x_eval)

    print("Accuracy: ", model.score(x_eval, y_eval))
    print(divider)
    print("Classification Report")
    print(divider)
    print(classification_report(y_eval, y_pred))
    print(divider)
    print("Confusion Matrix")
    print(divider)
    print(confusion_matrix(y_eval, y_pred))
    print(divider)

## Logistic Regression

In [None]:
lr = LogisticRegression(max_iter = 1000)

fit_model(lr,x_train,y_train)

Logistic Regression gives an accuracy of 0.79.

The precision and recall for the majority class (<=50K) in the target variable are good but are not satisfactory for the minority class (>50K).

## Decision Tree Classifier

In [None]:
dtree = DecisionTreeClassifier()

fit_model(dtree,x_train,y_train)

Decision Tree gives an accuracy of 0.81.

The precision and recall for the majority class (<=50K) in the target variable are good, and there is an improvement for the minority class (>50K) compared to Logistic Regression.

## Random Forest Classifier

In [None]:
rf = RandomForestClassifier(random_state = 0)

fit_model(rf,x_train,y_train)

Random Forest gives an accuracy of 0.84.

There is also an improvement in the precision of the minority class (>50K).

## Gradient Boosting Classifier

In [None]:
gbm = GradientBoostingClassifier(random_state = 0)

fit_model(gbm,x_train,y_train)

Gradient Boosting Classifier gives an accuracy of 0.86.

There is also an improvement in the precision and recall.

### This was the performance of the models on the data without any transformations. After performing transformations on the data, the accuracy of the models is expected to increase.

# 6.Reducing the cardinality of variables

### During the initial analysis it was seen that some of the variables have many categories, with the data concentrated in a few of them and very little data in the rest. The presence of a large number of categories affects the accuracy of the model.

The variables with high cardinality were:
1. workclass
2. education
3. education_num
4. marital_status
5. occupation
6. race
7. native_country

In the following steps, the cardinality of the variables will be reduced individually.

All the categories with less than 5% of the data will be clubbed together as one category called 'Other'.

In [None]:
data1 = data.copy()

In [None]:
data1['workclass'].value_counts() / len(data1)

In [None]:
# Workclass levels to merge into the single 'Other' level.
rare_workclasses = ('State-gov', 'Self-emp-inc', 'Federal-gov', 'Without-pay', 'Never-worked')

for level in rare_workclasses:
    # Literal substring swap: any whitespace around the value is left untouched.
    data1['workclass'] = data1['workclass'].str.replace(level, 'Other', regex=False)

In [None]:
# Left: original categories. Right: categories after the reduction.
# ``x=`` is required: seaborn >= 0.12 reads a positional argument as ``data``.
fig, ax = plt.subplots(1, 2, figsize=(25, 10))
sns.countplot(x=data['workclass'], ax=ax[0])
sns.countplot(x=data1['workclass'], ax=ax[1])

'workclass' reduced from 9 categories to 5 categories.

In [None]:
# New label -> education levels that are relabelled to it. The groups are
# applied in the order written (first 'Non Graduate', then 'Other').
merged_levels = {
    'Non Graduate': ('11th', '9th', '7th-8th', '5th-6th', '10th', '1st-4th', 'Preschool', '12th'),
    'Other': ('Assoc-acdm', 'Assoc-voc', 'Doctorate', 'Prof-school'),
}

for new_label, old_labels in merged_levels.items():
    for old_label in old_labels:
        # Literal substring swap: any whitespace around the value is left untouched.
        data1['education'] = data1['education'].str.replace(old_label, new_label, regex=False)

In [None]:
# Left: original categories. Right: categories after the reduction.
# ``x=`` is required: seaborn >= 0.12 reads a positional argument as ``data``.
fig, ax = plt.subplots(1, 2, figsize=(25, 10))
sns.countplot(x=data['education'], ax=ax[0])
sns.countplot(x=data1['education'], ax=ax[1])

'education' reduced from 16 categories to 6 categories.

'education_num' has categories from 1 to 16. These will be binned into 4 separate groups, each containing 4 consecutive categories.

In [None]:
# Map each education_num code (1..16) to the label of its four-wide band:
# 1-4 -> '1-4', 5-8 -> '5-8', 9-12 -> '9-12', 13-16 -> '13-16'.
band_labels = ('1-4', '5-8', '9-12', '13-16')
code_to_band = {code: band_labels[(code - 1) // 4] for code in range(1, 17)}

# One whole-value replacement covers all sixteen codes.
data1['education_num'] = data1['education_num'].replace(code_to_band)

In [None]:
# Left: original categories. Right: categories after the reduction.
# ``x=`` is required: seaborn >= 0.12 reads a positional argument as ``data``.
fig, ax = plt.subplots(1, 2, figsize=(25, 10))
sns.countplot(x=data['education_num'], ax=ax[0])
sns.countplot(x=data1['education_num'], ax=ax[1])

'education_num' reduced from 16 categories to 4 categories.

In [None]:
data1['marital_status'].value_counts() / len(data1)

In [None]:
# Marital-status levels to merge into the single 'Other' level.
rare_statuses = ('Married-spouse-absent', 'Separated', 'Married-AF-spouse', 'Widowed')

for level in rare_statuses:
    # Literal substring swap: any whitespace around the value is left untouched.
    data1['marital_status'] = data1['marital_status'].str.replace(level, 'Other', regex=False)

In [None]:
# Left: original categories. Right: categories after the reduction.
# ``x=`` is required: seaborn >= 0.12 reads a positional argument as ``data``.
fig, ax = plt.subplots(1, 2, figsize=(25, 10))
sns.countplot(x=data['marital_status'], ax=ax[0])
sns.countplot(x=data1['marital_status'], ax=ax[1])

'marital_status' reduced from 7 categories to 4 categories.

In [None]:
data1['occupation'].value_counts() / len(data1)

In [None]:
# Occupation levels to merge into the single 'Other' level.
rare_occupations = ('Handlers-cleaners', 'Transport-moving', 'Farming-fishing', 'Tech-support',
                    'Protective-serv', 'Armed-Forces', 'Priv-house-serv')

for level in rare_occupations:
    # Literal substring swap: any whitespace around the value is left untouched.
    data1['occupation'] = data1['occupation'].str.replace(level, 'Other', regex=False)

In [None]:
# Left: original categories. Right: categories after the reduction.
# ``x=`` is required: seaborn >= 0.12 reads a positional argument as ``data``.
fig, ax = plt.subplots(1, 2, figsize=(25, 10))
sns.countplot(x=data['occupation'], ax=ax[0])
sns.countplot(x=data1['occupation'], ax=ax[1])

'occupation' reduced from 15 categories to 9 categories.

In [None]:
data1['race'].value_counts() / len(data1)

In [None]:
# Fold these two race levels into the 'Other' level. 'Other' itself is not
# listed: replacing it with 'Other' would be a wasted pass over the column.
for level in ('Asian-Pac-Islander', 'Amer-Indian-Eskimo'):
    # Literal substring swap: any whitespace around the value is left untouched.
    data1['race'] = data1['race'].str.replace(level, 'Other', regex=False)

In [None]:
# Left: original categories. Right: categories after the reduction.
# ``x=`` is required: seaborn >= 0.12 reads a positional argument as ``data``.
fig, ax = plt.subplots(1, 2, figsize=(25, 10))
sns.countplot(x=data['race'], ax=ax[0])
sns.countplot(x=data1['race'], ax=ax[1])

'race' reduced from 5 categories to 3 categories.

In [None]:
data1['native_country'].value_counts() / len(data1)

In [None]:
# Country names grouped by the region label that replaces them.
# Matching is by literal substring.
na = ['Cuba', 'Jamaica', 'Puerto-Rico', 'Honduras', 'Haiti', 'Dominican-Republic', 'El-Salvador',
      'Guatemala', 'Nicaragua', 'United-States', 'Mexico', 'Canada']
sa = ['Trinadad&Tobago', 'Columbia', 'Ecuador', 'Peru']
ai = ['India', 'South', 'Iran', 'Philippines', 'Cambodia', 'Thailand', 'Laos', 'Taiwan', 'China',
      'Japan', 'Vietnam', 'Hong']
eu = ['England', 'Germany', 'Italy', 'Poland', 'Portugal', 'France', 'Yugoslavia', 'Scotland',
      'Greece', 'Ireland', 'Hungary', 'Holand-Netherlands']

for country in na:
    data1['native_country'] = data1['native_country'].str.replace(country, 'NAmerica', regex=False)

# The outlying-US value is matched as a whole value (Series.replace) after
# stripping surrounding whitespace, then folded into 'NAmerica'. The strip
# applies to every value in the column, not only to this one.
stripped = data1['native_country'].str.strip()
data1['native_country'] = stripped.replace('Outlying-US(Guam-USVI-etc)', 'Outlying-US')
data1['native_country'] = data1['native_country'].str.replace('Outlying-US', 'NAmerica', regex=False)

# The remaining regions, applied in the same order as before.
for region, countries in (('SAmerica', sa), ('Asia', ai), ('Europe', eu)):
    for country in countries:
        data1['native_country'] = data1['native_country'].str.replace(country, region, regex=False)

In [None]:
data1.rename(columns = {'native_country':'region'}, inplace = True) 

In [None]:
# Left: original countries. Right: the reduced 'region' column.
# ``x=`` is required: seaborn >= 0.12 reads a positional argument as ``data``.
fig, ax = plt.subplots(1, 2, figsize=(25, 10))
sns.countplot(x=data['native_country'], ax=ax[0])
sns.countplot(x=data1['region'], ax=ax[1])

'native_country' reduced from 42 categories to 5 categories

# 7.Evaluating models on updated dataset

In [None]:
x = data1.drop('income',axis = 1)
y = data1['income']

In [None]:
x_dummy = pd.get_dummies(x)

In [None]:
x_train,x_test,y_train,y_test = train_test_split(x_dummy,y,test_size = 0.2,random_state = 0)

## Logistic Regression

In [None]:
lr = LogisticRegression(max_iter = 1000)

fit_model(lr,x_train,y_train)

#### The accuracy of the model improved from 0.79 to 0.83 with an increase in the precision and recall of the minority class (>50K) too.

In [None]:
dtree = DecisionTreeClassifier()

fit_model(dtree,x_train,y_train)

#### The accuracy of the decision tree decreased by a small margin but it can be improved by tuning the hyperparameters.

In [None]:
rf = RandomForestClassifier(random_state = 0)

fit_model(rf,x_train,y_train)

#### The accuracy of random forest also decreased by a small margin but it can be increased.

In [None]:
gbm = GradientBoostingClassifier(random_state = 0)

fit_model(gbm,x_train,y_train)

#### The accuracy of gradient boosting classifier improved with an improvement in the precision of the minority class (>50K).

### After reducing the cardinality of the variables:
1. Accuracy of Logistic Regression improved significantly
2. Accuracy of Decision Tree Classifier reduced by a small margin
3. Accuracy of Random Forest Classifier reduced by a small margin
4. Accuracy of Gradient Boosting Classifier improved by a small margin

# 8.Hyperparameter Tuning

## Logistic Regression

 #### Checking for the best value of solver

In [None]:
#param_grid = {'solver':['newton-cg','liblinear','lbfgs']}

In [None]:
#lr = LogisticRegression(max_iter = 1000)

#gs = GridSearchCV(lr,param_grid,cv = 5,scoring = 'accuracy',n_jobs = -1,verbose = True)

#gs.fit(x_train,y_train)

In [None]:
#gs.best_params_

#### 'newton-cg' is the best performing solver

#### Checking for the best value of penalty and C

In [None]:
#param_grid = {'penalty':['l1','l2'],
              #'C':[100.0,10.0,1.0,0.1,0.01]
    
#}

In [None]:
#lr = LogisticRegression(solver = 'newton-cg',penalty = 'l2',max_iter = 1000)

#gs = GridSearchCV(lr,param_grid,cv = 5,scoring = 'accuracy',n_jobs = -1,verbose = True)

#gs.fit(x_train,y_train)

In [None]:
#gs.best_params_

#### Best performing values of 'C' and 'penalty' are 0.1 and l2 respectively.

In [None]:
lr = LogisticRegression(C = 0.1,solver = 'newton-cg',penalty = 'l2',max_iter = 1000)

fit_model(lr,x_train,y_train)

#### The accuracy of Logistic Regression improved from 0.832 to 0.846 after tuning the hyperparameters.

## Decision Tree Classifier

In [None]:
#param_grid = {'criterion':['gini','entropy'],
              #'splitter':['best','random'],
              #'max_features':['auto','sqrt','log2'],
              #'max_depth': np.arange(2,7,1),
              #'min_samples_split': np.arange(2,10,1),
              #'min_samples_leaf': np.arange(2,7,1)
#}

In [None]:
#dtree = DecisionTreeClassifier()

#gs = GridSearchCV(dtree,param_grid,cv = 5,scoring = 'accuracy',n_jobs = -1,verbose = True)

#gs.fit(x_train,y_train)

In [None]:
#gs.best_params_

In [None]:
# 'sqrt' is what the former 'auto' alias meant for classification trees; the
# alias was deprecated in scikit-learn 1.1 and removed in 1.3.
# random_state pins the random feature sub-sampling so the score is
# reproducible, matching the seeded random forest and gradient boosting models.
dtree = DecisionTreeClassifier(criterion='gini', max_depth=6, max_features='sqrt',
                               min_samples_leaf=4, min_samples_split=5,
                               splitter='best', random_state=0)

fit_model(dtree, x_train, y_train)

#### The accuracy of Decision Tree Classifier improved from 0.807 to 0.823 after tuning the hyperparameters.

## Random Forest Classifier

In [None]:
#param_grid = {'criterion':['gini','entropy'],
              #'bootstrap': [True,False],
              #'n_estimators':[10,100,200,500,1000],
              #'max_features':['auto','sqrt','log2'],
              #'max_depth': [2,3,4,5,6,7,None],
              #'min_samples_split': np.arange(2,10,1),
              #'min_samples_leaf': np.arange(2,7,1)
#}

In [None]:
#rf = RandomForestClassifier(random_state = 0)

#gs = GridSearchCV(rf,param_grid,cv = 5,scoring = 'accuracy',n_jobs = -1,verbose = True)

#gs.fit(x_train,y_train)

In [None]:
#gs.best_params_

In [None]:
rf = RandomForestClassifier(bootstrap =True,criterion = 'entropy',max_depth = None,min_samples_leaf = 2,min_samples_split = 100,
                            max_features = 17,n_estimators = 10,random_state = 0)

fit_model(rf,x_train,y_train)

#### The accuracy of Random Forest Classifier improved from 0.847 to 0.858 after tuning the hyperparameters.

## Gradient Boosting Classifier

In [None]:
#param_grid = {'n_estimators':range(20,81,10),
              #'max_depth':range(5,16,2),
              #'min_samples_split':range(1000,2100,200),
              #'min_samples_leaf':range(30,71,10),
              #'max_features':list(range(7,20,2)) + [None],
              #'subsample':[0.6,0.7,0.75,0.8,0.85,0.9]
             #}

In [None]:
#gbm = GradientBoostingClassifier(n_estimators = 80,max_depth = 13,min_samples_split = 1000,min_samples_leaf = 30,max_features = None,
                                 #random_state = 0)

#gs = GridSearchCV(gbm,param_grid,cv = 5,scoring = 'accuracy',n_jobs = -1,verbose = True)

#gs.fit(x_train,y_train)

In [None]:
#gs.best_params_

In [None]:
gbm = GradientBoostingClassifier(n_estimators = 80,max_depth = 13,min_samples_split = 1000,min_samples_leaf = 30,max_features = None,
                                 random_state = 0)

fit_model(gbm,x_train,y_train)

#### The accuracy of Gradient Boosting Classifier increased from 0.861 to 0.865 after tuning the hyperparameters. 

# 9.Conclusion

#### The best performing model among the 4 models was the Gradient Boosting Classifier with an accuracy of 0.865.

#### Classification report and confusion matrix of the best performing model 

In [None]:
# Predict once and reuse the result for both the plot and the report.
final_pred = gbm.predict(x_test)

# Build the plot from ``confusion_matrix``: ``plot_confusion_matrix`` was
# removed in scikit-learn 1.2, and wrapping it in ``print`` only echoed the
# display object's repr.
cm = confusion_matrix(y_test, final_pred, labels=gbm.classes_)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=gbm.classes_, yticklabels=gbm.classes_)
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

print(classification_report(y_test, final_pred))

### Feature Importance

In [None]:
plt.rcParams['figure.figsize'] = (12,8)
plt.style.use("ggplot")

# Only the non-default hyperparameters are spelled out. The previous call
# echoed a full estimator repr, including arguments that newer scikit-learn
# no longer accepts (``presort``, ``min_impurity_split``) or has renamed
# (``loss='deviance'``). All of those were at their default values, so
# dropping them leaves the model unchanged.
# NOTE(review): max_depth=5 / learning_rate=0.08 differ from the tuned model
# evaluated earlier (max_depth=13, default learning rate) -- confirm which
# model this plot is meant to describe.
gbm = GradientBoostingClassifier(learning_rate=0.08, max_depth=5,
                                 min_samples_leaf=30, min_samples_split=1000,
                                 n_estimators=80, random_state=0)

# Fit the model inside the visualizer and plot the feature importances.
viz = FeatureImportances(gbm)
viz.fit(x_train, y_train)
viz.show();

#### I hope you found the kernel useful. Any suggestions or improvements are welcome.