# INCOME CLASSIFICATION

### Importing Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
# Single LabelEncoder instance; it is re-fitted for each categorical column further down.
label_encoder = preprocessing.LabelEncoder()
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

### Getting our Data

In [None]:
# Load the income-evaluation CSV into a DataFrame and display it
df = pd.read_csv(
    '../input/income-classification/income_evaluation.csv',
    encoding='unicode_escape',
)
df

###  Data Preprocessing

In [None]:
df.isna().any()  # per column: True if it contains at least one missing value

In [None]:
# no null values are present
# we need to do label encoding for columns that don't have int/float type values

In [None]:
# Print the column dtypes and non-null counts to see which columns are not numeric
df.info()

In [None]:
# Label-encode every non-numeric column (identified from the df.info() output).
# The column names carry a leading space; they are used that way throughout this notebook.
categorical_columns = [
    ' workclass', ' education', ' marital-status', ' occupation',
    ' relationship', ' race', ' sex', ' native-country', ' income',
]
for column in categorical_columns:
    # Encode the text values directly: casting to NumPy byte strings ('|S') first is
    # unnecessary and fails on any non-ASCII value.
    df[column] = label_encoder.fit_transform(df[column].astype(str))
df # getting final encoded dataframe

In [None]:
# List the column names (needed to select the feature columns for the VIF check)
df.columns

In [None]:
# checking vif (variance inflation factor) for every feature column; ' income' is left out
# NOTE(review): no intercept/constant column is added to the matrix before computing VIF -
# confirm that this is intended.
variables = df[['age', ' workclass', ' fnlwgt', ' education', ' education-num',
       ' marital-status', ' occupation', ' relationship', ' race', ' sex',
       ' capital-gain', ' capital-loss', ' hours-per-week', ' native-country']]
design_matrix = variables.values  # convert to an array once, not on every iteration
vif = pd.DataFrame({
    'VIF': [
        variance_inflation_factor(design_matrix, position)
        for position in range(design_matrix.shape[1])
    ],
    'Features': list(variables.columns),
})

vif

In [None]:
# now, we'll drop columns which have vif>10
high_vif_columns = [' education-num', ' race', ' hours-per-week', ' native-country']
df = df.drop(columns=high_vif_columns)
df

In [None]:
# removing all outliers: keep only the rows in which every column lies within
# 3 standard deviations of that column's mean
# NOTE(review): the filter also covers the label-encoded columns and ' income' - confirm
# that z-scoring those codes is intended.
z_scores = np.abs(stats.zscore(df))
within_three_sigma = (z_scores < 3).all(axis=1)
df = df[within_three_sigma]
df

### Data Visualization

In [None]:
# Pairwise correlation of all remaining columns, drawn as an annotated heatmap
corr = df.corr()
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(corr, annot=True, ax=ax)  # quantifying the relationship

### Splitting Data for Training and Testing

In [None]:
# Every column except the last is a feature; the last column is the target
# (presumably ' income' - verify the column order).
data = df.to_numpy()
X = data[:, :-1]
y = data[:, -1]
# splitting in the ratio 80:20
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

### Model

In [None]:
# Random forest with 50 trees; random_state is pinned so repeated runs give the same model
classifier = RandomForestClassifier(
    n_estimators=50,
    random_state=0,
)
classifier.fit(X_train, y_train)

### Making Predictions and Checking Accuracy

In [None]:
# Hard class predictions for the held-out test set
y_pred = classifier.predict(X_test)

In [None]:
# ROC AUC is a ranking metric, so score the predicted probability of the positive class
# (second column of predict_proba) instead of the thresholded 0/1 predictions.
roc_auc_score(y_test, classifier.predict_proba(X_test)[:, 1])

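# The ROC AUC score is about 0.7531 (75.31%) when computed from hard class predictions; for binary labels this equals the balanced accuracy, not the plain classification accuracy.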

### Results' Visualization

In [None]:
# Confusion matrix of the test-set predictions: rows are actual classes, columns are predicted classes
cm = confusion_matrix(y_test, y_pred)
cm

In [None]:
# Heatmap of the confusion matrix, titled with the classification accuracy
plt.figure(figsize=(6,6))
sns.heatmap(cm, annot=True, fmt=".0f", linewidths=0.5, square = True, cmap = 'Pastel1')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
# The title is labelled "Accuracy Score", so report the fraction of correct predictions;
# roc_auc_score is a different metric and must not be shown under that label.
accuracy = np.mean(y_pred == y_test)
all_sample_title = 'Accuracy Score: {0}'.format(accuracy)
plt.title(all_sample_title, size = 15)