In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Gather the full path of every file found under the dataset directory,
# then print the paths one per line.
input_paths = [
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input/income')
    for name in names
]
for path in input_paths:
    print(path)

In [None]:
# Read train.csv from the Kaggle `income` input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/income/train.csv')

In [None]:
# Show the column labels of the freshly loaded frame.
df.columns

In [None]:
# Keep an independent copy of the frame as loaded, before any column is renamed.
df_orig = df.copy()

In [None]:
# Preview the first rows of the loaded data.
df.head()

In [None]:
# Give the target column a friendlier label. errors='raise' makes the cell
# fail loudly if the expected source column is absent.
df = df.rename(columns={"income_>50K": "income >50K"}, errors='raise')

### LIBRARY IMPORTS

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from xgboost import XGBRegressor, XGBClassifier
from sklearn.metrics import mean_absolute_error, accuracy_score, classification_report,confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix,accuracy_score,roc_curve,classification_report
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV,KFold,StratifiedKFold
import pandas_profiling as pp
import warnings
warnings.filterwarnings('ignore')
import missingno as msno #Visualize null

# Use seaborn's 'ticks' style for all following plots (axis ticks, no grid).
sns.set_style('ticks') #No grid with ticks
# Print the installed seaborn version so it is recorded in the cell output.
print(sns.__version__)

In [None]:
# Summarise column dtypes and non-null counts.
df.info()

In [None]:
# Column labels after the target column was renamed.
df.columns

In [None]:
# Per-column flag: True where the column contains at least one missing value.
df.isna().any()

# DATA PREPARATIONS

In [None]:
# Column dtypes and non-null counts, as the starting point for data preparation.
df.info()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Columns whose value distribution is inspected below.
cols = [
    'age', 'workclass', 'education', 'educational-num', 'marital-status',
    'occupation', 'relationship', 'race', 'capital-gain',
    'capital-loss', 'hours-per-week', 'native-country', 'income >50K',
]

# Print the frequency table of each listed column, one after another.
for column in cols:
    counts = df[column].value_counts()
    print(counts)

In [None]:
# Drop every row that has at least one missing value; `df` itself is left unchanged.
new_data = df.dropna()
# Confirm the remaining row count and dtypes.
new_data.info()

In [None]:
# Missing-value count per column after dropna.
new_data.isna().sum()

In [None]:
# Column dtypes of the cleaned frame.
new_data.dtypes

In [None]:
labelencoder = LabelEncoder()

df_max_scaled = new_data.copy()

## FEATURE ENGINEERING

# Coarse groupings for the high-cardinality categorical columns, laid out as
# {column: {group label: [raw values folded into that group]}}.
# Raw values that are not listed in any group are left unchanged.
category_groups = {
    'workclass': {
        'Self-emp': ['Self-emp-not-inc', 'Self-emp-inc'],
        'Un-emp': ['Never-worked', 'Without-pay'],
        'Government': ['State-gov', 'Federal-gov', 'Local-gov'],
    },
    'education': {
        '<HS': ['12th', '7th-8th', '9th', '10th', '11th', '5th-6th', '1st-4th', 'Preschool'],
        'Associate': ['Assoc-voc', 'Assoc-acdm', 'Some-college'],
        'Mas-Prof': ['Masters', 'Prof-school'],
    },
    'marital-status': {
        'Sep-Div': ['Divorced', 'Separated'],
        'Married': ['Married-civ-spouse', 'Married-spouse-absent', 'Married-AF-spouse'],
    },
    'relationship': {
        'Married': ['Husband', 'Wife'],
    },
    'native-country': {
        # 'Italy' was previously misspelled 'Italty', so those rows were never grouped.
        'Europe': ['England', 'Italy', 'Germany', 'France', 'Yugoslavia', 'Poland', 'Greece',
                   'Ireland', 'Scotland', 'Hungary', 'Holand-Netherlands', 'Portugal'],
        'Asia': ['China', 'Philippines', 'Vietnam', 'Thailand', 'Taiwan', 'Laos', 'Cambodia',
                 'Japan', 'Hong', 'India', 'Iran'],
        'Carribean': ['Jamaica', 'Dominican-Republic', 'Cuba', 'Haiti', 'Trinadad&Tobago',
                      'Puerto-Rico'],
        'N.America': ['United-States', 'Canada'],
        'C.America': ['Mexico', 'Honduras', 'El-Salvador', 'Guatemala', 'Nicaragua'],
        'S.America': ['Columbia', 'Ecuador', 'Peru'],
    },
}

for column, groups in category_groups.items():
    for group_label, raw_values in groups.items():
        df_max_scaled[column] = df_max_scaled[column].replace(raw_values, group_label)

df_max_scaled = df_max_scaled.astype({
    'workclass' : 'category',
    'education' : 'category',
    'marital-status' : 'category',
    'relationship' : 'category',
    'native-country' : 'category',
    'race' : 'category',
    'occupation' : 'category'
})

# Every column that is not int64/float64 is treated as categorical.
cat_cols = [col for col in df_max_scaled.columns
            if df_max_scaled[col].dtype not in ['int64', 'float64']]

# Label-encode each categorical column into a new "<name>-cat" integer column,
# then discard the original text/category columns.
for col in cat_cols:
    df_max_scaled[col + "-cat"] = labelencoder.fit_transform(df_max_scaled[col])

df_max_scaled = df_max_scaled.drop(columns=cat_cols)

# Use the same explicit dtype names as the categorical test above, rather than
# the platform-dependent 'int'/'float' aliases.
# NOTE: this list also contains the encoded "-cat" columns and the target.
num_cols = [col for col in df_max_scaled.columns
            if df_max_scaled[col].dtype in ['int64', 'float64']]

# Max-abs scaling: divide each numeric column by its largest absolute value.
for col in num_cols:
    max_abs = df_max_scaled[col].abs().max()
    if max_abs == 0:
        # An all-zero column would turn into NaN (0/0); leave it untouched.
        continue
    df_max_scaled[col] = df_max_scaled[col] / max_abs

df_max_scaled.head()

In [None]:
# Verify dtypes and non-null counts of the encoded and scaled frame.
df_max_scaled.info()

# EXPLORATORY DATA ANALYSIS

## UNIVARIATE

#### *~ NUMERIC DATA VS CATEGORICAL DATA*


##### numeric

In [None]:
# Partition the columns by dtype: int64/float64 columns are numeric,
# everything else is categorical.
num_cols, cat_cols = [], []
for column_name in df_max_scaled.columns:
    if df_max_scaled[column_name].dtype in ['int64', 'float64']:
        num_cols.append(column_name)
    else:
        cat_cols.append(column_name)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# One distribution panel per numeric column, two panels per row. The number of
# rows is derived from len(num_cols) so that no column is silently skipped
# (a fixed 7x2 grid would drop everything after the 14th column).
n_grid_cols = 2
n_grid_rows = (len(num_cols) + n_grid_cols - 1) // n_grid_cols  # ceiling division

plt.figure(figsize = (50, 30))

for plotnumber, column in enumerate(num_cols, start=1):
    ax = plt.subplot(n_grid_rows, n_grid_cols, plotnumber)
    sns.distplot(df_max_scaled[column])
    plt.xlabel(column, fontsize = 15)

plt.tight_layout()
plt.show()

In [None]:
def dist_box(data):
    """Plot a box plot stacked above a histogram for one numeric Series.

    The top panel is a horizontal box plot with the mean marked; the bottom
    panel is a histogram (no KDE). Vertical lines on the histogram mark the
    mean (red, dashed), the median (green) and the first mode (yellow), and a
    legend names them. The figure title is built from the upper-cased Series
    name.

    Parameters
    ----------
    data : pandas.Series
        Values to plot. Must have a string ``name`` and at least one value:
        the first entry of ``data.mode()`` is read, which raises IndexError
        on an empty Series.
    """
    Name = data.name.upper()
    fig, (ax_box, ax_dis) = plt.subplots(
        2, 1, gridspec_kw={"height_ratios": (.25, .75)}, figsize=(8, 5)
    )
    mean = data.mean()
    median = data.median()
    # mode() can return several values; only the first one is drawn.
    mode = data.mode().tolist()[0]
    fig.suptitle("SPREAD OF DATA FOR " + Name, fontsize=18, fontweight='bold')
    sns.boxplot(x=data, showmeans=True, orient='h', color="violet", ax=ax_box)
    ax_box.set(xlabel='')
    sns.distplot(data, kde=False, color='blue', ax=ax_dis)
    # Label each reference line explicitly so the legend does not depend on
    # the order in which matplotlib happens to enumerate the axes' artists.
    ax_dis.axvline(mean, color='r', linestyle='--', linewidth=2, label='Mean')
    ax_dis.axvline(median, color='g', linestyle='-', linewidth=2, label='Median')
    ax_dis.axvline(mode, color='y', linestyle='-', linewidth=2, label='Mode')
    ax_dis.legend()

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Draw the combined box plot and histogram for every numeric column.
for column in num_cols:
    dist_box(df_max_scaled[column])

In [None]:
# heatmap
# Pairwise correlations of all columns. The mask hides the upper triangle
# (diagonal included), so only the lower triangle is drawn.
corr = df_max_scaled.corr()
mask = np.triu(np.ones(corr.shape, dtype=bool))

plt.figure(figsize=(16, 7))
sns.heatmap(
    corr,
    mask=mask,
    annot=True,
    fmt='0.2g',
    linewidths=1,
)
plt.show()

In [None]:
# Full (unmasked) correlation matrix, annotated, on a yellow-green colour map.
plt.figure(figsize=(15, 5))
sns.heatmap(
    df_max_scaled.corr(),
    annot=True,
    cmap="YlGn",
)
plt.show()

In [None]:
## AGE VS INCOME

# Distribution of (scaled) age within each income class.
sns.boxplot(data=df_max_scaled, x='income >50K', y='age')

In [None]:
## CAPITAL GAIN VS INCOME

# Distribution of (scaled) capital gain within each income class.
sns.boxplot(data=df_max_scaled, x='income >50K', y='capital-gain')

In [None]:
## HOURS PER WEEK VS INCOME

# Distribution of (scaled) weekly working hours within each income class.
# The y variable was previously 'capital-gain', duplicating the cell above.
sns.boxplot(x=df_max_scaled['income >50K'],
              y=df_max_scaled['hours-per-week'])

# MODELLING

We will use the following algorithms:


1.   Random Forest Classifier
2.   Logistic Regression
3.   Naive Bayes
4.   Extreme Gradient Boosting (XGBoost)
5.   KNN (K - Nearest Neighbours)
6.   Decision Tree
7.   SVM





In [None]:
# Re-check the prepared frame (dtypes, non-null counts) before modelling.
df_max_scaled.info()

In [None]:
# Remove exact duplicate rows; `df_max_scaled` itself is left unchanged.
data = df_max_scaled.drop_duplicates()

In [None]:
# Preview the de-duplicated modelling frame.
data.head()

In [None]:
# Separate the target column from the features, then hold out 20% of the rows
# for testing (fixed seed so the split is repeatable).
X = data.drop(columns='income >50K')
y = data['income >50K']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [None]:
# Shape of the prepared frame *before* duplicate removal.
# NOTE(review): the split above is built from `data`, not `df_max_scaled` — confirm which shape was intended here.
df_max_scaled.shape

## *1. RANDOM FOREST CLASSIFIER*

In [None]:
## RANDOM FOREST

m1 = 'Random Forest Classfier'
# A small, shallow forest. random_state is fixed so that the fitted model and
# every score printed below are reproducible across runs.
rf = RandomForestClassifier(n_estimators=20, max_depth=5, random_state=0)
rf.fit(X_train,y_train)

# Hold-out evaluation on the test split.
rf_predicted = rf.predict(X_test)
rf_conf_matrix = confusion_matrix(y_test, rf_predicted)
rf_acc_score = accuracy_score(y_test, rf_predicted)
print("confussion matrix")
print(rf_conf_matrix)
print("\n")
print("Accuracy of Random Forest:",rf_acc_score*100,'\n')
print(classification_report(y_test,rf_predicted))

# 10-fold cross-validated accuracy on the training split (folds taken in row
# order, no shuffling); prints "<name>: <mean> (<std>)".
kfold = KFold(n_splits=10, random_state=None)
cv_results = cross_val_score(rf, X_train, y_train, cv=kfold, scoring='accuracy')
msg = "%s: %f (%f)" % (m1, cv_results.mean(), cv_results.std())
print(msg)

## *2. LOGISTIC REGRESSION*

In [None]:
## LOGISTIC REGRESSION

# Default logistic regression: fit on the training split, evaluate on the
# held-out split, then cross-validate on the training split.
m2 = 'Logistic Regression'
lr = LogisticRegression()
model = lr.fit(X_train, y_train)
lr_predict = lr.predict(X_test)

lr_conf_matrix = confusion_matrix(y_test, lr_predict)
lr_acc_score = accuracy_score(y_test, lr_predict)

print("confussion matrix", lr_conf_matrix, "\n", sep="\n")
print("Accuracy of Logistic Regression:", lr_acc_score * 100, '\n')
print(classification_report(y_test, lr_predict))

# 10 consecutive (unshuffled) folds; report mean and standard deviation.
kfold = KFold(n_splits=10)
cv_results = cross_val_score(lr, X_train, y_train, scoring='accuracy', cv=kfold)
msg = f"{m2}: {cv_results.mean():f} ({cv_results.std():f})"
print(msg)

## *3. NAIVE BAYES*

In [None]:
# Gaussian Naive Bayes: fit, evaluate on the held-out split, cross-validate.
m3 = 'Naive Bayes'
nb = GaussianNB()
nb.fit(X_train, y_train)
nbpred = nb.predict(X_test)

nb_conf_matrix = confusion_matrix(y_test, nbpred)
nb_acc_score = accuracy_score(y_test, nbpred)

print("confussion matrix", nb_conf_matrix, "\n", sep="\n")
print("Accuracy of Naive Bayes model:", nb_acc_score * 100, '\n')
print(classification_report(y_test, nbpred))

# 10 consecutive (unshuffled) folds; report mean and standard deviation.
kfold = KFold(n_splits=10)
cv_results = cross_val_score(nb, X_train, y_train, scoring='accuracy', cv=kfold)
msg = f"{m3}: {cv_results.mean():f} ({cv_results.std():f})"
print(msg)

## *4. EXTREME GB*

In [None]:
## XGBOOSTING

# Gradient-boosted trees (DART booster) with hand-picked hyperparameters:
# fit, evaluate on the held-out split, then cross-validate.
m4 = 'Extreme Gradient Boost'
xgb = XGBClassifier(
    learning_rate=0.01,
    n_estimators=25,
    max_depth=15,
    gamma=0.6,
    subsample=0.52,
    colsample_bytree=0.6,
    seed=27,
    reg_lambda=2,
    booster='dart',
    colsample_bylevel=0.6,
    colsample_bynode=0.5,
)
xgb.fit(X_train, y_train)
xgb_predicted = xgb.predict(X_test)

xgb_conf_matrix = confusion_matrix(y_test, xgb_predicted)
xgb_acc_score = accuracy_score(y_test, xgb_predicted)

print("confussion matrix", xgb_conf_matrix, "\n", sep="\n")
print("Accuracy of Extreme Gradient Boost:", xgb_acc_score * 100, '\n')
print(classification_report(y_test, xgb_predicted))

# 10 consecutive (unshuffled) folds; report mean and standard deviation.
kfold = KFold(n_splits=10)
cv_results = cross_val_score(xgb, X_train, y_train, scoring='accuracy', cv=kfold)
msg = f"{m4}: {cv_results.mean():f} ({cv_results.std():f})"
print(msg)

## *5. KNN*

In [None]:
## KNN

# 5-nearest-neighbours classifier: fit, evaluate on the held-out split,
# then cross-validate.
m5 = 'K-NeighborsClassifier'
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
knn_predicted = knn.predict(X_test)

knn_conf_matrix = confusion_matrix(y_test, knn_predicted)
knn_acc_score = accuracy_score(y_test, knn_predicted)

print("confussion matrix", knn_conf_matrix, "\n", sep="\n")
print("Accuracy of K-NeighborsClassifier:", knn_acc_score * 100, '\n')
print(classification_report(y_test, knn_predicted))

# 10 consecutive (unshuffled) folds; report mean and standard deviation.
kfold = KFold(n_splits=10)
cv_results = cross_val_score(knn, X_train, y_train, scoring='accuracy', cv=kfold)
msg = f"{m5}: {cv_results.mean():f} ({cv_results.std():f})"
print(msg)

## *6. DECISION TREE*

In [None]:
## DECISION TREE

# Entropy-based decision tree capped at depth 6 (no fixed random seed):
# fit, evaluate on the held-out split, then cross-validate.
m6 = 'DecisionTreeClassifier'
dt = DecisionTreeClassifier(criterion='entropy', max_depth=6)
dt.fit(X_train, y_train)
dt_predicted = dt.predict(X_test)

dt_conf_matrix = confusion_matrix(y_test, dt_predicted)
dt_acc_score = accuracy_score(y_test, dt_predicted)

print("confussion matrix", dt_conf_matrix, "\n", sep="\n")
print("Accuracy of DecisionTreeClassifier:", dt_acc_score * 100, '\n')
print(classification_report(y_test, dt_predicted))

# 10 consecutive (unshuffled) folds; report mean and standard deviation.
kfold = KFold(n_splits=10)
cv_results = cross_val_score(dt, X_train, y_train, scoring='accuracy', cv=kfold)
msg = f"{m6}: {cv_results.mean():f} ({cv_results.std():f})"
print(msg)

## *7. SVM*

In [None]:
# RBF-kernel support vector classifier with C=2: fit, evaluate on the
# held-out split, then cross-validate.
m7 = 'Support Vector Classifier'
svc = SVC(C=2, kernel='rbf')
svc.fit(X_train, y_train)
svc_predicted = svc.predict(X_test)

svc_conf_matrix = confusion_matrix(y_test, svc_predicted)
svc_acc_score = accuracy_score(y_test, svc_predicted)

print("confussion matrix", svc_conf_matrix, "\n", sep="\n")
print("Accuracy of Support Vector Classifier:", svc_acc_score * 100, '\n')
print(classification_report(y_test, svc_predicted))

# 10 consecutive (unshuffled) folds; report mean and standard deviation.
kfold = KFold(n_splits=10)
cv_results = cross_val_score(svc, X_train, y_train, scoring='accuracy', cv=kfold)
msg = f"{m7}: {cv_results.mean():f} ({cv_results.std():f})"
print(msg)

## *ENSEMBLING*

In [None]:
from mlxtend.classifier import StackingCVClassifier

In [None]:
# Stack logistic regression, KNN and the random forest as base learners, with
# the SVC as the meta-classifier that combines their predictions.
scv=StackingCVClassifier(classifiers=[lr,knn,rf],
                         meta_classifier= svc)

# Use plain NumPy arrays consistently for fitting, predicting and
# cross-validating the stacked model.
X_train_arr = np.asarray(X_train)
y_train_arr = np.asarray(y_train)
X_test_arr = np.asarray(X_test)

scv.fit(X_train_arr, y_train_arr)
scv_predicted = scv.predict(X_test_arr)
scv_conf_matrix = confusion_matrix(y_test, scv_predicted)
scv_acc_score = accuracy_score(y_test, scv_predicted)
print("confussion matrix")
print(scv_conf_matrix)
print("\n")
print("Accuracy of StackingCVClassifier:",scv_acc_score*100,'\n')
print(classification_report(y_test,scv_predicted))

# Cross-validate the stacked model itself. The earlier code scored `svc`
# here, which merely repeated the SVM result under the stacking label.
# This refits the whole stack once per fold, so it is slow.
kfold = KFold(n_splits=10)
cv_results = cross_val_score(scv, X_train_arr, y_train_arr, cv=kfold, scoring='accuracy')
msg = "%s: %f (%f)" % ('Stacking CV Cassifier', cv_results.mean(), cv_results.std())
print(msg)