In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score


import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the training part of the income data from the working directory.

income_train=pd.read_csv('income_data_train.csv')
# Preview the first rows only instead of rendering the whole frame.
income_train.head()

In [None]:
# (rows, columns) of the training frame.
income_train.shape

In [None]:
# Load the test part of the income data from the working directory.
income_test=pd.read_csv('income_data_test.csv')
# Preview the first rows only instead of rendering the whole frame.
income_test.head()

In [None]:
# (rows, columns) of the test frame.
income_test.shape

In [None]:
# Column names, non-null counts and dtypes of the test frame.
income_test.info()

# Creating income dataset.
1. Since the income train and test data have the same dimensions, they are combined into a single dataset.
2. Some of the columns contain `?` placeholders as well as stray spaces and trailing `.` characters, which need to be cleaned up.
3. Many of the required columns are of type `object` and need to be transformed into numeric values.

In [None]:
# Stack the train and test rows into one frame.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it the
# row labels of the two source frames repeat, and label-based lookups would match
# more than one row.
data=pd.concat([income_train,income_test], axis=0, ignore_index=True)

In [None]:
# (rows, columns) of the combined train + test frame.
data.shape

In [None]:
# Strip leading and trailing whitespace from every string value in the frame.
# The pattern is a raw string so that "\s" reaches the regex engine as written
# instead of being parsed as an (invalid) Python string escape.
data = data.replace(r'^\s+|\s+$', '', regex=True)


# Some income labels carry a trailing "."; map them onto the plain labels.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on data['income'] mutates a temporary Series and is not guaranteed to update `data`.
data['income'] = data['income'].replace({"<=50K.": "<=50K", ">50K.": ">50K"})

# Replace the "?" placeholder with a concrete category in each affected column.
# NOTE(review): the fill values below are fixed categories, not derived from the data;
# consider the column mode or a dedicated "Unknown" category instead.
data['workclass'] = data['workclass'].replace("?", 'State-gov')
data['occupation'] = data['occupation'].replace("?", 'Tech-support')
data['native-country'] = data['native-country'].replace("?", "India")


In [None]:
# Column dtypes and non-null counts after the clean-up step.
data.info()

In [None]:
# Number of records per workclass, split by income label.

workclass_fig, workclass_ax = plt.subplots(figsize=(12,10))
sns.countplot(data=data, x='workclass', hue='income', ax=workclass_ax)

In [None]:
# Number of records per occupation, split by income label.
occupation_fig, occupation_ax = plt.subplots(figsize=(25,5))
sns.countplot(data=data, x='occupation', hue='income', ax=occupation_ax)

In [None]:
# Replace every categorical column with integer codes.

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Columns are encoded in this order; the encoder is re-fitted for each one,
# so after the loop `le` holds the classes of the last column ('income').
columns_to_encode = [
    'workclass', 'education', 'marital-status',
    'occupation', 'relationship',
    'race', 'sex', 'native-country', 'income',
]

for column_name in columns_to_encode:
    data[column_name] = le.fit_transform(data[column_name])

In [None]:
# Column dtypes after encoding, to confirm the encoded columns are now numeric.
data.info()

In [None]:
# Absolute z-score of every value, computed per column; used below to detect outliers.

from scipy.stats import zscore

column_zscores = zscore(data)
z = np.abs(column_zscores)
z

In [None]:
# Locate every value whose absolute z-score exceeds the outlier threshold
# (in standard deviations); np.where returns the row and column index arrays.

threshold=3
print(np.where(z>threshold))

In [None]:
# Keep only the rows in which every column's absolute z-score is below the threshold.
# .copy() makes `income` an independent frame, so later column assignments on it
# are unambiguous and do not raise SettingWithCopyWarning.
income=data[(z<threshold).all(axis=1)].copy()
print("Removed", len(data)-len(income), "of", len(data), "records as outliers")
income.head()

Records with an absolute z-score of 3 or more in any column were removed from the original data, and the remaining records were stored in a new set (`income`).

In [None]:
# Pairwise correlation of all columns of the outlier-filtered frame, annotated to two decimals.
corr_fig, corr_ax = plt.subplots(figsize=(20,10))
sns.heatmap(income.corr(), annot=True, fmt=".2f", ax=corr_ax)
corr_fig.suptitle("Correlation Map", fontsize=18)
plt.show()

In [None]:
# Per-column skewness of the outlier-filtered frame, to spot strongly skewed columns.
income.skew()

In [None]:
# Reduce the skewness of the capital columns with boxcox1p, which transforms
# 1 + x and therefore handles zero-valued entries.

from scipy.special import boxcox1p
# lambda = 0  -> log transform
# lambda = .5 -> square root transform

# Work on an independent frame so the column assignments below never target a slice.
income = income.copy()

# Only the numeric feature columns are transformed. The label-encoded target
# 'income' is left as-is: transforming a class label does not change its skew
# and would turn the integer class codes into non-integer floats.
for skewed_column in ('capital-gain', 'capital-loss'):
    income[skewed_column] = boxcox1p(income[skewed_column], 0.5)

In [None]:
# Per-column skewness again, for comparison with the values before the transform.
income.skew()

In [None]:
# Setting x  by excluding income column which is y here for prediction.
x=data.drop(['income'],axis=1)
x

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

scale = StandardScaler()
#scale = MinMaxScaler()

x=scale.fit_transform(x)

In [None]:
# Settng Y
y=data['income']
y

In [None]:
# Try the split seeds 42..100: for each seed, split the data, fit a default
# LogisticRegression and score it on the held-out part, keeping the best seed.
# NOTE(review): selecting the seed by held-out accuracy biases the reported score
# upwards; a fixed seed plus cross-validation gives a fairer estimate.
max_acc_score=0
final_rstate=None  # bound up front so the report below cannot hit an undefined name
for r_state in range(42,101):
    train_x,test_x,train_y,test_y=train_test_split(x,y,test_size=.21,random_state=r_state)
    lg=LogisticRegression()
    lg.fit(train_x,train_y)
    pred=lg.predict(test_x)
    accuracyScore=accuracy_score(test_y,pred)
    # Strict comparison: on ties the earliest seed is kept.
    if(accuracyScore>max_acc_score):
        max_acc_score=accuracyScore
        final_rstate=r_state

print("\n\n")
print("Max_accuracy_Score corresponding to final_r_state: ",final_rstate," is ",max_acc_score)

In [None]:
# Re-create the train/test split with the seed selected by the search cell above.
# The variable is used instead of a hard-coded copy of its printed value, which
# would silently go stale whenever the data or the preprocessing changes.

train_x,test_x,train_y,test_y=train_test_split(x,y,test_size=.21,random_state=final_rstate)

In [None]:
# Shape of the training feature matrix.
train_x.shape

In [None]:
# Shape of the training target vector; its length should match train_x's row count.
train_y.shape

In [None]:
# Shape of the held-out feature matrix.
test_x.shape

In [None]:
# Shape of the held-out target vector; its length should match test_x's row count.
test_y.shape

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# One instance of every candidate classifier.
# Seeds are fixed on all estimators that use randomness so that a re-run of the
# notebook reproduces the same scores.
LR=LogisticRegression()
DT=DecisionTreeClassifier(random_state=10)  # the tree permutes features at each split; seed it
GNB=GaussianNB()
RFC=RandomForestClassifier(n_estimators=100,random_state=100)
ADC=AdaBoostClassifier(n_estimators=500,random_state=10)
GBC=GradientBoostingClassifier(n_estimators=500,random_state=10)

In [None]:
# (display name, estimator) pairs, in the order in which they are evaluated.
models = [
    ('LogisticRegression', LR),
    ('DecisionTreeClassifier', DT),
    ('GaussianNB', GNB),
    ('RandomForestClassifier', RFC),
    ('AdaBoostClassifier', ADC),
    ('GradientBoostingClassifier', GBC),
]

In [None]:
# Result columns, filled in model order by the loop below.
Model=[]
score=[]
cvs=[]
rocscore=[]

for name,model in models:
    print("--------------",name,"--------------")
    Model.append(name)

    # Fit on the training part and score on the held-out part.
    model.fit(train_x,train_y)
    print(model)
    pre=model.predict(test_x)
    holdout_accuracy=accuracy_score(test_y,pre)
    print("Accuracy Score: ", holdout_accuracy)
    score.append(holdout_accuracy*100)

    # Mean accuracy over 10 cross-validation folds of the full x / y.
    fold_accuracies=cross_val_score(model,x,y,cv=10,scoring='accuracy')
    mean_cv_accuracy=fold_accuracies.mean()
    print("Cross_Val_Score: ", mean_cv_accuracy)
    cvs.append(mean_cv_accuracy*100)

    # Confusion matrix of the held-out predictions.
    holdout_confusion=confusion_matrix(test_y,pre)
    print(holdout_confusion)
    print("\n")

In [None]:
# One row per model: held-out accuracy and mean cross-validation accuracy, both in percent.
result_columns = {
    "Model": Model,
    "Score": score,
    "Cross Val Score": cvs,
}
result = pd.DataFrame(result_columns)
result

# From the above table, GradientBoostingClassifier seems to be the best model, with over 87% accuracy.

In [None]:
# Saving the prediction data in a file.

# Predict with the selected model explicitly rather than reusing `pre`, which only
# holds the output of whichever model the evaluation loop happened to fit last.
predictData=pd.DataFrame({'predicted_income': GBC.predict(test_x)})
# Write the predictions themselves (not the input frame `data`) to the CSV file.
predictData.to_csv('income_Predict.csv', index=False)
predictData

In [None]:
# Saving the model

# joblib is imported as a standalone package; the copy formerly vendored as
# sklearn.externals.joblib is no longer shipped with current scikit-learn.
# The fallback keeps the cell working on old environments that only have the vendored copy.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Serialise the fitted gradient-boosting model to disk.
joblib.dump(GBC,"GBC_Income.pkl")