## United States Census (Income Prediction)

The aim is to build a predictive model that determines the income level of people in the US. Income levels are binned into two classes: at or below 50K (`<=50K`) and above 50K (`>50K`).

In [None]:
#Importing the required libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the census records from the local CSV file into a DataFrame.
df=pd.read_csv("adult.csv")
df  # last expression of the cell, so the notebook renders the frame

### DATA CLEANING

In [None]:
# Work on a copy in which the "?" placeholder is turned into a real missing
# value, so the pandas null-handling tools can see it.
a = df.replace(to_replace="?", value=np.nan)
a

In [None]:
# Count missing values per column (the original note reports nulls in three columns).
a.isnull().sum()

In [None]:
# Most frequent value of every column; used to choose the fill values below.
a.mode()

In [None]:
# Impute the three columns that contained "?" with each column's own most
# frequent value, computed from the data instead of being hard-coded, so the
# cell stays correct if the input file changes.
for _col in ["workclass", "occupation", "native-country"]:
    # Series.mode() skips NaN by default; [0] takes the first mode on ties.
    a[_col] = a[_col].fillna(a[_col].mode()[0])
a  # display the imputed frame

In [None]:
# Re-count missing values per column to confirm the imputation left none behind.
a.isnull().sum()
# Data set cleaned.

In [None]:
# Column names available in the cleaned data set.
a.columns

In [None]:
# Data type of every column (object columns are the categorical ones).
a.dtypes

### Visualization 

In [None]:
#separating the categorical and numerical data

In [None]:
# Names of the object-typed (categorical) columns.
_object_cols = a.select_dtypes(include=["object"])
categorical = _object_cols.columns
print(categorical)

In [None]:
# Names of the int64-typed (quantitative) columns.
_int_cols = a.select_dtypes(include=["int64"])
quantitative = _int_cols.columns
print(quantitative)

In [None]:
# Summary statistics for the quantitative columns.
a[quantitative].describe()

## plots of categorical data

In [None]:
# Income split per work class.
# The column is passed as `x=`: seaborn 0.12+ no longer accepts it positionally.
fig = plt.figure(figsize=(10,6))

sns.countplot(x='workclass', hue='income', data=a)
plt.tight_layout()
plt.show()

In [None]:
# NOTE(review): this cell repeats the work-class plot of the preceding cell.
# The column is passed as `x=`: seaborn 0.12+ no longer accepts it positionally.
fig = plt.figure(figsize=(10,6))

sns.countplot(x='workclass', hue='income', data=a)
plt.tight_layout()
plt.show()

In [None]:
# Income split per education level.
# The column is passed as `x=`: seaborn 0.12+ no longer accepts it positionally.
fig = plt.figure(figsize=(20,6))

sns.countplot(x='education', hue='income', data=a)
plt.tight_layout()
plt.show()

In [None]:
# Income split per marital status.
# The column is passed as `x=`: seaborn 0.12+ no longer accepts it positionally.
fig = plt.figure(figsize=(20,6))

sns.countplot(x='marital-status', hue='income', data=a)
plt.tight_layout()
plt.show()


In [None]:
# Income split per occupation.
# The column is passed as `x=`: seaborn 0.12+ no longer accepts it positionally.
fig = plt.figure(figsize=(20,6))

sns.countplot(x='occupation', hue='income', data=a)
plt.tight_layout()
plt.show()


In [None]:
# Income split per relationship category.
# The column is passed as `x=`: seaborn 0.12+ no longer accepts it positionally.
fig = plt.figure(figsize=(20,6))

sns.countplot(x='relationship', hue='income', data=a)
plt.tight_layout()
plt.show()


In [None]:
# Income split per race category.
# The column is passed as `x=`: seaborn 0.12+ no longer accepts it positionally.
fig = plt.figure(figsize=(20,6))

sns.countplot(x='race', hue='income', data=a)
plt.tight_layout()
plt.show()


In [None]:
# Income split per gender.
# The column is passed as `x=`: seaborn 0.12+ no longer accepts it positionally.
fig = plt.figure(figsize=(20,6))

sns.countplot(x='gender', hue='income', data=a)
plt.tight_layout()
plt.show()


In [None]:
# Income split per native country.
# The column is passed as `x=`: seaborn 0.12+ no longer accepts it positionally.
fig = plt.figure(figsize=(20,6))

sns.countplot(x='native-country', hue='income', data=a)
plt.tight_layout()
plt.show()


In [None]:
def income_to_numeric(x):
    """Encode an income label as an integer class.

    Returns 1 for '>50K', 0 for '<=50K', and None for any other value.
    """
    if x == '<=50K':
        return 0
    return 1 if x == '>50K' else None
    
# Replace the income labels with their 0/1 encoding, element by element.
# NOTE: re-running this cell on already-encoded data maps every value to None.
a['income'] = a['income'].map(income_to_numeric)
def gender_to_numeric(x):
    """Encode a gender label as an integer.

    Returns 1 for 'Male', 0 for 'Female', and None for any other value.
    """
    if x == 'Male':
        return 1
    if x == 'Female':
        return 0
    return None


# Backward-compatible alias: the helper was originally named after age,
# although it compares against 'Male'/'Female' and is applied to the gender
# column.
age_to_numeric = gender_to_numeric
    
# Replace the gender labels with their 0/1 encoding, element by element.
a['gender'] = a['gender'].map(age_to_numeric)
a

## EDA

In [None]:
# Distribution of weekly working hours within each income class.
sns.boxplot(data=a, x='income', y='hours-per-week')

In [None]:
# Distribution of age within each income class.
sns.boxplot(data=a, x='income', y='age')

In [None]:
# Distribution of the numeric education level within each income class.
sns.boxplot(data=a, x='income', y='educational-num')

In [None]:
# Distribution of the fnlwgt column within each income class.
sns.boxplot(data=a, x='income', y='fnlwgt')

In [None]:
# Distribution of capital gain within each income class.
sns.boxplot(data=a, x='income', y='capital-gain')

In [None]:
# Distribution of capital loss within each income class.
sns.boxplot(data=a, x='income', y='capital-loss')

In [None]:
# Encoded gender (0/1) within each income class; the column is referenced by
# name, like the other box plots.
sns.boxplot(data=a, x='income', y='gender')

In [None]:
# Bar height is seaborn's default aggregate (the mean) of the 0/1 income
# column for each relationship category.
sns.barplot(data=a, x='relationship', y='income')

In [None]:
# Encoded income against native country, drawn as green markers.
_countries = a["native-country"]
_income = a["income"]
plt.scatter(_countries, _income, c="g")
plt.ylabel("income")
plt.xlabel("native-country")

In [None]:
# Encoded income per race category, one point per record.
sns.stripplot(data=a, x="race", y="income")

In [None]:
# One-hot encode the listed categorical columns.
# dtype=int is requested explicitly: pandas 2.x defaults to boolean dummy
# columns, which do not support the subtraction used by the later min-max
# scaling step.
dums = pd.get_dummies(
    a,
    columns=["workclass", "education", "marital-status", "occupation"],
    dtype=int,
)
dums

In [None]:
# Remove the three categorical columns that are not used for prediction.
_unused = ["relationship", "race", "native-country"]
dums = dums.drop(_unused, axis=1)

## After performing EDA, we concluded that three columns (relationship, race and native-country) are not useful for the prediction, and that removing them will not affect it. We also converted the required categorical values into numerical ones so that correlations between the features can be computed.


### Correlations between features

In [None]:
from pylab import rcParams
# Default figure size for the plots that follow.
rcParams['figure.figsize'] = 8, 5
# Age against numeric education level, coloured by income class.
# x and y are passed by keyword: seaborn 0.12+ no longer accepts them positionally.
sns.stripplot(x='educational-num', y='age', hue='income', data=dums)

In [None]:
# Joint KDE of weekly hours and age.
# x and y are passed by keyword: seaborn 0.12+ no longer accepts them positionally.
sns.jointplot(x='hours-per-week', y='age', data=dums, kind='kde', color='y')

In [None]:
# Pairwise correlation of all encoded features, drawn as an annotated heatmap.
plt.figure(figsize=(18, 12))
corr = dums.corr()
sns.heatmap(data=corr, annot=True, cmap='Blues')

In [None]:
# Target is the encoded income column; every other column is a feature.
Y = dums['income']
X = dums.drop(columns=['income'])


In [None]:
# Min-max scale every feature into [0, 1].
_col_min = X.min()
_col_range = X.max() - _col_min
# A constant column has zero range; divide by 1 instead so it becomes all
# zeros rather than NaN from 0/0.
X = (X - _col_min) / _col_range.replace(0, 1)
X.head()

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Drop features whose variance (after min-max scaling) does not exceed 0.005.
# fit_transform returns a plain array, so column names are lost from here on.
selector = VarianceThreshold(threshold=0.005)
X = selector.fit_transform(X)
(X.shape, Y.shape)

### Applying various classifiers and comparing it
### PCA

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise the selected features before PCA.
_scaler = StandardScaler()
x_Std = _scaler.fit_transform(X)

In [None]:
from sklearn.decomposition import PCA
# Keep the first 7 principal components.
# NOTE(review): the choice of 7 is not justified in the notebook — confirm,
# e.g. against the explained variance ratio.
pca = PCA(n_components=7)

In [None]:
# Fit PCA on the standardised features and project them onto the components.
# NOTE(review): the scaler and PCA are fitted on all rows before the
# train/test split, so test rows influence the transform — confirm this is acceptable.
principalComponents = pca.fit_transform(x_Std)

In [None]:
# Wrap the component array in a DataFrame (default integer row and column labels).
principalDf = pd.DataFrame(principalComponents)

In [None]:
# Display the projected data set.
principalDf

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. The seed is fixed so that the split,
# and therefore every accuracy reported below, is reproducible between runs.
principalDf_train, principalDf_test, Y_train, Y_test = train_test_split(
    principalDf, Y, test_size=0.2, random_state=5
)

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression fitted on the PCA components; its accuracy is what the
# notebook later reports under the 'PCA' label.
# NOTE(review): the "Logistic Regression" section fits the same estimator on
# the same split (only random_state differs) — confirm both are intended.
model=LogisticRegression()
model.fit(principalDf_train,Y_train)

In [None]:
from sklearn import metrics

# Accuracy of the model above on the held-out rows, in percent.
pred = model.predict(principalDf_test)
pca_acc = 100 * metrics.accuracy_score(Y_test, pred)
pca_acc

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression on the PCA components, with a fixed seed.
reg_lr = LogisticRegression(random_state=5)
reg_lr.fit(principalDf_train,Y_train)

In [None]:
from sklearn import metrics

# Accuracy of the logistic-regression model on the held-out rows, in percent.
pred_lr = reg_lr.predict(principalDf_test)
lr_acc = 100 * metrics.accuracy_score(Y_test, pred_lr)
lr_acc

## KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier using the 15 closest training points.
model_KNN = KNeighborsClassifier(n_neighbors=15)
model_KNN.fit(principalDf_train,Y_train)

In [None]:
# Accuracy of the KNN model on the held-out rows, in percent. It is computed
# from the stored predictions, the same way as for the other models.
pred_KNN = model_KNN.predict(principalDf_test)
knn_acc = 100 * metrics.accuracy_score(Y_test, pred_KNN)
knn_acc

### Naive bayes
#### GaussianNB

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes on the PCA components.
model_gnb=GaussianNB()
model_gnb.fit(principalDf_train,Y_train)

In [None]:
# Accuracy of the Gaussian naive Bayes model on the held-out rows, in percent.
pred_gnb = model_gnb.predict(principalDf_test)
gnb_acc = 100 * metrics.accuracy_score(Y_test, pred_gnb)
gnb_acc

## BernoulliNB

In [None]:
from sklearn.naive_bayes import BernoulliNB
# Bernoulli naive Bayes on the PCA components.
# NOTE(review): the inputs are continuous component scores, while this model
# works on binarised features — confirm that the default threshold is intended.
model_bnb=BernoulliNB()
model_bnb.fit(principalDf_train,Y_train)

In [None]:
# Accuracy of the Bernoulli naive Bayes model on the held-out rows, in percent.
pred_bnb = model_bnb.predict(principalDf_test)
bnb_acc = 100 * metrics.accuracy_score(Y_test, pred_bnb)
bnb_acc

## SVM

In [None]:
from sklearn.svm import SVC
# Support-vector classifier with scikit-learn's default settings.
model_svc=SVC()
model_svc.fit(principalDf_train,Y_train)

In [None]:
# Accuracy of the SVM model on the held-out rows, in percent.
pred_svc = model_svc.predict(principalDf_test)
svc_acc = 100 * metrics.accuracy_score(Y_test, pred_svc)
svc_acc

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# Decision tree with the default (Gini) criterion. The seed is fixed (same
# value as the logistic-regression model) so the fitted tree and its accuracy
# are reproducible.
model_tree = DecisionTreeClassifier(random_state=5)
model_tree.fit(principalDf_train,Y_train)

In [None]:
# Accuracy of the decision tree on the held-out rows, in percent.
pred_tree = model_tree.predict(principalDf_test)
tree_acc = 100 * metrics.accuracy_score(Y_test, pred_tree)
tree_acc

## Entropy

In [None]:
# Decision tree using the entropy criterion. The seed is fixed so the fitted
# tree and its accuracy are reproducible.
model_tree1 = DecisionTreeClassifier(criterion="entropy", random_state=5)
model_tree1.fit(principalDf_train,Y_train)


In [None]:
# Accuracy of the entropy-based decision tree on the held-out rows, in percent.
pred_tree1 = model_tree1.predict(principalDf_test)
tree1_acc = 100 * metrics.accuracy_score(Y_test, pred_tree1)
tree1_acc

### RandomForest

In [None]:
# Random forest with at most 14 leaves per tree. The seed is fixed so the
# bootstrap samples, and therefore the reported accuracy, are reproducible.
b = RandomForestClassifier(max_leaf_nodes=14, random_state=5)
b.fit(principalDf_train,Y_train)
b_pred = b.predict(principalDf_test)
# Accuracy on the held-out rows, in percent.
bacc=metrics.accuracy_score(Y_test,b_pred)*100
bacc

#### Entropy

In [None]:
# Entropy-criterion random forest with at most 14 leaves per tree. The seed is
# fixed so the reported accuracy is reproducible.
b1 = RandomForestClassifier(criterion="entropy", max_leaf_nodes=14, random_state=5)
b1.fit(principalDf_train,Y_train)
b1_pred = b1.predict(principalDf_test)
# Accuracy on the held-out rows, in percent.
b1acc=metrics.accuracy_score(Y_test,b1_pred)*100
b1acc

### Ensemble methods
#### BaggingClassifier

In [None]:
from sklearn.ensemble import BaggingClassifier
# Bagging ensemble in which each base estimator sees half of the samples and
# half of the features. The seed is fixed so those random draws, and the
# reported accuracy, are reproducible.
bagging = BaggingClassifier(max_samples=0.5, max_features=0.5, random_state=5)
bagging.fit(principalDf_train,Y_train)
pred_E_BC = bagging.predict(principalDf_test)
# Accuracy on the held-out rows, in percent.
bc=metrics.accuracy_score(Y_test,pred_E_BC)*100
bc

#### GradientBoostingClassifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics
# Gradient boosting with 200 stages and a learning rate of 0.02. The seed is
# fixed so the reported accuracy is reproducible.
model_E_GBC = GradientBoostingClassifier(n_estimators=200, learning_rate=.02, random_state=5)
model_E_GBC.fit(principalDf_train,Y_train)
pred_E_GBC = model_E_GBC.predict(principalDf_test)
# Accuracy on the held-out rows, in percent.
gbcacc = metrics.accuracy_score(Y_test,pred_E_GBC)*100
gbcacc

#### VotingClassifier

In [None]:
from sklearn.ensemble import VotingClassifier
# Voting ensemble over a decision tree, a logistic regression and an SVM,
# using VotingClassifier's default voting mode. The tree is seeded so the
# ensemble's accuracy is reproducible.
model1 = DecisionTreeClassifier(random_state=5)
model2 = LogisticRegression()
model3 = SVC()
model_E_VC = VotingClassifier(estimators=[('DT',model1),('LR',model2),('SVC',model3)])
model_E_VC.fit(principalDf_train,Y_train)
pred_E_VC = model_E_VC.predict(principalDf_test)
# Accuracy on the held-out rows, in percent.
vc=metrics.accuracy_score(Y_test,pred_E_VC)*100
vc

In [None]:
# Pair each short algorithm label with its accuracy so the two lists cannot
# drift out of step, then split them back into the names used for plotting.
_results = [
    ('PCA', pca_acc),
    ('LR', lr_acc),
    ('KNN', knn_acc),
    ('GNB', gnb_acc),
    ('BNB', bnb_acc),
    ('SVM', svc_acc),
    ('DT', tree_acc),
    ('EDT', tree1_acc),
    ('RF', bacc),
    ('ERF', b1acc),
    ('BC', bc),
    ('GBC', gbcacc),
    ('VC', vc),
]
accuracyScore = [_score for _, _score in _results]
algoName = [_label for _label, _ in _results]

In [None]:
# One marker per algorithm: label on the x axis, accuracy in percent on the y axis.
plt.scatter(x=algoName, y=accuracyScore)
plt.title('Algorithm Accuracy Comparision')
plt.xlabel('Algorithm')
plt.ylabel('Score in %')
plt.grid()
plt.show()

## Conclusion: the highest accuracy obtained was 83.28%, achieved by the SVM model.

