In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

In [None]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the income-evaluation dataset from the Kaggle input directory.
df=pd.read_csv('../input/income-classification/income_evaluation.csv')
# A bare expression in the middle of a cell is evaluated and then discarded,
# so the column names are printed explicitly to make them visible.
print(df.columns.tolist())
print(df.shape)
# Last expression of the cell: rendered as the cell output.
df.head()

In [None]:
# Print pandas' summary of the frame (column names, dtypes and non-null counts).
df.info()

In [None]:
# The column names carry a space at their start, so strip every space out of them.
df.columns=[column_name.replace(' ','') for column_name in df.columns]
# Show the cleaned column index.
df.columns

In [None]:
# Distinct values of the 'workclass' column.
df['workclass'].unique()

In [None]:
# Distinct values of the 'education' column, as a plain Python list.
df['education'].unique().tolist()

In [None]:
# Distinct values of the 'education-num' column.
df['education-num'].unique()

In [None]:
# Distinct values of the 'marital-status' column.
df['marital-status'].unique()

In [None]:
# Distinct values of the 'occupation' column.
df['occupation'].unique()

In [None]:
# Count the rows in each income class and keep the result as a two-column frame.
income_data=(
    df['income']
    .value_counts()
    .rename_axis('income')
    .reset_index(name='frequency')
)

# Bar chart of the class frequencies: the classes are imbalanced, with a huge difference.
plt.bar(x=income_data['income'],height=income_data['frequency'],color=['red','pink']);

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Descriptive statistics for the frame's columns (pandas defaults).
df.describe()

In [None]:
# Distribution of 'fnlwgt' per income class; the plot shows outliers.
sns.boxplot(data=df,x='income',y='fnlwgt');

### Let's prepare the data for the model

In [None]:
# X=df[['age','fnlwgt','education-num','sex','capital-gain','capital-loss','hours-per-week']]
# # x.info()
# X['sex']=X['sex'].astype("category")

# here is the 'sex' column which is categorical so, we have to label this column...

There are two main encoding approaches for categorical variables:

one is **label-encoding** and the other is **one-hot-encoding**.

**Remember to perform encoding after the train/test split.**

##### If the categories have some kind of order and there are many of them, use label-encoding
##### If the categories have no order and there are comparatively few of them,
##### choose one-hot-encoding
**There are different methods for both encodings in Python**


### label-encoding method # 01

In [None]:
##### x.info()
##### x['sex'] = x['sex'].cat.codes
##### x['sex']

### label-encoding method # 02 (using sklearn)

In [None]:
# from sklearn.preprocessing import LabelEncoder
# labelencoder=LabelEncoder()
# x['sex'] = labelencoder.fit_transform(x['sex'])
# x['sex'][1:30]

### one-hot-encoding method # 01 (using sklearn)

In [None]:
# Note: older scikit-learn releases required numeric input for OneHotEncoder; recent releases also accept string categories directly

# x['sex'] = x['sex'].cat.codes
# from sklearn.preprocessing import OneHotEncoder
# enc = OneHotEncoder(handle_unknown='ignore')
# enc_df=pd.DataFrame(enc.fit_transform(df[['sex']]).toarray())
# df = df.join(enc_df)
# df.head()

### one-hot-encoding method # 02 (using get_dummies methods)

In [None]:
# x1 = pd.DataFrame(x['sex'], columns=['sex'])
# # generate binary values using get_dummies
# dum_df = pd.get_dummies(x1, columns=["sex"], prefix=["Type_is"] )
# # merge with main df bridge_df on key values
# x = x.join(dum_df)
# x

Some algorithms, such as decision trees, perform well on categorical variables, but others
require categorical variables to be converted to numerical form.

In [None]:
# Prepare the features and target for the models.

# Take an explicit copy of the selected columns so that the assignment below
# modifies X itself and does not trigger pandas' SettingWithCopyWarning.
X=df[['age','fnlwgt','education-num','sex','capital-gain','capital-loss','hours-per-week']].copy()
X['sex']=X['sex'].astype("category")
Y=df['income']
X_train,X_test,y_train,y_test=train_test_split(X,Y,random_state=1,train_size=0.75)

# Copy the splits before writing to them, for the same reason as above.
X_train=X_train.copy()
X_test=X_test.copy()

# Encode 'sex' with its own encoder, fitted on the training split only and
# then applied unchanged to the test split.
sex_encoder=LabelEncoder()
X_train['sex']=sex_encoder.fit_transform(X_train['sex'])
X_test['sex']=sex_encoder.transform(X_test['sex'])

# Use a separate encoder for the target: refitting a single encoder on the
# income labels would discard the 'sex' classes it had learned.
labelencoder=LabelEncoder()
y_train=labelencoder.fit_transform(y_train)
y_test=labelencoder.transform(y_test)

In [None]:
# Compare the distribution of the encoded 'sex' feature in the train and test splits.
fig,ax=plt.subplots(1,2,figsize=(10, 3))
# Pass the values via the x= keyword: recent seaborn versions treat the first
# positional argument as `data`, not as the variable to count.
sns.countplot(x=X_train['sex'],label='count',ax=ax[0])
ax[0].set_title('Train data')

sns.countplot(x=X_test['sex'],label='count',ax=ax[1])
ax[1].set_title('test data')
plt.show()

In [None]:
# Compare the distribution of the encoded income label in the train and test splits.
fig,ax=plt.subplots(1,2,figsize=(10, 3))
# Pass the values via the x= keyword: recent seaborn versions treat the first
# positional argument as `data`, not as the variable to count.
sns.countplot(x=y_train,label='count',ax=ax[0])
ax[0].set_title('y_train')
sns.countplot(x=y_test,label='count',ax=ax[1])
ax[1].set_title('y_test')
plt.show()

### LogisticRegression

In [None]:
# Fit a logistic regression with scikit-learn's default settings on the
# training split; the fitted estimator is the value displayed by the cell.
# NOTE(review): warnings are silenced in the imports cell, so a solver
# convergence warning (if any) would not be shown here - worth confirming.
lr=LogisticRegression()
lr.fit(X_train, y_train)

In [None]:
# Predict the encoded income label for the held-out test features.
y_pred=lr.predict(X_test)

In [None]:
# Confusion matrix of the test labels against the current predictions.
cm=confusion_matrix(y_test,y_pred)
print('Confusion Matrix : \n',cm)
# fmt='d' writes the annotations as plain integers; the default format
# would show large counts in scientific notation.
sns.heatmap(cm,annot=True,fmt='d');
print('\n')
# Per-class precision, recall and F1 for the same predictions.
print('classification_report : \n',classification_report(y_test,y_pred))

### Recall or True Positive Rate (TPR) or Sensitivity or Probability of detection
##### When it's actually yes, how often does it predict yes?
##### Recall = TP / (TP + FN), where TP + FN is the number of actual positives

### precision
##### When it predicts yes, how often is it correct? In other words, what fraction of positive predictions
##### are correct?
##### Precision = TP / (TP + FP), where TP + FP is the number of predicted positives

### True Negative Rate (TNR) or Specificity
##### When it's actually no, how often does it predict no?
##### TNR = TN / (TN + FP)

### False Positive Rate (FPR) or 1 - specificity
##### When it's actually no, how often does it predict yes? In other words, what fraction of all negative
##### instances does the classifier incorrectly identify as positive?
##### FPR = FP / (FP + TN)

### Misclassification Rate
##### Overall, how often is it wrong (overall incorrect predictions)?
##### Misclassification rate = (FP + FN) / total

### F1-score
##### It is difficult to compare two models when one has low precision and high recall and the other the reverse.
##### It is also difficult to choose between precision and recall for the project.
##### To make such models comparable, we use the F1-score. The F1-score measures recall and precision
##### at the same time. It uses the harmonic mean instead of the arithmetic mean, which punishes extreme
##### values more. It is maximum when precision is equal to recall.
##### F1 = 2 * (precision * recall) / (precision + recall)

In [None]:
# Accuracy = (TP + TN) / total, i.e. (cm[0,0] + cm[1,1]) / cm.sum()
accuracy_pct=round(accuracy_score(y_test,y_pred)*100,2)
print('Accuracy: ',accuracy_pct)

# Misclassification rate = (FP + FN) / total: how often the model is wrong overall.
wrong_predictions=cm[0,1]+cm[1,0]
print('Missclassification Rate: ',round(wrong_predictions/cm.sum()*100,2))

In [None]:
# Read the cells of the confusion matrix (rows: actual class, columns: predicted class).
true_pos,false_neg,false_pos=cm[1,1],cm[1,0],cm[0,1]

# Recall = TP / (TP + FN): share of the actual positives that were found.
print('Recall: ',round((true_pos/(true_pos+false_neg))*100,2))

# Precision = TP / (TP + FP): share of the positive predictions that are correct.
print('Precision: ',round((true_pos/(true_pos+false_pos))*100,2))

In [None]:
# How many test samples the model predicted as class 0 and as class 1.
pd.Series(y_pred).value_counts() # predicted by the model 0 and 1 sample

In [None]:
# How many test samples actually belong to class 0 and to class 1.
pd.Series(y_test).value_counts() # test 0 and 1 sample

In [None]:
# Total number of test samples, obtained by summing the per-class counts.
pd.Series(y_test).value_counts().sum() # total sample

### SVM

In [None]:
# Support vector classifier with an RBF kernel; other settings are left at their defaults.
svclassifier=SVC(kernel='rbf')

In [None]:
# Train the SVM on the training split (features are not scaled at this point).
svclassifier.fit(X_train, y_train)

In [None]:
# Predict the test labels with the SVM; this overwrites the earlier y_pred.
y_pred=svclassifier.predict(X_test)

In [None]:
# Confusion matrix for the SVM predictions (replaces the earlier cm).
cm=confusion_matrix(y_true=y_test,y_pred=y_pred)
print('confusion_matrix:\n',cm)

In [None]:
# Accuracy of the current predictions, as a percentage rounded to two decimals.
svm_accuracy_pct=round(accuracy_score(y_test,y_pred)*100,2)
print('Accuracy:',svm_accuracy_pct)

### Cross Validation

In [None]:
# Rebuild the full feature matrix and target for cross-validation.
# Copy the selected columns so the assignment below writes to X itself and
# does not trigger pandas' SettingWithCopyWarning.
X=df[['age','fnlwgt','education-num','sex','capital-gain','capital-loss','hours-per-week']].copy()
# Encode 'sex' with a dedicated encoder so that `labelencoder` below is only
# ever fitted on the target labels.
X['sex']=LabelEncoder().fit_transform(X['sex'])
labelencoder=LabelEncoder()
Y=labelencoder.fit_transform(df['income'])

In [None]:
# 10-fold cross-validation of an RBF-kernel SVM on the full data set.
kfold_val=KFold(n_splits=10)
svclassifier=SVC(kernel='rbf')
cross_val_result=cross_val_score(estimator=svclassifier,X=X,y=Y,cv=kfold_val)
# One score per fold.
cross_val_result

In [None]:
# Summarise the fold scores: mean, worst fold and best fold.
for summary_label,summary_value in (
    ('average accuracy : ',cross_val_result.mean()),
    ('min. accuracy : ',cross_val_result.min()),
    ('max. accuracy : ',cross_val_result.max()),
):
    print(summary_label,summary_value)

In [None]:
# Stratified 5-fold cross-validation; stratification is used when the data
# contains imbalanced classes.
stratified=StratifiedKFold(n_splits=5)
cross_val_result=cross_val_score(estimator=svclassifier,X=X,y=Y,cv=stratified)
# One score per fold.
cross_val_result

In [None]:
# Summarise the fold scores: mean, worst fold and best fold.
mean_score=cross_val_result.mean()
worst_score=cross_val_result.min()
best_score=cross_val_result.max()
print('average accuracy : ',mean_score)
print('min. accuracy : ',worst_score)
print('max. accuracy : ',best_score)

### Normalization and GridSearch 

In [None]:
# Learn the min-max scaling from the training split only, then apply it to both splits.
scaler=MinMaxScaler().fit(X_train)
Scaled_X_train=scaler.transform(X_train)
Scaled_X_test=scaler.transform(X_test)

# Candidate SVM settings: four values of C combined with two kernels.
grid_param=dict(C=[0.5,1.0,10.0,100.0],kernel=['rbf','sigmoid'])
gridsvclassifier=GridSearchCV(estimator=SVC(),param_grid=grid_param)
gridsvclassifier.fit(Scaled_X_train, y_train)

# Evaluate the searched model on the scaled test split.
y_pred=gridsvclassifier.predict(Scaled_X_test)
cm=confusion_matrix(y_true=y_test,y_pred=y_pred)

In [None]:
# Report the test-set results followed by what the grid search selected.
print('confusion_matrix:\n',cm)
grid_report=(
    ('\nAccuracy:',round(accuracy_score(y_test,y_pred)*100,2)),
    ('\nBest Parameters:',gridsvclassifier.best_params_),
    ('\nBest Estimator:',gridsvclassifier.best_estimator_),
    ('\nBest Score:',gridsvclassifier.best_score_),
    ('\nBest Index:',gridsvclassifier.best_index_),
)
for report_label,report_value in grid_report:
    print(report_label,report_value)

### Decision Tree

In [None]:
# Decision tree on the unscaled training split.
# random_state is fixed (same seed as the train/test split) so that re-running
# the notebook reproduces the same tree and the same scores.
treeclr=DecisionTreeClassifier(random_state=1)
treeclr.fit(X_train, y_train)

In [None]:
# Evaluate the decision tree on the unscaled test split.
y_pred=treeclr.predict(X_test)
tree_cm=confusion_matrix(y_true=y_test,y_pred=y_pred)
tree_accuracy_pct=round(accuracy_score(y_test,y_pred)*100,2)
print('confusion_matrix:\n',tree_cm)
print('\naccuracy:',tree_accuracy_pct)

### Decision Tree with Normalize data

In [None]:
# Decision tree on the min-max-scaled training split.
# random_state is fixed so the run is reproducible and any difference from
# the unscaled tree is not just seed noise.
treeclr_scaled=DecisionTreeClassifier(random_state=1)
treeclr_scaled.fit(Scaled_X_train, y_train)

In [None]:
# Evaluate the scaled-data decision tree on the scaled test split.
y_pred1=treeclr_scaled.predict(Scaled_X_test)
tree_cm1=confusion_matrix(y_true=y_test,y_pred=y_pred1)
scaled_tree_accuracy_pct=round(accuracy_score(y_test,y_pred1)*100,2)
print('confusion_matrix:\n',tree_cm1)
print('\nAccuracy:',scaled_tree_accuracy_pct)

### Random Forest

In [None]:
# Random forest on the unscaled training split.
# random_state is fixed (same seed as the train/test split) so the forest,
# its scores and its feature importances are reproducible across runs.
model=RandomForestClassifier(random_state=1)
model.fit(X_train,y_train)
# Predictions on the test split; this overwrites the earlier y_pred1.
y_pred1=model.predict(X_test)

In [None]:
# Confusion matrix and accuracy for the random forest predictions.
cm2=confusion_matrix(y_true=y_test,y_pred=y_pred1)
forest_accuracy_pct=round(accuracy_score(y_test,y_pred1)*100,2)
print('confusion_matrix:\n',cm2)
print('\naccuracy:',forest_accuracy_pct)

In [None]:
# Feature importances of the fitted forest, as percentages rounded to two decimals.
importance_value=np.round((model.feature_importances_)*100,2)
# Pair each feature name with its importance and sort from most to least important.
importance_features=sorted(zip(X_train.columns,importance_value),key=lambda pair: pair[1],reverse=True)
# A plain loop is used for printing: a list comprehension run only for its
# side effects builds and throws away a list of None values.
for feature,importance in importance_features:
    print('Variable: {:20} Importance: {}'.format(feature,importance))

In [None]:
# use magic command for Jupyter Notebooks
%matplotlib inline
# Set the style
plt.style.use('fivethirtyeight')
# list of x locations for plotting
x_values = list(range(len(importance_value)))
# Make a bar chart
plt.bar(x_values, importance_value, orientation = 'vertical')
# Tick labels for x axis
plt.xticks(x_values, X_train.columns, rotation='vertical')
# Axis labels and title
plt.ylabel('Importance'); plt.xlabel('Variable'); plt.title('Variable Importances');

In [None]:
# The same ranking can also be shown from a DataFrame.
importance_data=pd.DataFrame(importance_features,columns=['features','importance'])
# A bare expression in the middle of a cell is not rendered, so the table is
# displayed explicitly (display() is provided by the IPython kernel in notebooks).
display(importance_data)

# Horizontal bar chart; the y-axis is inverted so the first row of the table
# is drawn at the top.
plt.figure(figsize=(8,5))
plt.barh(importance_data['features'],importance_data['importance'])
plt.gca().invert_yaxis()
plt.show()

So, among the algorithms tried, Random Forest and the Support Vector Machine with an rbf kernel and C=100.0 performed well.