In [None]:

# #### Project Objective:
# * Project Objective
# * The goal of this project is to give you experience with a real-life classification task, one of the most practical and commonly encountered types of data-related tasks. It will give you experience applying approaches such as exploratory data analysis, visualization, and preprocessing, and in selecting and evaluating various classification approaches.
# * The task is to predict whether a person’s income exceeds $50,000 per year based on demographic data such as age, education, occupation, etc.
# 
# 
# * test.csv: 24,421 rows
# * train.csv: 24,421 rows
# 
# Most of the entries in test.csv also appear in train.csv. train.csv has one additional column: exceeds50K.
# 
# Each entry in train.csv contains the following information about an individual:
# 
# age: the age of an individual (number greater than 0)
# workclass: the employment status of an individual
# Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked.
# fnlwgt: final weight, i.e. the number of people the data believes the entry represents.
# (number greater than 0)
# 
# 
# education: the highest level of education achieved by an individual.
# Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool.
# 
# education-num: the highest level of education achieved, in numerical form.
# (number greater than 0)
# 
# marital-status: marital status of an individual. Married-civ-spouse corresponds to a civilian spouse, while Married-AF-spouse is a spouse in the Armed Forces.
# Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse.
# 
# occupation: the general type of occupation of an individual
# Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces.
# 
# relationship: what this individual is relative to others; for example, an individual could be a Husband. Each entry has only one relationship attribute, and it is somewhat redundant with marital status, so we might not make use of this attribute at all.
# Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried.
# 
# race: description of an individual’s race
# White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.
# 
# sex: Male, Female
# 
# capital-gain: capital gains for an individual (number greater than or equal to 0)
# 
# capital-loss: capital loss for an individual (number greater than or equal to 0)
# 
# hours-per-week: the number of hours an individual has reported working per week (continuous).
# 
# native-country: country of origin of an individual
# United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.
# 
# the label: whether or not someone makes more than $50,000 annually. <=50k, >50k
# 

# ## Load Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from collections import Counter

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve, train_test_split, KFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

sns.set(style='white', context='notebook', palette='deep')

from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the training split (which carries the exceeds50K label) and the test
# split from the Kaggle input directory, in that order.
dataset, dataset_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/finalproject/train.csv', '../input/finalproject/test.csv')
)

In [None]:
# Preview the first five training rows to sanity-check the load.
dataset.head(n=5)

In [None]:
# Print the column names, dtypes and non-null counts of the training frame
# (useful for spotting missing values and which columns are categorical).
dataset.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns of the training frame.
dataset.describe()

In [None]:
# Numeric columns. The label 'exceeds50K' is deliberately kept in this list so
# that the correlation heatmap shows how each numeric feature relates to it.
numeric_features = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week', 'exceeds50K']
# Categorical (non-numeric) columns. 'race' was missing from the original list
# even though the data description lists it as a categorical attribute.
cat_features = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']

### Exploratory analysis: features vs exceeds50K

In [None]:
# Class balance of the label: number of training rows per value of exceeds50K.
# The column is passed via `x=` because current seaborn treats the first
# positional argument as `data`, and the y axis is labelled explicitly
# (`label=` only sets a legend label, not the axis label).
ax = sns.countplot(x='exceeds50K', data=dataset)
ax.set_ylabel('Count')
plt.show()

In [None]:
# Pairwise correlations between the columns listed in numeric_features,
# drawn as an annotated heatmap with two-decimal cell labels.
corr_matrix = dataset[numeric_features].corr()
g = sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm")
plt.show()

In [None]:
# Explore Education Num vs Income
# Each bar is the mean of exceeds50K within one education-num level (read as
# the share of >50K earners, assuming the label is coded 0/1).
# `catplot`/`height` replace the deprecated `factorplot`/`size`.
g = sns.catplot(x="education-num", y="exceeds50K", data=dataset, kind="bar", height=6, palette="muted")
g.despine(left=True)
g = g.set_ylabels(">50K probability")
plt.show()

In [None]:
# Explore Hours Per Week vs Income
# Each bar is the mean of exceeds50K for one hours-per-week value. The large
# height keeps the many distinct x values legible.
# `catplot`/`height` replace the deprecated `factorplot`/`size`.
g = sns.catplot(x="hours-per-week", y="exceeds50K", data=dataset, kind="bar", height=20, palette="muted")
g.despine(left=True)
g = g.set_ylabels(">50K probability")
plt.show()

In [None]:
# Explore Age vs Income
# One facet per value of exceeds50K, each showing the age distribution as a
# density histogram with a KDE overlay. `histplot(kde=True, stat="density")`
# replaces the deprecated `distplot`, which drew the same two layers.
g = sns.FacetGrid(dataset, col='exceeds50K')
g = g.map(sns.histplot, "age", kde=True, stat="density")
plt.show()

In [None]:
# Explore Native Nation vs Income
# Each bar is the mean of exceeds50K for one native-country value. The very
# large height gives the many country labels room on the x axis.
# `catplot`/`height` replace the deprecated `factorplot`/`size`.
g = sns.catplot(x="native-country", y="exceeds50K", data=dataset, kind="bar", height=40, palette="muted")
g.despine(left=True)
g = g.set_ylabels(">50K probability")
plt.show()

In [None]:
# Explore Sex vs Income
# Axes-level bar plot: one bar per sex, bar height = mean of exceeds50K.
g = sns.barplot(data=dataset, x="sex", y="exceeds50K")
# set_ylabel returns the label's Text object, so `g` no longer refers to the
# Axes after this line.
g = g.set_ylabel("Income >50K Probability")
plt.show()

In [None]:
# Explore Relationship vs Income
# Each bar is the mean of exceeds50K within one relationship category.
# `catplot`/`height` replace the deprecated `factorplot`/`size`.
g = sns.catplot(x="relationship", y="exceeds50K", data=dataset, kind="bar", height=6,
                palette="muted")
g.despine(left=True)
g = g.set_ylabels("Income >50K Probability")
plt.show()

In [None]:
# Explore Marital Status vs Income
# Each bar is the mean of exceeds50K within one marital-status category.
# `catplot`/`height` replace the deprecated `factorplot`/`size`.
g = sns.catplot(x="marital-status", y="exceeds50K", data=dataset, kind="bar", height=10,
                palette="muted")
g.despine(left=True)
g = g.set_ylabels("Income >50K Probability")
plt.show()

In [None]:
# Explore Workclass vs Income
# Each bar is the mean of exceeds50K within one workclass category.
# `catplot`/`height` replace the deprecated `factorplot`/`size`.
g = sns.catplot(x="workclass", y="exceeds50K", data=dataset, kind="bar", height=10,
                palette="muted")
g.despine(left=True)
g = g.set_ylabels("Income >50K Probability")
plt.show()

In [None]:
# Build the one-hot encoded design matrix from the predictor columns only.
# The label is dropped before encoding: leaving 'exceeds50K' inside x_train
# leaks the target into the features (cross-validation scores become
# meaningless) and creates a column that the test set does not have.
x_train = pd.get_dummies(dataset.drop(columns=['exceeds50K']))
x_train.head(5)

In [None]:
# Target vector. Selected as a 1-D Series (single brackets) rather than a
# one-column DataFrame, because scikit-learn estimators expect y to be 1-D and
# emit a DataConversionWarning for column vectors.
y_train = dataset['exceeds50K']
y_train.head(5)

In [None]:
# Tune a random forest with an exhaustive grid search scored by accuracy under
# 10-fold cross-validation. The grid covers n_estimators 20..29 and
# max_depth 3..9, i.e. 70 combinations x 10 folds = 700 fits.
clf = RandomForestClassifier(random_state=10, max_features='sqrt')
# The 'classify' step name is what the 'classify__*' grid keys refer to.
pipe = Pipeline([('classify', clf)])
param = {'classify__n_estimators': list(range(20, 30)),
         'classify__max_depth': list(range(3, 10))}
# n_jobs=-1 runs the independent fits on all cores; results are unchanged
# because the forest's random_state is fixed.
grid = GridSearchCV(estimator=pipe, param_grid=param, scoring='accuracy', cv=10, n_jobs=-1)
# scikit-learn expects a 1-D target; np.ravel flattens y_train whether it is a
# Series or a one-column DataFrame.
grid.fit(x_train, np.ravel(y_train))
print(grid.best_params_)
print(grid.best_score_)

In [None]:
# One-hot encode the test set and align it to the training feature columns.
# get_dummies only creates columns for categories present in the frame it is
# given, so a category seen in just one split would otherwise give x_test a
# different set/order of columns than the model was fitted on. Categories
# missing from the test set become all-zero columns; categories unseen during
# training are dropped. The label is excluded from the target column list
# defensively in case it is present in x_train.
feature_columns = x_train.columns.drop('exceeds50K', errors='ignore')
x_test = pd.get_dummies(dataset_test).reindex(columns=feature_columns, fill_value=0)

In [None]:
# Predict the label for every test row with the best estimator found by the
# grid search (GridSearchCV refits it on the full training data by default).
y_test = grid.predict(x_test)
# The original passed the Python builtin function `id` here, which filled the
# column with the function object instead of row identifiers.
# NOTE(review): the test schema is not visible here - use an 'id' column when
# the file provides one, otherwise fall back to the row index; confirm which
# identifier the submission format expects.
row_ids = np.asarray(dataset_test['id'] if 'id' in dataset_test.columns else dataset_test.index)
submission = pd.DataFrame({'id': row_ids, 'prediction': y_test.astype(np.int32)})
submission.to_csv('submission.csv', index=False)