# Data Analysis And Machine Learning using Campus Placement Dataset

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set() 

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
my_filepath = "../input/factors-affecting-campus-placement/Placement_Data_Full_Class.csv"

In [None]:
my_data = pd.read_csv(my_filepath, index_col="sl_no")

In [None]:
my_data.head()

In [None]:
my_data.shape

In [None]:
# Call info() so the column dtypes and non-null counts are printed; the bare
# attribute `my_data.info` only displays the bound-method repr.
my_data.info()

In [None]:
my_data.describe()

In [None]:
my_data.describe(include=object)

In [None]:
my_data.isna().sum()

Missing (NaN) values in salary are replaced with 0, since these candidates were not placed.

In [None]:
# Replace missing salaries with 0; only the "salary" column is filled.
my_data = my_data.fillna({"salary": 0})

In [None]:
my_data.columns

In [None]:
# Absolute and relative frequency of each placement status.
status_counts = my_data.status.value_counts()
status_share = my_data.status.value_counts(normalize=True)
print(status_counts, "\n", status_share)

In [None]:

my_data.status.value_counts().plot.bar(title='Count of employed and unemployed candidates')

* We see that the majority of the candidates are employed.

Checking the number of candidates corresponding to the various categories  (categorical variables) below:

In [None]:
# One bar chart per categorical column, laid out as three panels in a single row.
plt.figure(1)
panels = [
    (131, "gender", "Gender"),
    (132, "ssc_b", "S S Board of Education"),
    (133, "hsc_b", "Higher School Board of Education"),
]
for position, column, panel_title in panels:
    plt.subplot(position)
    my_data[column].value_counts().plot.bar(figsize=(10,8), title=panel_title)
plt.show()


* Most of the candidates are males.


In [None]:
# Number of students in each higher-secondary specialization.
plt.figure()
hsc_stream_counts = my_data.hsc_s.value_counts()
hsc_stream_counts.plot.bar(
    figsize=(8,6),
    title="Count of students per Specialization in Higher Secondary",
)
plt.show()

* Majority of students studied Commerce in Higher Secondary School.

In [None]:
my_data.degree_t.value_counts().plot.bar(figsize=(8,6),title="Count of students per UnderGrad Degree type")

* Most of the Candidates studied Comm&Mgmt in their undergraduate studies.


In [None]:
my_data.workex.value_counts().plot.bar(figsize=(8,6), title ="Count of students per Work experience")

* Most of the candidates had no prior work experience.

In [None]:
my_data.specialisation.value_counts().plot.bar(figsize=(8,6), title ="Count of students per Post Graduation(MBA)- Specialization")

* The preferred choice of postgraduate studies amongst most of the students is Marketing and Finance.

In [None]:
# Four side-by-side views of the secondary-school percentage (ssc_p):
# histogram, overall box plot, then box and swarm plots split by placement status.
ssc_scores = my_data.ssc_p
placement = my_data.status

plt.figure(figsize=(16, 6))

plt.subplot(1, 4, 1)
plt.title("Distribution of Secondary School Performance")
sns.distplot(a=ssc_scores, kde=False)
plt.xlabel("Secondary School Exam Percentage scored")
plt.ylabel("Number of candidates ")

plt.subplot(1, 4, 2)
sns.boxplot(x=ssc_scores)

plt.subplot(1, 4, 3)
sns.boxplot(x=placement, y=ssc_scores)

plt.subplot(1, 4, 4)
sns.swarmplot(x=placement, y=ssc_scores)

plt.show()

* A fairly normal distribution of candidates' secondary school exam scores.
* Most of the candidates performed well.
* Majority of the candidates scored between 60 -70%
* Candidates that scored below 49 happen to be unemployed.
* Students that scored around 80 and above got hired.
* There are students who scored fairly good grades but are unemployed.

In [None]:

# Four side-by-side views of the higher-secondary percentage (hsc_p):
# histogram, overall box plot, then box and swarm plots split by placement status.
hsc_scores = my_data.hsc_p
placement = my_data.status

plt.figure(figsize=(16, 6))

plt.subplot(1, 4, 1)
plt.title("Distribution of High School Performance")
sns.distplot(a=hsc_scores, kde=False)
plt.xlabel("High School Exam Percentage scored")
plt.ylabel("Number of candidates ")

plt.subplot(1, 4, 2)
sns.boxplot(x=hsc_scores)

plt.subplot(1, 4, 3)
sns.boxplot(x=placement, y=hsc_scores)

plt.subplot(1, 4, 4)
sns.swarmplot(x=placement, y=hsc_scores)

plt.show()

* Most of the students scored between 60% and 70% in High school
* Students that scored below 50% are not hired.
* Students that score very high marks are employed; however, there are student(s) that scored as high as 80% or more but are not hired.

In [None]:
# Four side-by-side views of the undergraduate percentage (degree_p):
# histogram, overall box plot, then box and swarm plots split by placement status.
degree_scores = my_data.degree_p
placement = my_data.status

plt.figure(figsize=(16, 6))

plt.subplot(1, 4, 1)
plt.title("Distribution of UnderGrad Performance")
sns.distplot(a=degree_scores, kde=False)
plt.xlabel("UnderGrad Exam Percentage scored")
plt.ylabel("Number of candidates")

plt.subplot(1, 4, 2)
sns.boxplot(x=degree_scores)

plt.subplot(1, 4, 3)
sns.boxplot(x=placement, y=degree_scores)

plt.subplot(1, 4, 4)
sns.swarmplot(x=placement, y=degree_scores)

plt.show()

* Most of the students performed well in their undergraduate studies.
* Students that scored below 55% are not hired, while those that scored very high grades are hired.
* There is a good number of candidates that performed well but could not land jobs.

In [None]:
# Four side-by-side views of the employability-test percentage (etest_p):
# histogram, overall box plot, then box and swarm plots split by placement status.
etest_scores = my_data.etest_p
placement = my_data.status

plt.figure(figsize=(16, 6))

plt.subplot(1, 4, 1)
plt.title("Employability test percentage (conducted by college)")
sns.distplot(a=etest_scores, kde=False)
plt.xlabel("Employability Exam score")
plt.ylabel("Number of candidates")

plt.subplot(1, 4, 2)
sns.boxplot(x=etest_scores)

plt.subplot(1, 4, 3)
sns.boxplot(x=placement, y=etest_scores)

plt.subplot(1, 4, 4)
sns.swarmplot(x=placement, y=etest_scores)

plt.show()

* The professional test does not influence the possibility of being employed much, because students that are unemployed did not perform poorly.

In [None]:
# Four side-by-side views of the MBA percentage (mba_p):
# histogram, overall box plot, then box and swarm plots split by placement status.
mba_scores = my_data.mba_p
placement = my_data.status

plt.figure(figsize=(16, 6))

plt.subplot(1, 4, 1)
plt.title("Distribution of MBA percentage")
sns.distplot(a=mba_scores, kde=False)

plt.subplot(1, 4, 2)
sns.boxplot(x=mba_scores)

plt.subplot(1, 4, 3)
plt.title("Distribution of MBA percentage per placement status")
sns.boxplot(x=placement, y=mba_scores)

plt.subplot(1, 4, 4)
sns.swarmplot(x=placement, y=mba_scores)

plt.show()

* Most of the students scored between 57% - 66%. 
* Students that did not land jobs did not perform terribly in their graduate studies.

In [None]:
# Four side-by-side views of salary: histogram, overall box plot, then box and
# swarm plots split by gender. Unplaced candidates appear with salary 0 because
# missing salaries were filled with 0 earlier in the notebook.
plt.figure(figsize=(16,6))
plt.subplot(141)
plt.title("Distribution of salary")
sns.distplot(a=my_data.salary,kde=False)
plt.subplot(142)
sns.boxplot(x=my_data.salary)
plt.subplot(143)
# The grouping variable on the x-axis is gender, so the title says so
# (it previously read "per placement status").
plt.title("Distribution of salary per gender")
sns.boxplot(x=my_data.gender, y=my_data.salary)
plt.subplot(144)
sns.swarmplot(x=my_data.gender, y=my_data.salary)

plt.show()

* This distribution is skewed to the right.
* The presence of outliers is due to the fact that few of the candidates receive very huge salaries.
* Males receive higher salaries than females.

Below is the relationship between the employability status and the individual categories.

In [None]:
# For gender and both school boards: cross-tabulate against placement status,
# normalise each row to proportions, and draw a stacked bar chart.
Gender = pd.crosstab(index=my_data['gender'], columns=my_data['status'])
gender_totals = Gender.sum(axis=1)
Gender.div(gender_totals, axis=0).plot.bar(stacked=True, figsize=(4,4))

ssbe = pd.crosstab(index=my_data.ssc_b, columns=my_data.status)
ssbe_totals = ssbe.sum(axis=1)
ssbe.div(ssbe_totals, axis=0).plot.bar(stacked=True, figsize=(4,4))

hsbe = pd.crosstab(index=my_data.hsc_b, columns=my_data.status)
hsbe_totals = hsbe.sum(axis=1)
hsbe.div(hsbe_totals, axis=0).plot.bar(stacked=True, figsize=(4,4))


In [None]:
# Share of each placement status within every secondary-school board.
ssbe = pd.crosstab(index=my_data.ssc_b, columns=my_data.status)
board_totals = ssbe.sum(axis=1).astype(float)
ssbe.div(board_totals, axis=0).plot.bar(stacked=True, figsize=(4,4))

* The number of males employed are higher than that of the females
* Candidates that attended institutions managed by the central board are less employed compared to the others.

In [None]:
# For each categorical feature: cross-tabulate against placement status,
# normalise each row to proportions, and draw a stacked bar chart.
hscs = pd.crosstab(my_data.hsc_s,my_data.status)
hscs.div(hscs.sum(1),axis=0).plot(kind="bar", stacked=True, figsize=(4,4))

degreeT = pd.crosstab(my_data.degree_t,my_data.status)
degreeT.div(degreeT.sum(1).astype(float), axis=0).plot(kind="bar", stacked=True, figsize=(4,4))

workX = pd.crosstab(my_data.workex,my_data.status)
workX.div(workX.sum(1), axis=0).plot(kind="bar", stacked=True, figsize=(4,4))

specialty = pd.crosstab(my_data.specialisation, my_data.status)
# `stacked` is a boolean flag; the string "True" was only working because it is truthy.
specialty.div(specialty.sum(1), axis=0).plot(kind="bar", stacked=True, figsize=(4,4))

* Most of the students that studied arts in high school are the least employed
* Students that specialised in either Comm&Mgt or Sci&Tech are mostly employed.
* Most of the candidates with work experience had job placements.
* Candidates that specialised in Marketing & Finance had more job placements than those that studied Marketing & Human Resource.

# Correlation Amongst numeric features

In [None]:
# Pairwise correlation of the numeric columns, rendered as an annotated heatmap.
numeric_columns = ['ssc_p','hsc_p','degree_p','etest_p','mba_p','salary']
mycorr = my_data[numeric_columns].corr()
sns.heatmap(mycorr, annot=True)

In [None]:
sns.lmplot(x="degree_p", y="mba_p", hue="status", data=my_data)

Candidates with very high degree percentage had very high MBA percentage and they have job placement. Candidates with very low degree percentage, usually had a very low or low MBA percentage and are not placed. 
However, there are unemployed candidates who had a high degree percentage and a high MBA percentage. There is a positive correlation between degree score and MBA percentage for both employed and unemployed candidates, but the correlation is stronger for employed candidates than for unemployed candidates. 

In [None]:
sns.lmplot(x="ssc_p", y="hsc_p", hue="status", data=my_data)

Candidates that had high percentage in secondary school had high grades in higher school and are employed whereas those that had low grades in secondary school had low grades in higher school and are not employed

In [None]:
sns.lmplot(x="etest_p", y="mba_p", hue="status", data=my_data)

There is a poor relationship between the MBA percentage score and employability test percentage.

In [None]:
sns.lmplot(x="mba_p", y="salary", hue="specialisation", data=my_data)

* Candidates that specialized in Mk&Fin gain higher salaries than those in Mk&HR

# DATA PROCESSING FOR CLASSIFICATION

In [None]:
# Split the dataset into the target vector y ("status") and the feature
# matrix X (every remaining column).
y = my_data["status"]
X = my_data.drop(columns="status")

We have to convert the categorical data to numerical values so that the models can consume it. This will be done by encoding the categorical features.

In [None]:
# One-hot encode the categorical columns that have more than two distinct values.
degreedummy = pd.get_dummies(X["degree_t"])
hscsdummy = pd.get_dummies(X["hsc_s"])

In [None]:
# Append both sets of indicator columns to the feature matrix, in the same
# left-to-right order as before (degree type first, then higher-secondary stream).
X = pd.concat([X, degreedummy, hscsdummy], axis=1)

In [None]:
# Remove the two source columns that were one-hot encoded, plus "salary":
# unplaced candidates were assigned a salary of 0 earlier in the notebook.
X = X.drop(columns=["degree_t", "hsc_s", "salary"])

In [None]:
# Integer-encode the categorical columns the notebook treats as two-valued.
from sklearn.preprocessing import LabelEncoder

# A single encoder is refit for each feature column in turn and, last, for the target.
le = LabelEncoder()
for column in ("gender", "ssc_b", "hsc_b", "workex", "specialisation"):
    X[column] = le.fit_transform(X[column])
y = le.fit_transform(y)

In [None]:
# Hold out 30% of the rows for validation; the split is stratified on y and seeded.
from sklearn.model_selection import train_test_split

split = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)
X_train, X_test, y_train, y_test = split

In [None]:
# Standardise the features: StandardScaler centres each column and scales it to
# unit variance. The statistics are learned from the training split only and
# then reused for the test split.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
X.tail()

In [None]:
X.shape, y.shape

# **MACHINE LEARNING**

**TASK:** To predict whether a candidate got hired or not

**Implementing Classification models**

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

***DecisionTreeClassifier***

In [None]:
# Decision tree baseline.
# random_state is fixed (same seed as the train/test split) so that the fitted
# tree, and every metric derived from it, is reproducible on a fresh run.
dmodel = DecisionTreeClassifier(random_state=1)

dmodel.fit(X_train,y_train)

# Predictions on both splits, used by the accuracy/report cells that follow.
y_train_predx = dmodel.predict(X_train)

y_test_predx = dmodel.predict(X_test)

In [None]:
accuracy_score(y_train_predx,y_train)

In [None]:
accuracy_score(y_test_predx,y_test)

In [None]:
print(classification_report(y_test,y_test_predx))

In [None]:
# Confusion matrix for the decision tree on the test split, printed and drawn.
cm = confusion_matrix(y_test, y_test_predx)
print(cm)

sns.heatmap(cm, annot=True, lw=2, cbar=False)
plt.ylabel("True Values")
plt.xlabel("Predicted Values")
plt.title("CONFUSSION MATRIX VISUALIZATION")
plt.show()

In [None]:
# Horizontal bar chart of the decision tree's per-feature importance scores.
importances = pd.Series(data=dmodel.feature_importances_, index=X.columns)
importances.plot.barh(figsize=(8, 6))

* The model correctly predicted 35 candidates as employed and 13 as unemployed. It made a combined 17(thus 10+7) wrong predictions.
* Secondary percentage,MBA grade ,higher school percentage influence the model's decision most.
* A precision of 79% and recall of 80%

***RandomForestClassifier***

In [None]:
# Random forest with default hyper-parameters.
# random_state is fixed (same seed as the train/test split) so that the forest,
# and every metric derived from it, is reproducible on a fresh run.
rtmodel=RandomForestClassifier(random_state=1)

rtmodel.fit(X_train,y_train)
# Predictions on both splits, used by the accuracy/report cells that follow.
y_train_predr = rtmodel.predict(X_train)
y_test_predr = rtmodel.predict(X_test)

In [None]:
accuracy_score(y_train_predr,y_train)

In [None]:
accuracy_score(y_test_predr,y_test)

In [None]:
# Confusion matrix for the default random forest on the test split, printed and drawn.
cm = confusion_matrix(y_test, y_test_predr)
print(cm)

sns.heatmap(cm, annot=True, lw=2, cbar=False)
plt.ylabel("True Values")
plt.xlabel("Predicted Values")
plt.title("CONFUSSION MATRIX VISUALIZATION")
plt.show()

In [None]:
print(classification_report(y_test,y_test_predr))

In [None]:
# Horizontal bar chart of the default random forest's per-feature importance scores.
importances = pd.Series(data=rtmodel.feature_importances_, index=X.columns)
importances.plot.barh(figsize=(8, 6))

* The model's prediction is 85% similar to the actual values.
* There were 42 true positives (correctly predicted as employed), 13 true negatives (correctly predicted as unemployed), 7 type 1 errors or false positives (wrongly predicted as employed) and 3 type 2 errors or false negatives (wrongly predicted as unemployed).
* The model had 84% precision and 85% recall.

**GridSearchCV** is used to find the best hyper-parameters for the random forest classification model.

In [None]:
#Boost for random forest

from sklearn.model_selection import GridSearchCV

In [None]:
# Search grid: max_depth over the odd values 1..19 and n_estimators over 1, 21, ..., 181.
paramgrid = {
    'max_depth': list(range(1, 20, 2)),
    'n_estimators': list(range(1, 200, 20)),
}
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=1),
    param_grid=paramgrid,
)

In [None]:
# Fit the grid search model 
grid_search.fit(X_train,y_train)

In [None]:
# Estimating the optimized value 
grid_search.best_estimator_

In [None]:
# Random forest refit with fixed hyper-parameters (depth 3, 141 trees, seed 1),
# followed by predictions on both splits.
RFCmodel = RandomForestClassifier(n_estimators=141, max_depth=3, random_state=1)
RFCmodel.fit(X_train, y_train)

y_train_predrfc = RFCmodel.predict(X_train)
y_test_predrfc = RFCmodel.predict(X_test)

In [None]:
accuracy_score(y_train_predrfc,y_train)

In [None]:
accuracy_score(y_test_predrfc,y_test)

In [None]:
# Confusion matrix for the tuned random forest on the test split, printed and drawn.
cm = confusion_matrix(y_test, y_test_predrfc)
print(cm)

sns.heatmap(cm, annot=True, lw=2, cbar=False)
plt.ylabel("True Values")
plt.xlabel("Predicted Values")
plt.title("CONFUSSION MATRIX VISUALIZATION")
plt.show()

In [None]:
print(classification_report(y_test,y_test_predrfc))

In [None]:
# Horizontal bar chart of the tuned random forest's per-feature importance scores.
importances = pd.Series(data=RFCmodel.feature_importances_, index=X.columns)
importances.plot.barh(figsize=(8, 6))

* The model recorded 82% accuracy.
* There was 82% precision and 82% recall.
* Secondary school performance influenced the model's decision the most.
* 41 true positives(correctly predicted as employed), 12 true negatives(correctly predicted as unemployed), 8 false positives(type 1 error)(wrongly predicted unemployed candidate as employed) and 4 false negatives (type 2 error)(wrongly predicted employed candidate as unemployed).

***XGBoost***

In [None]:
#pip install xgboost

In [None]:
from xgboost import XGBClassifier

In [None]:
# Fit an XGBoost classifier with default settings on the training split,
# then predict on both splits.
XGmodel = XGBClassifier()
XGmodel.fit(X_train, y_train)

y_train_predXG = XGmodel.predict(X_train)
y_test_predXG = XGmodel.predict(X_test)

In [None]:
accuracy_score(y_train_predXG,y_train)

In [None]:
accuracy_score(y_test_predXG,y_test)

In [None]:
# Confusion matrix for the default XGBoost model on the test split, printed and drawn.
cm = confusion_matrix(y_test, y_test_predXG)
print(cm)

sns.heatmap(cm, annot=True, lw=2, cbar=False)
plt.ylabel("True Values")
plt.xlabel("Predicted Values")
plt.title("CONFUSSION MATRIX VISUALIZATION")
plt.show()

In [None]:
print(classification_report(y_test,y_test_predXG))

In [None]:
# Horizontal bar chart of the default XGBoost model's per-feature importance scores.
importances = pd.Series(data=XGmodel.feature_importances_, index=X.columns)
importances.plot.barh(figsize=(8, 6))

* XGBoost model provided an accuracy score of 88%
* 88% precision and 88% recall
* Correctly predicted 40 candidates as employed and 14 as unemployed, with 11 wrong predictions.

**Hyperparameter tuning of XGBoost with Grid Search**

In [None]:
# Search space for the XGBoost grid search: tree depth 2..9, boosting rounds
# 60/100/140/180, and three learning rates.
parameters = dict(
    max_depth=range(2, 10),
    n_estimators=range(60, 220, 40),
    learning_rate=[0.1, 0.01, 0.05],
)

In [None]:
# Base XGBoost estimator whose hyper-parameters the grid search will vary.
xgb_settings = {
    'objective': 'binary:logistic',
    'nthread': 4,
    'seed': 42,
}
estimator = XGBClassifier(**xgb_settings)

In [None]:
# 10-fold grid search over `parameters`, scored by ROC AUC and run on 10 workers.
# Note: this rebinds `grid_search`, replacing the random-forest search object.
search_options = {
    'scoring': 'roc_auc',
    'n_jobs': 10,
    'cv': 10,
    'verbose': True,
}
grid_search = GridSearchCV(estimator=estimator, param_grid=parameters, **search_options)

In [None]:
# Fit the grid search model 
grid_search.fit(X_train,y_train)

In [None]:
# Estimating the optimized value 
grid_search.best_estimator_

In [None]:
# Second XGBoost model with every hyper-parameter spelled out explicitly
# (learning_rate=0.1, max_depth=5, n_estimators=180, seed 42).
# NOTE(review): the argument list looks like a pasted repr of
# grid_search.best_estimator_ — confirm. Keywords such as gpu_id and the
# duplicated n_jobs/nthread and random_state/seed pairs may not be accepted
# by every xgboost release; verify against the installed version.
XGmodel2 = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=1, monotone_constraints='()',
              n_estimators=180, n_jobs=4, nthread=4, num_parallel_tree=1,
              random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=42, subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)
# Fit on the training split, then predict on both splits for the metric cells below.
XGmodel2.fit(X_train,y_train)
y_train_predXG2 = XGmodel2.predict(X_train)
y_test_predXG2 = XGmodel2.predict(X_test)

In [None]:
accuracy_score(y_train_predXG2,y_train)

In [None]:
accuracy_score(y_test_predXG2,y_test)

In [None]:

# Confusion matrix for the tuned XGBoost model on the test split, printed and drawn.
cm = confusion_matrix(y_test, y_test_predXG2)
print(cm)

sns.heatmap(cm, annot=True, lw=2, cbar=False)
plt.ylabel("True Values")
plt.xlabel("Predicted Values")
plt.title("CONFUSSION MATRIX VISUALIZATION")
plt.show()

In [None]:
print(classification_report(y_test,y_test_predXG2))

In [None]:
# Horizontal bar chart of the tuned XGBoost model's per-feature importance scores.
importances = pd.Series(data=XGmodel2.feature_importances_, index=X.columns)
importances.plot.barh(figsize=(8, 6))

* The model predicted 40 candidates correctly as employed and 5 employed candidates wrongly as unemployed.
* 14 candidates were predicted correctly as unemployed and 6 unemployed candidates were predicted wrongly as employed.
* Predictions were made with 83% recall.
* High school performance had the most vital effect in decision making.

***LIGHTGBM***

In [None]:
from lightgbm import LGBMClassifier

In [None]:
# Fit a LightGBM classifier with default settings on the training split,
# then predict on both splits.
LGmodel = LGBMClassifier()
LGmodel.fit(X_train, y_train)

y_train_predL = LGmodel.predict(X_train)
y_test_predL = LGmodel.predict(X_test)

In [None]:
accuracy_score(y_train_predL,y_train)

In [None]:
accuracy_score(y_test_predL,y_test)

In [None]:
# Confusion matrix for the LightGBM model on the test split, printed and drawn.
cm = confusion_matrix(y_test, y_test_predL)
print(cm)

sns.heatmap(cm, annot=True, lw=2, cbar=False)
plt.ylabel("True Values")
plt.xlabel("Predicted Values")
plt.title("CONFUSSION MATRIX VISUALIZATION")
plt.show()

In [None]:
print(classification_report(y_test,y_test_predL))

In [None]:
# Horizontal bar chart of the LightGBM model's per-feature importance scores.
importances = pd.Series(data=LGmodel.feature_importances_, index=X.columns)
importances.plot.barh(figsize=(8, 6))

* The lightGBM model delivered an accuracy rate of 78%.
* 78% Recall and 78% precision scores.
* Secondary school percentage, High School percentage, MBA percentage and degree score, greatly influenced the    model's decisions.
* Correctly predicted 38 candidates as employed and 13 as unemployed, with 14 wrong predictions.


***Catboost***

In [None]:
#pip install catboost

In [None]:
from catboost import CatBoostClassifier

In [None]:
# CatBoost classifier with default hyper-parameters.
# verbose=0 silences the per-iteration training log, which would otherwise
# flood the notebook output (also when the voting ensembles refit a clone
# of this model).
CatModel = CatBoostClassifier(verbose=0)
CatModel.fit(X_train,y_train)
# Predictions on both splits, used by the accuracy/report cells that follow.
y_train_predCat = CatModel.predict(X_train)
y_test_predCat = CatModel.predict(X_test)

In [None]:
accuracy_score(y_train_predCat,y_train)

In [None]:
accuracy_score(y_test_predCat,y_test)

In [None]:
# Confusion matrix for the CatBoost model on the test split, printed and drawn.
cm = confusion_matrix(y_test, y_test_predCat)
print(cm)

sns.heatmap(cm, annot=True, lw=2, cbar=False)
plt.ylabel("True Values")
plt.xlabel("Predicted Values")
plt.title("CONFUSSION MATRIX VISUALIZATION")
plt.show()

In [None]:
print(classification_report(y_test,y_test_predCat))

In [None]:
# Horizontal bar chart of the CatBoost model's per-feature importance scores.
importances = pd.Series(data=CatModel.feature_importances_, index=X.columns)
importances.plot.barh(figsize=(8, 6))

* This model made classifications with 82% accuracy.
* 81% precision and 82% recall.
* Secondary school percentage influence the decisions most.
* The model correctly predicted 40 candidates as employed,it correctly predicted  13 candidates as unemployed. 5 candidates were wrongly predicted as employed and 7 candidates were wrongly predicted as unemployed

**ENSEMBLE LEARNING**

In [None]:
from sklearn.ensemble import VotingClassifier

In [None]:
# Soft-voting ensemble over all seven classifiers, each with equal weight.
soft_members = [
    ('dt', dmodel),
    ('rf1', rtmodel),
    ('rf2', RFCmodel),
    ('cat', CatModel),
    ('lgb', LGmodel),
    ('xgb', XGmodel),
    ('xgb2', XGmodel2),
]
eclf = VotingClassifier(estimators=soft_members, voting='soft', weights=[1] * len(soft_members))
eclf.fit(X_train, y_train)


In [None]:
y_train_pred_ens= eclf.predict(X_train)

In [None]:
y_test_pred_ens= eclf.predict(X_test)

In [None]:
accuracy_score(y_train_pred_ens,y_train)

In [None]:
accuracy_score(y_test_pred_ens,y_test)

In [None]:
# Confusion matrix for the soft-voting ensemble on the test split, printed and drawn.
cm = confusion_matrix(y_test, y_test_pred_ens)
print(cm)

sns.heatmap(cm, annot=True, lw=2, cbar=False)
plt.ylabel("True Values")
plt.xlabel("Predicted Values")
plt.title("CONFUSSION MATRIX VISUALIZATION")
plt.show()

In [None]:
print(classification_report(y_test,y_test_pred_ens))

* Ensemble learning (soft voting) produced a recall of 85%; that is, 85% of all actual positive cases were predicted correctly.
* Precision was 84%; that is, of all cases predicted as positive, 84% were actually positive.
* 41 candidates were correctly predicted as employed, 4 were wrongly predicted as unemployed.
* 14 candidates were correctly predicted as unemployed, while 6 were unemployed but the model predicted them as employed.
* The model made predictions with 85% accuracy across both classes.

In [None]:
# Hard-voting (majority) ensemble over all seven classifiers, each with equal weight.
hard_members = [
    ('dt', dmodel),
    ('rf1', rtmodel),
    ('rf2', RFCmodel),
    ('cat', CatModel),
    ('lgb', LGmodel),
    ('xgb', XGmodel),
    ('xgb2', XGmodel2),
]
eclf2 = VotingClassifier(estimators=hard_members, voting='hard', weights=[1] * len(hard_members))
eclf2.fit(X_train, y_train)

In [None]:
y_train_pred_ens2 = eclf2.predict(X_train)

In [None]:
y_test_pred_ens2 = eclf2.predict(X_test)

In [None]:
accuracy_score(y_train_pred_ens2,y_train)

In [None]:
accuracy_score(y_test_pred_ens2,y_test)

In [None]:
# Confusion matrix for the hard-voting ensemble on the test split, printed and drawn.
cm = confusion_matrix(y_test, y_test_pred_ens2)
print(cm)

sns.heatmap(cm, annot=True, lw=2, cbar=False)
plt.ylabel("True Values")
plt.xlabel("Predicted Values")
plt.title("CONFUSSION MATRIX VISUALIZATION")
plt.show()

In [None]:
print(classification_report(y_test,y_test_pred_ens2))

* Ensemble learning (hard voting) produced a recall of 82%; that is, 82% of all actual positive cases were predicted correctly.
* Precision was 81%; that is, of all cases predicted as positive, 81% were actually positive.
* 40 candidates were correctly predicted as employed, 7 were wrongly predicted as unemployed.
* 13 candidates were correctly predicted as unemployed, while 5 were unemployed but the model predicted them as employed.
* The model made predictions with 82% accuracy across both classes.

# CONCLUSION

* Gender does not influence your chances of getting hired.
* Academic grades play a massive role in your chances of getting hired. Very high grades increases your chances of getting hired.
* Candidates that specialised in Marketing and Finance relatively gain higher salaries than their counterparts in Marketing and Human Resource.
* Employability test performance does not enhance your chances of getting hired.

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation,Dropout
from tensorflow.keras.constraints import max_norm

In [None]:
# Feed-forward binary classifier: three ReLU layers (16, 39 and 19 units), each
# followed by 20% dropout, and a single sigmoid output unit.
model = Sequential([
    # first (input-facing) layer
    Dense(16, activation='relu'),
    Dropout(0.2),
    # hidden layer
    Dense(39, activation='relu'),
    Dropout(0.2),
    # hidden layer
    Dense(19, activation='relu'),
    Dropout(0.2),
    # output layer
    Dense(units=1, activation='sigmoid'),
])

# Binary cross-entropy loss optimised with Adam.
model.compile(optimizer='adam', loss='binary_crossentropy')

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop training once the validation loss has not decreased for 25 consecutive epochs.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=25,
    mode='min',
    verbose=1,
)

In [None]:
# Train for at most 500 epochs in batches of 100, holding out 20% of the
# training rows for validation; early stopping monitors that validation loss.
fit_options = {
    'epochs': 500,
    'batch_size': 100,
    'validation_split': 0.2,
    'callbacks': [early_stop],
}
model.fit(x=X_train, y=y_train, **fit_options)

In [None]:
#from tensorflow.keras.models import load_model
#model.save('project_model.h5')  

In [None]:
# Plot the per-epoch metrics recorded in the model's training history as line curves.
model_loss = pd.DataFrame(data=model.history.history)
model_loss.plot.line()

In [None]:
# Sequential.predict_classes was removed from Keras (TensorFlow 2.6+). For a
# single sigmoid output, thresholding the predicted probability at 0.5 gives
# the same labels; ravel() flattens the (n, 1) output to a 1-D label vector.
predictions = (model.predict(X_test) > 0.5).astype("int32").ravel()

In [None]:
# Confusion matrix for the neural network on the test split, printed and drawn.
cm = confusion_matrix(y_test, predictions)
print(cm)

sns.heatmap(cm, annot=True, lw=2, cbar=False)
plt.ylabel("True Values")
plt.xlabel("Predicted Values")
plt.title("CONFUSSION MATRIX VISUALIZATION")
plt.show()

In [None]:
print(classification_report(y_test,predictions))