In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import matplotlib.pyplot as plt

In [None]:
def plot_decision_boundaries(X, y, model_class, **model_params):
    """Fit a classifier on the first two columns of ``X`` and plot its decision regions.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix. Only columns 0 and 1 are used, both for fitting and plotting.
    y : array-like
        Class labels; flattened to 1-D before use.
    model_class : callable
        Called as ``model_class(**model_params)``; the returned object must provide
        ``fit`` and ``predict``.
    **model_params
        Keyword arguments forwarded unchanged to ``model_class``.

    Raises
    ------
    ValueError
        If ``X`` is not 2-D or has fewer than two columns.
    """
    # Let coercion errors propagate instead of printing and continuing with bad data.
    X = np.asarray(X)
    y = np.array(y).flatten()
    if X.ndim != 2 or X.shape[1] < 2:
        raise ValueError("X must be a 2-D array with at least two feature columns")

    # Only the first two columns are used so the boundary can be drawn in 2-D.
    reduced_data = X[:, :2]

    # Instantiate the model and fit it on the reduced data with the original labels.
    model = model_class(**model_params)
    model.fit(reduced_data, y)

    # Integer-encode the labels in sorted order ('Not Placed' -> 0, 'Placed' -> 1)
    # so that both the scatter colours and the contour levels are numeric.
    classes, y_codes = np.unique(y, return_inverse=True)
    y_codes = np.asarray(y_codes).reshape(-1)

    # Mesh covering the data range with a margin of 1 and a step of 0.1.
    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                         np.arange(y_min, y_max, 0.1))

    # Predict a label for every mesh point and encode it with the same codes as y.
    grid_labels = model.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = np.zeros(grid_labels.shape, dtype=float)
    for code, label in enumerate(classes):
        Z[grid_labels == label] = code
    Z = Z.reshape(xx.shape)

    # Plotting
    plt.contourf(xx, yy, Z, alpha=0.4)
    points = plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=y_codes, alpha=0.8)
    plt.xlabel("ssc_p",fontsize=15)
    plt.ylabel("hsc_p",fontsize=15)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    # One legend entry per class, bound to the colour actually used for that class.
    handles, _ = points.legend_elements()
    plt.legend(handles, [str(label) for label in classes])
    plt.show()



# Problem Statement
The college placement cell has decided to organize a special training programme in partnership with a placement training school. However, due to limited funds, the college has decided to make it compulsory only for those students who are likely not to get placed.
OBJECTIVES: 
Detect a list of people who will not get placed.
Which factor influenced a candidate in getting placed?
Does percentage matter for one to get placed?

# Data Overview

In [None]:
# Load the placement dataset from the Kaggle input directory; the 'sl_no' column is dropped right away.
data=pd.read_csv("/kaggle/input/factors-affecting-campus-placement/Placement_Data_Full_Class.csv").drop('sl_no',axis=1)

In [None]:
data.head()

In [None]:
data.info()

From the above information we know that salary has null values. Salary is only known once a person is placed (it depends on the target), and hence it isn't needed as a feature.

In [None]:
# Encode the status labels as 0 ("Not Placed") / 1 ("Placed") on a copy, so `data` itself is not modified.
c1=data['status'].values.copy()
c1[c1=="Not Placed"]=0
c1[c1=="Placed"]=1
# Scatter of hsc_p (x) against ssc_p (y); marker size comes from 'degree_p', colour from the encoded status.
data.plot(kind='scatter',y='ssc_p',x='hsc_p',s='degree_p',c=c1,cmap=plt.get_cmap("jet"), colorbar=True)
plt.legend(["Not Placed","Placed"])
plt.show()

In [None]:
data.describe()

# Data Imbalance 
The output class is imbalanced: only 31% of the samples belong to the negative class ("Not Placed") and 69% to the positive class ("Placed"). Since the negative class is the one that matters to us, our metric should be chosen keeping the "Not Placed" class in mind.

In [None]:
data["status"].value_counts()/len(data)

# Stakeholders(highlight for which you will be working)
1. Students
2. Placement Cell
3. Training School
4. College 
5. Recruiters

# Business metric
Our aim is to segment the student list into "Placed" and "Not Placed" such that "Not Placed" students are not wrongly categorised as "Placed".

# Data science metric 
True Negative Rate is the metric used to evaluate model performance.
It is given by:
$$\mathrm{TNR} = \frac{\text{True Negative}}{\text{True Negative} + \text{False Positive}}$$
We choose TNR because we need to find all those students who will not get placed. For this problem statement it is acceptable to get a few False Negatives, i.e. a "Placed" student may be classified as "Not Placed", but the opposite error (a "Not Placed" student classified as "Placed") is dangerous. Also, since the negative class is the minority class, our model evaluation should favour performance on the negative class.

In [None]:
# Display the confusion-matrix illustration stored as an image in the notebook's input directory.
import matplotlib.pyplot as plt
plt.imshow(plt.imread("../input/confusion/Confusion.png"))
plt.show()

In [None]:
from sklearn.metrics import make_scorer,confusion_matrix

In [None]:
def custom(x,y,beta=2):
    """Return the true negative rate of predictions ``y`` against true labels ``x``.

    The negative class is the one in row/column 0 of ``confusion_matrix(x, y)``,
    i.e. the first label in sorted order ('Not Placed' for this notebook's labels).

    Parameters
    ----------
    x : array-like
        True labels.
    y : array-like
        Predicted labels.
    beta : float, optional
        Unused; kept only so existing callers that pass it keep working.

    Returns
    -------
    float
        ``cm[0, 0] / cm[0, :].sum()``, or ``0.0`` when row 0 is empty
        (no sample of the negative class among the true labels).
    """
    # Build the matrix once instead of once per term.
    cm = confusion_matrix(x, y)
    actual_negatives = np.sum(cm[0, :])
    if actual_negatives == 0:
        # Avoid a 0/0 NaN that would poison cross-validation rankings.
        return 0.0
    return cm[0, 0] / actual_negatives
    
TNR = make_scorer(custom,greater_is_better=True)

## Stratified Train-Test Split
Stratified Train Test Split keeps the ratio of output classes same in train and test sets.

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold

In [None]:
# A single stratified shuffle split that holds out 15% of the rows; seeded so the split is reproducible.
strat=StratifiedShuffleSplit(n_splits=1,test_size=0.15,random_state=42)

In [None]:
# split() yields positional row indices, so rows must be selected with .iloc;
# .loc would only be correct while the frame happens to carry a default RangeIndex.
for train_index, test_index in strat.split(data,data['status']):
    strat_train=data.iloc[train_index]
    strat_test=data.iloc[test_index]

Below are the ratios of "Placed" category in train and test sets

In [None]:
# Share of "Placed" rows in each subset; the two values should match closely after a stratified split.
print("Train_Placed ",strat_train["status"].value_counts()["Placed"]/len(strat_train["status"]))
print("Test_Placed ",strat_test["status"].value_counts()["Placed"]/len(strat_test["status"]))

Splitting the status attribute (the target) from the feature data, and removing the salary attribute from the feature data.

In [None]:
# Features: every column except the target ('status') and 'salary'.
# Targets: the raw 'status' labels as NumPy arrays.
train_X = strat_train.drop(columns=['status', 'salary'])
train_Y = strat_train["status"].to_numpy()
test_X = strat_test.drop(columns=['status', 'salary'])
test_Y = strat_test["status"].to_numpy()

# Data Cleaning And Preparing for the model.

In [None]:
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.preprocessing import LabelBinarizer,StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline,FeatureUnion

ATTRIBUTE LIST: ['sl_no', 'gender', 'ssc_p', 'ssc_b', 'hsc_p', 'hsc_b', 'hsc_s',
       'degree_p', 'degree_t', 'workex', 'etest_p', 'specialisation', 'mba_p']

In [None]:
# Column names routed to the numerical pipeline.
numerical_data=['ssc_p','hsc_p','degree_p','etest_p','mba_p']
# Column names routed to the categorical pipeline.
categroical_data=['gender', 'ssc_b', 'hsc_b', 'hsc_s', 'degree_t', 'workex','specialisation']


## Preprocessing Pipeline
### 1. Numerical Data Pipeline
    1.1. Numerical data is extracted out of the data
    1.2. Data is standard Scaled ((x-mean)/std)
### 2. Categorical Data Pipeline
    2.1. Categorical data is extracted out of the data
    2.2. Data is transformed into one hot encoding(Binarizing) 
### 3. Feature Union
    Both pipelines are merged such that the transformed numerical and categorical data are horizontally stacked.

* **DataFrame_selector**: this class splits numerical and categorical attributes
* **CustomLabelBinarizer**: Transforms data into one hot vectors
* **StandardScaler**: Standardises the numerical data ((x-mean)/std)

In [None]:
class DataFrame_selector(BaseEstimator,TransformerMixin):
    """Pipeline step that picks a fixed list of columns and returns them as a NumPy array."""

    def __init__(self,column_list):
        # Names of the columns to extract in transform().
        self.column_list=column_list

    def fit(self,X,y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self,X):
        """Return the values of the configured columns of ``X``."""
        selected = X[self.column_list]
        return selected.values
class CustomLabelBinarizer(BaseEstimator, TransformerMixin):
    """Binarize every column of a 2-D array with its own ``LabelBinarizer``.

    ``fit`` learns one encoder per column and ``transform`` reuses those encoders,
    so data transformed later is encoded with the categories seen during ``fit``.
    The per-column outputs are stacked horizontally in column order.

    Parameters
    ----------
    sparse_output : bool, default False
        Forwarded to each ``LabelBinarizer``.
    """
    def __init__(self, sparse_output=False):
        self.sparse_output = sparse_output

    def _fit_encoders(self, X):
        # One fitted LabelBinarizer per column of X.
        return [
            LabelBinarizer(sparse_output=self.sparse_output).fit(X[:, i])
            for i in range(X.shape[1])
        ]

    def fit(self, X, y=None):
        """Learn the categories of each column of ``X``; returns ``self``."""
        self.encoders_ = self._fit_encoders(np.asarray(X))
        return self

    def transform(self, X, y=None):
        """Encode each column of ``X`` and stack the results horizontally.

        Raises
        ------
        ValueError
            If ``X`` has a different number of columns than the data seen in ``fit``.
        """
        X = np.asarray(X)
        encoders = getattr(self, "encoders_", None)
        if encoders is None:
            # Backward compatibility: calling transform() without fit() used to
            # encode X from its own categories, so keep doing that.
            encoders = self._fit_encoders(X)
        if len(encoders) != X.shape[1]:
            raise ValueError(
                "X has %d columns but the binarizer was fitted on %d"
                % (X.shape[1], len(encoders))
            )
        if not encoders:
            # No columns to encode: return an empty block with the right row count.
            return np.empty((X.shape[0], 0))
        return np.hstack([enc.transform(X[:, i]) for i, enc in enumerate(encoders)])

### Numerical Pipeline

In [None]:
# Numerical branch: select the numerical columns, then standardise them with StandardScaler.
Numerical_pipeline=Pipeline([
    ('df_selector',DataFrame_selector(numerical_data)),
    ('StandardScaler',StandardScaler())
])
# Fits the pipeline on train_X and displays the shape of the transformed array.
Numerical_pipeline.fit_transform(train_X).shape

### Categorical Pipeline

In [None]:
# Categorical branch: select the categorical columns, then binarize each one with CustomLabelBinarizer.
Categorical_pipeline=Pipeline([
    ('df_selector',DataFrame_selector(categroical_data)),
    ('binary',CustomLabelBinarizer(sparse_output=False))
])
# Fits the pipeline on train_X and displays the shape of the transformed array.
Categorical_pipeline.fit_transform(train_X).shape

### Feature Union

In [None]:
# Combine both branches; the numerical output comes first, followed by the categorical output.
final=FeatureUnion(transformer_list=[
    ('numerical',Numerical_pipeline),
    ('categorical',Categorical_pipeline)
])
# Fits the union on train_X and displays the shape of the combined feature matrix.
final.fit_transform(train_X).shape

## Pre-processing train and test data

### Train data transformed by our Pipeline

In [None]:
# Fit the preprocessing union on the training features and keep the transformed matrix.
proccessed_train=final.fit_transform(train_X)

### Test data transformed by our Pipeline

In [None]:
# Only transform here: `final` was fitted on train_X in the previous cell. Calling
# fit_transform would re-estimate the scaling statistics from the test set itself.
proccessed_test=final.transform(test_X)

In [None]:
# Preprocess the full data set with a pipeline fitted on the training features only,
# so FINAL_X is scaled consistently with the features the models are trained on.
# The explicit fit(train_X) keeps this cell correct regardless of what `final`
# was last fitted on.
FINAL_X=final.fit(train_X).transform(data.drop(['status','salary'],axis=1))
FINAL_Y=data["status"].values

# Modeling and Model Selection


## We will train 
1. Random Forest Classifier
2. Decision Tree Classifier
3. Logistic Regression
4. K Nearest Neighbors Classifier
5. Gaussian Naive Bayes
6. Support Vector Machine Classifier
#### We will tune the hyperparameters using 10-fold cross-validation and choose the hyperparameters that yield the best validation score for each model. After getting the best model for each classifier, we predict on the test set to get the final performance of the model. The best-performing model on the TNR metric will be chosen. We use grid search for hyperparameter tuning.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV


In [None]:
# random_state only has an effect when shuffle=True; recent scikit-learn releases
# raise a ValueError if random_state is set while shuffle is left at False.
split=StratifiedKFold(n_splits=10,shuffle=True,random_state=42)

## Logistic Regression

In [None]:
# Logistic regression with default settings.
lgr=LogisticRegression()

# Empty grid: GridSearchCV only cross-validates the default model, scored with TNR on the `split` folds.
param_grid = [{}]
grid_search = GridSearchCV(lgr, param_grid, cv=split,scoring=TNR)
lgr_model=grid_search.fit(X=proccessed_train,y=train_Y)

## Decision Tree

In [None]:
# Decision tree with balanced class weights and random splits; seeded for reproducibility.
DTC=DecisionTreeClassifier(random_state=1,class_weight="balanced",splitter='random')
# "auto" was dropped from max_features: for classifiers it was an alias of "sqrt"
# and it is no longer accepted by recent scikit-learn releases.
param_grid = [
    {'max_depth':[1,2,3,4],'max_features':["sqrt", "log2"],'criterion':["gini", "entropy"]}
  ]
grid = GridSearchCV(DTC, param_grid, cv=split,scoring=TNR)
DTC_model=grid.fit(X=proccessed_train,y=train_Y)
DTC_model.best_estimator_

In [None]:
import matplotlib.pyplot as plt
from sklearn import tree 
# Feature names in the column order produced by the preprocessing union: the numerical
# columns first, then the binarized categorical columns. LabelBinarizer orders the
# categories of a multi-class column alphabetically, so hsc_s expands to
# Arts/Commerce/Science and degree_t to Comm&Mgmt/Others/Sci&Tech.
cl=['ssc_p','hsc_p','degree_p','etest_p','mba_p']+['gender', 'ssc_b', 'hsc_b', 'Arts','Commerce','Science', 'Comm&Mgmt','Others','Sci&Tech','workex','specialisation']
fig, axes = plt.subplots(nrows = 1,ncols = 1,figsize = (10,10), dpi=100)
# class_names must follow the estimator's own (sorted) class order, so read it from
# classes_ instead of hard-coding it ('Not Placed' sorts before 'Placed').
tree.plot_tree(DTC_model.best_estimator_,class_names=[str(c) for c in DTC_model.best_estimator_.classes_],feature_names=cl,ax=axes)
fig.savefig('imagename.png')

## Gaussian Naive Bayes

In [None]:
# Gaussian Naive Bayes; only var_smoothing is tuned.
GNB=GaussianNB()
param_grid = [
    {'var_smoothing':[5,4,3,2.15,2.1,2,1,1e-2,1e-6,1e-7,1e-8,1e-9,1e-10]}]
# Use the shared `split` folds (instead of cv=10) so every model in the notebook
# is cross-validated on exactly the same folds.
grid = GridSearchCV(GNB, param_grid, cv=split,scoring=TNR)
GNB_model=grid.fit(X=proccessed_train,y=train_Y)
GNB_model.best_estimator_

In [None]:
# Decision regions of a fresh GaussianNB (best grid-search parameters) refitted by the helper on the first two columns of FINAL_X.
plot_decision_boundaries(FINAL_X, FINAL_Y, GaussianNB,**GNB_model.best_params_)

## Random Forest Classifier

In [None]:
RFC=RandomForestClassifier()
# "auto" was dropped from max_features: for classifiers it was an alias of "sqrt"
# and it is no longer accepted by recent scikit-learn releases.
# random_state is part of the grid so the selected forest is reproducible.
param_grid = [
    {'max_depth':[1,2,3],'n_estimators':[6,7,8],'criterion' : ["gini", "entropy"],'max_features':["sqrt", "log2"],'random_state':[1],'class_weight' : ["balanced", "balanced_subsample",None]}]
grid = GridSearchCV(RFC, param_grid, cv=split,scoring=TNR)
RFC_model=grid.fit(X=proccessed_train,y=train_Y)
RFC_model.best_estimator_

In [None]:
# Decision regions of a fresh RandomForestClassifier (best grid-search parameters) refitted by the helper on the first two columns of FINAL_X.
plot_decision_boundaries(FINAL_X, FINAL_Y, RandomForestClassifier,**RFC_model.best_params_)

## K Nearest Neighbors Classifier

In [None]:
# K-nearest-neighbours classifier; the grid covers neighbour count, leaf size, Minkowski power p, weighting and search algorithm.
KNC=KNeighborsClassifier()
param_grid = [{'n_neighbors':[2,3,4],'leaf_size':[1,10],'p':[1,2,3,4],'weights':['uniform', 'distance'],'algorithm':['auto', 'ball_tree', 'kd_tree']}]
# Cross-validated on the shared `split` folds and scored with the TNR scorer.
grid = GridSearchCV(KNC, param_grid, cv=split,scoring=TNR)
KNC_model=grid.fit(X=proccessed_train,y=train_Y)
KNC_model.best_estimator_

In [None]:
from sklearn.inspection import permutation_importance

# Permutation importance of every preprocessed feature for the tuned KNN model,
# measured with the TNR scorer on the full data set.
results = permutation_importance(KNC_model, FINAL_X, FINAL_Y, scoring=TNR)
# Mean importance per feature across the permutation repeats
importance = results.importances_mean
# One line per feature: its column position and its mean importance
for i, v in enumerate(importance):
    print(f"Feature: {i:0d}, Score: {v:.5f}")

In [None]:
# Decision regions of a fresh KNeighborsClassifier (best grid-search parameters) refitted by the helper on the first two columns of FINAL_X.
plot_decision_boundaries(FINAL_X, FINAL_Y, KNeighborsClassifier,**KNC_model.best_params_)

## Support Vector Machine Classifier

In [None]:
# probability=True is required later so the SVC can take part in soft voting.
svc=SVC(probability=True)
# 'degree' is not searched: it only affects the 'poly' kernel, which is not in the
# grid, so varying it just repeated identical fits five times.
param_grid = [{'C':[0.1,0.5,1,2,3,3.5,4],'kernel':['linear', 'rbf'],'gamma' : ['scale', 'auto'],'class_weight' : [None,'balanced']}]
grid = GridSearchCV(svc, param_grid, cv=split,scoring=TNR)
SVC_model=grid.fit(X=proccessed_train,y=train_Y)
SVC_model.best_estimator_

In [None]:
# Decision regions of a fresh SVC (best grid-search parameters) refitted by the helper on the first two columns of FINAL_X.
plot_decision_boundaries(FINAL_X, FINAL_Y, SVC,**SVC_model.best_params_)

In [None]:
from sklearn.ensemble import VotingClassifier

In [None]:
# Soft-voting ensemble built from the best estimator found by each grid search.
voting_clf=VotingClassifier(
    estimators=[('lr',lgr_model.best_estimator_),
            ('GNB',GNB_model.best_estimator_),
            ('DTC',DTC_model.best_estimator_),
            ('RFC',RFC_model.best_estimator_),
            ('KNN',KNC_model.best_estimator_),
            ('SVC',SVC_model.best_estimator_)
           ],
voting='soft'
)

# Fit the ensemble on the preprocessed training data.
voting_model=voting_clf.fit(proccessed_train,train_Y)

# Accuracy of Model 

## Scoring on the Test Set

In [None]:
# The GridSearchCV objects were built with scoring=TNR, so their .score() reports TNR.
# VotingClassifier.score() would report plain accuracy instead, so the ensemble is
# scored through the same TNR scorer to keep all seven numbers comparable.
print("LOGISTIC REGRESSION CLASSIFIER    ",lgr_model.score(proccessed_test,test_Y))
print("GAUSSIAN NAIVE BAYES CLASSIFIER   ",GNB_model.score(proccessed_test,test_Y))
print("DECISION TREE CLASSIFIER          ",DTC_model.score(proccessed_test,test_Y))
print("RANDOM FOREST CLASSIFIER          ",RFC_model.score(proccessed_test,test_Y))
print("K NEAREST NEIGHBORS CLASSIFIER    ",KNC_model.score(proccessed_test,test_Y))
print("SUPPORT VECTOR MACHINE CLASSIFIER ",SVC_model.score(proccessed_test,test_Y))
print("VOTING                            ",TNR(voting_model,proccessed_test,test_Y))

In [None]:
# Test-set TNR of every model. The GridSearchCV objects score with TNR through
# .score(); the voting ensemble is scored through the same TNR scorer because its
# own .score() would return accuracy and make the bars incomparable.
y=[
    lgr_model.score(proccessed_test,test_Y),
    GNB_model.score(proccessed_test,test_Y),
    DTC_model.score(proccessed_test,test_Y),
    RFC_model.score(proccessed_test,test_Y),
    KNC_model.score(proccessed_test,test_Y),
    SVC_model.score(proccessed_test,test_Y),
    TNR(voting_model,proccessed_test,test_Y)
]

x=[
    "LOGISTIC REGRESSION CLASSIFIER","GAUSSIAN NAIVE BAYES CLASSIFIER","DECISION TREE CLASSIFIER",
    "RANDOM FOREST CLASSIFIER",
    "K NEAREST NEIGHBORS CLASSIFIER",
    "SUPPORT VECTOR MACHINE CLASSIFIER",
    "VOTING CLASSIFIER (SOFT VOTING)"
]

fig = plt.figure(figsize=[18,10])
ax = fig.add_axes([0,0,1,1])
ax.bar(x,y,width=0.5)
# Label the chart so it can be read on its own.
ax.set_ylabel("True Negative Rate (test set)")
ax.set_title("Test-set TNR per model")
plt.show()

# Global score

## Let's check how well the models have generalised

In [None]:
# Score of every fitted model on the complete data set (FINAL_X / FINAL_Y),
# computed with the `custom` metric and printed one model per line.
for _tag, _fitted in [
    ("LOR ", lgr_model),
    ("GNB ", GNB_model),
    ("DTC ", DTC_model),
    ("RFC ", RFC_model),
    ("KNC ", KNC_model),
    ("SVC ", SVC_model),
    ("VOTING ", voting_model),
]:
    print(_tag, custom(FINAL_Y, _fitted.predict(FINAL_X)))

In [None]:
# Bar heights: the `custom` metric of each fitted model on the complete data set,
# in the same order as the labels in `x`.
y = [
    custom(FINAL_Y, fitted.predict(FINAL_X))
    for fitted in (
        lgr_model,
        GNB_model,
        DTC_model,
        RFC_model,
        KNC_model,
        SVC_model,
        voting_model,
    )
]
# Bar labels, one per model.
x = [
    "LOGISTIC REGRESSION CLASSIFIER",
    "GAUSSIAN NAIVE BAYES CLASSIFIER",
    "DECISION TREE CLASSIFIER",
    "RANDOM FOREST CLASSIFIER",
    "K NEAREST NEIGHBORS CLASSIFIER",
    "SUPPORT VECTOR MACHINE CLASSIFIER",
    "VOTING CLASSIFIER (SOFT VOTING)",
]

# One axes filling the whole 18x10 figure.
fig = plt.figure(figsize=[18, 10])
ax = fig.add_axes([0, 0, 1, 1])
ax.bar(x, y, width=0.5)
plt.show()

# K Nearest Neighbors Classifier is best performing model and we select it with hyper parameters:

In [None]:
KNC_model.best_params_