# Introduction  
## About the Dataset
This dataset consists of placement data of students at Jain University, Bangalore. It includes secondary and higher secondary school percentages and specialization, as well as degree specialization, degree type, work experience, and the salary offered to the placed students.

# Problem Statement
**To predict whether or not a candidate will be placed (i.e. employed) on the basis of his/her Secondary %, Higher Secondary %, Undergraduate Degree %, MBA % and Employability Test %**

# The Methodology
1. **Loading and Cleaning + Preprocessing the Data**
2. **Exploratory Data Analysis(EDA)**
    * Pairplot
    * Clustermap to Visualize Correlation
    * Bubble Plot(s)
    * Tree Chart
    * Pie Chart
    * Histogram
    * Trendline
    * Violin Plot
    * Swarm Plot
3. **Predictive Modelling**
    * Logistic Regression
    * Naive Bayes Classifier
    * Random Forest Classifier
    * Support Vector Machine(SVM) Classifier
    * Deep Neural Network(DNN)


# Importing Relevant Libraries

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
sns.set()

# Loading the Data

In [None]:
data_path = '../input/factors-affecting-campus-placement/Placement_Data_Full_Class.csv'

In [None]:
raw_csv_data = pd.read_csv(data_path)
raw_csv_data

# Preprocessing the Data

In [None]:
df_comp = raw_csv_data.copy()

In [None]:
df_comp.isna().sum()

In [None]:
# Impute missing salaries with the mean salary. Only the `salary` column is
# touched: a frame-wide fillna would also write the salary mean into any other
# column that happened to contain missing values.
df_comp['salary'] = df_comp['salary'].fillna(df_comp['salary'].mean())

#### Dropping sl_no as it is an insignificant feature

In [None]:
del df_comp['sl_no']

In [None]:
df_comp

# Exploratory Data Analysis

In [None]:
# Class balance of the target. The column is passed by keyword because recent
# seaborn releases do not accept a data vector as the first positional argument.
sns.countplot(x='status', data=df_comp)

## Pairplot

In [None]:
sns.pairplot(df_comp)

## Correlation between Salary, Employability Test %, Secondary Education % and Higher Secondary Education %

In [None]:
# Pairwise correlations of the selected columns, drawn as a clustered heatmap.
corr_columns = ["hsc_p", "ssc_p", 'etest_p', "salary"]
cor = df_comp.loc[:, corr_columns]
sns.clustermap(
    cor.corr(),
    center=0,
    cmap="vlag",
    linewidths=.75,
    figsize=(10, 5),
)

##  Bubble Plot

In [None]:
!pip install bubbly
!pip install chart_studio

## Gender + Employability Test % v/s Salary

In [None]:
from bubbly.bubbly import bubbleplot
from plotly.offline import iplot
import chart_studio.plotly as py

# Bubble chart over the first 30 rows: employability-test % against salary,
# with bubbles grouped and coloured by gender and sized by salary.
df_comp_bp = df_comp.head(30)
gender_bubble_options = dict(
    x_column='etest_p',
    y_column='salary',
    bubble_column='gender',
    size_column='salary',
    color_column='gender',
    x_logscale=True,
    scale_bubble=2,
    height=350,
)
figure = bubbleplot(dataset=df_comp_bp, **gender_bubble_options)

iplot(figure)

## Specialisation + Employability Test v/s Salary

In [None]:
from bubbly.bubbly import bubbleplot
from plotly.offline import iplot
import chart_studio.plotly as py

# Same chart as the gender version, but bubbles are grouped and coloured by
# specialisation. Still limited to the first 30 rows.
df_comp_bp = df_comp.head(30)
grouping_column = 'specialisation'
figure = bubbleplot(
    dataset=df_comp_bp,
    x_column='etest_p',
    y_column='salary',
    bubble_column=grouping_column,
    size_column='salary',
    color_column=grouping_column,
    x_logscale=True,
    scale_bubble=2,
    height=350,
)

iplot(figure)

## TreeChart

In [None]:
import plotly.express as px

## Specialization v/s Salary

In [None]:
# Mean salary for every (hsc_b, specialisation) pair, shown as a two-level treemap.
mean_salary = df_comp.groupby(["hsc_b","specialisation"])[["salary"]].mean()
df_tree = mean_salary.reset_index()

fig = px.treemap(
    df_tree,
    path=['hsc_b','specialisation'],
    values='salary',
    color='salary',
    hover_data=['specialisation'],
    color_continuous_scale='rainbow',
)
fig.show()

## Work Experience + Undergraduate Degree v/s Salary

In [None]:
# Mean salary for every (workex, degree_t) pair, shown as a two-level treemap.
tree_levels = ["workex", "degree_t"]
df_tree = df_comp.groupby(tree_levels)[["salary"]].mean().reset_index()

treemap_options = {
    "path": ['workex','degree_t'],
    "values": 'salary',
    "color": 'salary',
    "hover_data": ['degree_t'],
    "color_continuous_scale": 'rainbow',
}
fig = px.treemap(df_tree, **treemap_options)
fig.show()

## Gender + Undergraduate Degree v/s Employability Test %

In [None]:
df_tree_1 = df_comp.copy()
df_tree_1['status'] = df_tree_1['status'].map({'Placed':1, 'Not Placed':0})

In [None]:
# Mean employability-test % for every (gender, degree_t) pair, as a treemap.
grouped_etest = df_tree_1.groupby(["gender","degree_t"])[["etest_p"]]
df_tree = grouped_etest.mean().reset_index()

fig = px.treemap(
    df_tree,
    path=['gender','degree_t'],
    values='etest_p',
    color='etest_p',
    hover_data=['degree_t'],
    color_continuous_scale='rainbow',
)
fig.show()

## Pie Chart

## Gender v/s Salary

In [None]:
# Mean salary per gender, rendered as shares of a pie.
mean_salary_by_gender = df_comp.groupby(["gender"])[["salary"]].mean()
df_pie = mean_salary_by_gender.reset_index()

fig = px.pie(df_pie, values="salary", names="gender", template="seaborn")
fig.update_traces(
    rotation=90,
    pull=0.05,
    textinfo="percent+label",
)
fig.show()

## Histogram

## Gender + Degree% v/s Count of Status(# placed/not-placed)

In [None]:
# Number of status entries per degree-% bin, split by gender. `status` holds
# text labels, so the aggregation is set to "count" explicitly instead of
# relying on the default that applies once `y` is supplied.
fig = px.histogram(df_comp, x="degree_p", y="status", color="gender", histfunc="count")
fig.show()

## Trendline

## Degree% v/s Salary

In [None]:
fig = px.scatter(df_comp, x="degree_p", y="salary", trendline="ols")
fig.show()

## Employability Test % v/s Salary

In [None]:
fig = px.scatter(df_comp, x="etest_p", y="salary", trendline="ols")
fig.show()

## Violin Plot

In [None]:
# Salary distribution per degree type, with a separate violin per specialisation.
plt.figure(figsize=(10, 6))
violin_options = {
    "x": "degree_t",
    "y": "salary",
    "hue": "specialisation",
    "palette": "muted",
}
ax = sns.violinplot(data=df_comp, **violin_options)

## Swarm Plot

## Gender v/s Salary

In [None]:
ax = sns.swarmplot(x="gender", y="salary", data= df_comp)

## Work Experience v/s Salary

In [None]:
ax = sns.swarmplot(x="workex", y="salary", data=df_comp)

In [None]:
df = df_comp.copy()

# Logistic Regression

In [None]:
import statsmodels.api as sm
from scipy import stats
from sklearn.model_selection import train_test_split
# Install `chisqprob` on scipy.stats as the chi-squared survival function.
# NOTE(review): presumably a compatibility shim for statsmodels releases that
# still look up scipy.stats.chisqprob — confirm it is still needed.
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)

### Preparing Data for Logistic Regression

In [None]:
df_log = df.copy()

In [None]:
df['hsc_b'].unique()

In [None]:
# Replace each categorical column with integer codes. Columns are assigned by
# key rather than by attribute so that a misspelt column name raises a KeyError
# instead of silently creating a stray attribute and leaving the real column
# unencoded (which is what happened to `specialisation`).
category_codes = {
    'ssc_b': {'Others': 1, 'Central': 0},
    'hsc_b': {'Others': 1, 'Central': 0},
    'hsc_s': {'Arts': 2, 'Commerce': 1, 'Science': 0},
    'degree_t': {'Others': 2, 'Comm&Mgmt': 1, 'Sci&Tech': 0},
    'workex': {'Yes': 1, 'No': 0},
    'specialisation': {'Mkt&HR': 1, 'Mkt&Fin': 0},
    'status': {'Placed': 1, 'Not Placed': 0},
    'gender': {'F': 1, 'M': 0},
}
for column_name, codes in category_codes.items():
    df_log[column_name] = df_log[column_name].map(codes)

In [None]:
df_log.info()

### Splitting the Data into Training and Testing Data with an 80:20 Split

In [None]:
df_log

In [None]:
inputs = df_log[['ssc_p', 'hsc_p', 'degree_p','workex', 'etest_p', 'mba_p']]
targets = df_log['status']

In [None]:
x_train,x_test,y_train,y_test = train_test_split(inputs, targets, test_size = 0.2, random_state = 365)

## The Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [None]:
logreg = LogisticRegression()
results_log = logreg.fit(x_train,y_train)

In [None]:
y_pred=logreg.predict(x_test)

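### Confusion Matrix
### scikit-learn puts actual labels on the rows and predicted labels on the columns, so for classes 0 and 1 the layout is:
### [True Negative, False Positive]
### [False Negative, True Positive]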

#### There is also a list of rates that are often computed from a confusion matrix for a binary classifier:
#### Accuracy: Overall, how often is the classifier correct?
#### Accuracy = (TP+TN)/total
#### Misclassification Rate(Error Rate): Overall, how often is it wrong?
#### Misclassification Rate = (FP+FN)/total
#### True Positive Rate(Sensitivity or Recall): When it’s actually yes, how often does it predict yes?
#### True Positive Rate = TP/actual yes
#### False Positive Rate: When it’s actually no, how often does it predict yes?
#### False Positive Rate=FP/actual no
#### True Negative Rate(Specificity): When it’s actually no, how often does it predict no?
#### True Negative Rate=TN/actual no
#### Precision: When it predicts yes, how often is it correct?
#### Precision=TP/predicted yes
#### Prevalence: How often does the yes condition actually occur in our sample?
#### Prevalence=actual yes/total

In [None]:
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
cnf_matrix

In [None]:
# Annotated heatmap of the logistic-regression confusion matrix.
class_names = [0,1]
fig, ax = plt.subplots()
# One tick position per class: a single position cannot carry two labels, and
# current matplotlib rejects mismatched tick/label counts.
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="mako" ,fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion Matrix', y=1.1, size = 24)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')

In [None]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Precision:",metrics.precision_score(y_test, y_pred))
print("Recall:",metrics.recall_score(y_test, y_pred))

### ROC Curve
#### Receiver Operating Characteristic(ROC) curve is a plot of the true positive rate(Recall) against the false positive rate. It shows the tradeoff between sensitivity and specificity.
#### AUC(Area Under Curve) score for the case is 0.96. AUC score 1 represents perfect classifier, and 0.5 represents a worthless classifier.

In [None]:
# ROC curve and AUC of the logistic-regression model on the test split,
# using the second predict_proba column as the score.
y_pred_proba = logreg.predict_proba(x_test)[:, 1]
auc = metrics.roc_auc_score(y_test, y_pred_proba)
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
plt.plot(fpr, tpr, label=f"data 1, auc={auc}")
plt.legend(loc=4)
plt.show()

# Naive Bayes Classification

In [None]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(x_train, y_train);

In [None]:
pred = model.predict(x_test)

In [None]:
# Test-set accuracy of the Naive Bayes model, printed as a percentage.
acc = model.score(x_test, y_test)
# Built-in round() works for both Python floats and NumPy scalars; the
# .round() method exists only on NumPy scalars.
print("Accuracy = " + str(round(acc * 100, 3)) + "%")

# Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
model = RandomForestClassifier(n_estimators = 100, criterion = 'entropy', random_state = 0)
model.fit(x_train,y_train)

In [None]:
y_pred = model.predict(x_test)

## Validating Performance of Random Forest Model

### Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
cm = confusion_matrix(y_test,y_pred)
cm

In [None]:
# Annotated heatmap of the random-forest confusion matrix.
class_names = [0,1]
fig, ax = plt.subplots()
# One tick position per class: a single position cannot carry two labels, and
# current matplotlib rejects mismatched tick/label counts.
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)
sns.heatmap(pd.DataFrame(cm), annot=True, cmap="mako" ,fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion Matrix', y=1.1, size = 24)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')

In [None]:
# Test-set accuracy of the random forest as a percentage. Built-in round() is
# used because the score may be a plain Python float, which has no .round().
print("Accuracy = " + str(round(model.score(x_test, y_test) * 100, 3)) + "%")

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report

In [None]:
# 10-fold cross-validated ROC AUC on the training split. The test split is
# reserved for the final evaluation, and ten folds over only 20% of the rows
# would leave each fold with very few samples.
rfc_cv_score = cross_val_score(model, x_train, y_train, cv=10, scoring='roc_auc')

In [None]:
# Print the classification report and the per-fold AUC scores, each followed
# by a blank separator, then the mean AUC.
report_sections = (
    ("=== Classification Report ===", classification_report(y_test, y_pred)),
    ("=== All AUC Scores ===", rfc_cv_score),
)
for section_title, section_body in report_sections:
    print(section_title)
    print(section_body)
    print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest: ", rfc_cv_score.mean())

In [None]:
try:
    from sklearn.metrics import plot_roc_curve
except ImportError:
    # plot_roc_curve was removed in scikit-learn 1.2. Its replacement takes the
    # same leading (estimator, X, y) arguments, so alias it under the old name.
    from sklearn.metrics import RocCurveDisplay
    plot_roc_curve = RocCurveDisplay.from_estimator

### ROC (Receiver Operating Characteristic) Curve

In [None]:
rfc_ROC_disp = plot_roc_curve(model, x_test, y_test)
plt.show()

# Support Vector Machine Classifier

In [None]:
from sklearn import svm

In [None]:
# Build a linear-kernel support vector classifier, fit it on the training
# split, and label the test split with the fitted model.
clf = svm.SVC(kernel='linear')
y_pred = clf.fit(x_train, y_train).predict(x_test)

In [None]:
# SVM test accuracy as a percentage. Built-in round() is used because the
# score may be a plain Python float, which has no .round() method.
print("Accuracy:", str(round(metrics.accuracy_score(y_test, y_pred) * 100, 3)) + "%")

In [None]:
# Model Precision: of the samples predicted as positive (status 1), what fraction are actually positive?
print("Precision:",metrics.precision_score(y_test, y_pred))

# Model Recall: of the samples that are actually positive (status 1), what fraction did the model predict as positive?
print("Recall:",metrics.recall_score(y_test, y_pred))

# Deep Neural Network(DNN) for Predicting Placement Status

In [None]:
#import numpy as np
import tensorflow as tf
from sklearn import preprocessing

In [None]:
unscaled_inputs_all = df[['ssc_p','hsc_p','degree_p','etest_p','mba_p']]
targets_all = df_log['status']

### Balancing the Dataset

In [None]:
num_one_targets = int(np.sum(targets_all))
zero_targets_counter = 0
indices_to_remove = []

In [None]:
for i in range(targets_all.shape[0]):
    if targets_all[i] == 0:
        zero_targets_counter +=1
        if zero_targets_counter > num_one_targets:
            indices_to_remove.append(i)

unscaled_inputs_equal_priors = np.delete(unscaled_inputs_all, indices_to_remove, axis = 0)

targets_equal_priors = np.delete(targets_all, indices_to_remove, axis = 0)

### Standardizing the Inputs

In [None]:
scaled_inputs = preprocessing.scale(unscaled_inputs_equal_priors)

### Shuffling the Data

In [None]:
shuffled_indices = np.arange(scaled_inputs.shape[0])
np.random.shuffle(shuffled_indices)

In [None]:
shuffled_inputs = scaled_inputs[shuffled_indices]
shuffled_targets = targets_all[shuffled_indices]

### Splitting the Data into Training, Validation and Testing Set

In [None]:
samples_count = shuffled_inputs.shape[0]

In [None]:
# 80% training, 10% validation, and whatever remains for testing. With
# validation also at 80% the test count went negative and the test set was empty.
train_samples_count = int(0.8 * samples_count)
validation_samples_count = int(0.1 * samples_count)
test_samples_count = samples_count - train_samples_count - validation_samples_count

In [None]:
# Cut the shuffled data into three consecutive slices: training rows first,
# validation rows next, and everything after that as the test set.
validation_start = train_samples_count
validation_stop = train_samples_count + validation_samples_count

train_inputs, train_targets = (
    shuffled_inputs[:validation_start],
    shuffled_targets[:validation_start],
)

validation_inputs, validation_targets = (
    shuffled_inputs[validation_start:validation_stop],
    shuffled_targets[validation_start:validation_stop],
)

test_inputs, test_targets = (
    shuffled_inputs[validation_stop:],
    shuffled_targets[validation_stop:],
)

### Saving the DataFrames in .npz format

In [None]:
np.savez('placement_train_data', inputs = train_inputs, targets = train_targets)
np.savez('placement_validation_data', inputs = validation_inputs, targets = validation_targets)
np.savez('placement_test_data', inputs = test_inputs, targets = test_targets)

## Outlining the DNN Model

### Loading the .npz files

In [None]:
# Reload the three splits from the working directory, which is where np.savez
# wrote them. The builtin float / int are used as dtypes because the np.float
# and np.int aliases were removed in NumPy 1.24; they name the same types.
npz = np.load('placement_train_data.npz')
train_inputs, train_targets = npz['inputs'].astype(float), npz['targets'].astype(int)

npz = np.load('placement_validation_data.npz')
validation_inputs, validation_targets = npz['inputs'].astype(float), npz['targets'].astype(int)

npz = np.load('placement_test_data.npz')
test_inputs, test_targets = npz['inputs'].astype(float), npz['targets'].astype(int)


## The Model

In [None]:
input_size = 5
output_size = 2

hidden_layer_size = 55

In [None]:
# Two ReLU hidden layers followed by a softmax output layer.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(hidden_layer_size, activation = 'relu'),
    tf.keras.layers.Dense(hidden_layer_size, activation = 'relu'),
    # One output unit per class (output_size), not one per hidden unit, so the
    # softmax is a distribution over the actual labels.
    tf.keras.layers.Dense(output_size, activation='softmax')
])

# Targets are integer class ids, hence the sparse categorical cross-entropy loss.
model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])

batch_size = 55
max_epochs = 100

#### Early Stopping

In [None]:
early_stopping = tf.keras.callbacks.EarlyStopping(patience = 2)

#### Fitting the Data to the Model

In [None]:
# Train the network, checking the validation split after every epoch and
# handing control to the early-stopping callback.
fit_options = {
    "batch_size": batch_size,
    "epochs": max_epochs,
    "callbacks": [early_stopping],
    "validation_data": (validation_inputs, validation_targets),
    "verbose": 1,
}
history = model.fit(train_inputs, train_targets, **fit_options)

In [None]:
model.save_weights("model.h5")

### Visualising Training

In [None]:
# Training loss (red) against validation loss (blue), one point per epoch.
loss_curves = (
    ('loss', 'red', 'Training Loss'),
    ('val_loss', 'blue', 'Validation Loss'),
)
for history_key, line_color, line_label in loss_curves:
    plt.plot(history.history[history_key], color=line_color, label=line_label)
plt.legend()
plt.show()


In [None]:
# Training accuracy (red) against validation accuracy (blue), one point per epoch.
accuracy_curves = (
    ('accuracy', 'red', 'Training Accuracy'),
    ('val_accuracy', 'blue', 'Validation Accuracy'),
)
for history_key, line_color, line_label in accuracy_curves:
    plt.plot(history.history[history_key], color=line_color, label=line_label)
plt.legend()
plt.show()

# Hence, we see that the DNN model is the best performing model, with 95.35% validation accuracy

## Kindly upvote if you found this notebook useful! Thank you!