In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

 # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Import libraries

import numpy as np # linear algebra
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder

### 1.Import Dataset

In [None]:
# Load the campus-placement CSV from the Kaggle input directory and preview
# the first five rows.
recruitment = pd.read_csv('/kaggle/input/factors-affecting-campus-placement/Placement_Data_Full_Class.csv')
recruitment.head(5)

### 2.Data Cleaning and Processing

In [None]:
# (number of rows, number of columns) of the raw dataset.
recruitment.shape

In [None]:
# Column names, non-null counts and dtypes at a glance.
recruitment.info()

In [None]:
# Number of missing values per column.
recruitment.isnull().sum()

### Inference:

1. There are 67 null values in our data, which correspond to the 67 unplaced students.
2. We can't drop these rows, as they provide valuable information on why candidates failed to get hired.
3. We can't impute them with mean/median values either: that would go against the context of this dataset, because it would suggest that unhired candidates received a salary.
4. The best way to deal with these null values is to impute them with 0.

In [None]:
# Replace the missing salaries with 0 (a student who was not placed earns nothing).
# The fill is restricted to `salary` so that an unexpected gap in any other
# column is not silently turned into a 0.
# NOTE(review): assumes `salary` is the only column with nulls, as the inference
# above states - confirm against the isnull() output.
recruitment['salary'] = recruitment['salary'].fillna(0)
recruitment.head(5)

In [None]:
## Datatypes of columns (checked again after the null fill above)
recruitment.dtypes

In [None]:
## "sl_no" is only a serial number, so it cannot help the model learn; remove it.
recruitment = recruitment.drop(columns=['sl_no'])

The "ssc_b" and "hsc_b" features give the board under which the candidate completed his/her 10th and 12th grade. These features will not carry weight in model learning, hence we drop them.

In [None]:
# Remove the two examination-board columns.
recruitment = recruitment.drop(columns=['hsc_b', 'ssc_b'])

In [None]:
# Preview the frame after the column drops.
recruitment.head()

### 3. Outliers

Using boxplot to check for outliers.

In [None]:
# One box plot per percentage column, used to eyeball outliers.
# Column name -> panel title; add an entry here to plot another column.
percentage_titles = {
    'ssc_p': 'Secondary School Percentage',
    'hsc_p': 'Higher Secondary School Percentage',
    'degree_p': 'Degree Percentage',
    'mba_p': 'MBA Percentage',
    'etest_p': 'Employability Percentage',
}

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
panels = axes.ravel()
for ax, (column, title) in zip(panels, percentage_titles.items()):
    ax.boxplot(recruitment[column])
    ax.set_title(title)

# The grid has one more panel than there are columns; hide the unused one.
for ax in panels[len(percentage_titles):]:
    ax.set_visible(False)

plt.show()

"Higher Secondary Percentage" (hsc_p) has the most outliers. The other features do not have outliers.

Removing these outliers from the data.

In [None]:
# Remove hsc_p outliers with the 1.5 * IQR rule: keep only rows whose hsc_p lies
# within [Q1 - 1.5*IQR, Q3 + 1.5*IQR] (both ends inclusive).
Q1 = recruitment['hsc_p'].quantile(0.25)
Q3 = recruitment['hsc_p'].quantile(0.75)
IQR = Q3 - Q1

lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# .copy() makes the result an independent frame. Later cells assign new column
# values into it, which on a bare selection is chained assignment
# (SettingWithCopyWarning, and undefined whether the write sticks).
recruitment_processed = recruitment.loc[
    recruitment['hsc_p'].between(lower_fence, upper_fence)
].copy()

In [None]:
# Re-draw the hsc_p box plot on the filtered frame to check the effect of the
# outlier removal.
plt.figure(figsize=(8,5))


plt.boxplot(recruitment_processed['hsc_p'])
plt.title('Higher Secondary School Percentage')


We have removed the outliers from the data; there are now no outliers in hsc_p.

### 4. Visualizations

#### 1. Categorical Columns: 

In [None]:
# Names of all object-dtype columns; these are treated as the categorical
# features in the rest of the notebook.
categorical_columns = recruitment_processed.select_dtypes("object").columns
categorical_columns

In [None]:
# Count plot for every categorical feature, laid out on a 2x3 grid.
# Order: gender, higher-secondary specialisation, degree type,
# MBA specialisation, work experience, placement status.
count_columns = ['gender', 'hsc_s', 'degree_t', 'specialisation', 'workex', 'status']

fig, axes = plt.subplots(2, 3, figsize=(15, 7))
for ax, column in zip(axes.ravel(), count_columns):
    sns.countplot(x=column, data=recruitment_processed, ax=ax)
    # tick_params only resizes the existing labels; it avoids re-setting the
    # label texts by hand via set_xticklabels(get_xticklabels()).
    ax.tick_params(axis='x', labelsize=12)

fig.tight_layout()
plt.show()

### Inference:
1. There are about twice as many males as females, which may suggest that more males are hired than females.
2. More candidates are from "commerce" background.
3. A large number of candidates have no prior work experience.
4. Count of placed candidates is more than Unplaced.

In [None]:
# Pairwise scatter plots of the five percentage columns, coloured by placement
# status.
sns.pairplot(recruitment_processed,vars=['ssc_p','hsc_p','degree_p','mba_p','etest_p'],hue="status")

### Inference:
1. Candidates who scored well in SSC and HSC were placed more often.
2. MBA percentage was not a good contributor to the hiring process, as the number of unplaced students is higher than the number of placed students among those who completed an MBA.

### 5. Preprocessing data for Model Building

In [None]:
## Check categorical columns (the object-dtype column names collected earlier)
categorical_columns

In [None]:
# Preview the raw values of the categorical columns before encoding.
recruitment_processed[categorical_columns].head()

If we apply label encoding to the "hsc_s", "degree_t" and "specialisation" columns, the result will not make sense: the categories would be replaced by arbitrary numbers that cannot be told apart unless we know the exact code for each value. Hence we exclude these columns from label encoding.

In [None]:
# Columns that will be label-encoded in the next cell.
column_to_be_encoded = ['gender','workex','status']

In [None]:
# Replace each listed categorical column with integer codes. The encoder is
# re-fitted on every column, so the codes are independent per column.
label_encoder = LabelEncoder()
for column_name in column_to_be_encoded:
    encoded_values = label_encoder.fit_transform(recruitment_processed[column_name])
    recruitment_processed[column_name] = encoded_values
recruitment_processed.head()

Now we will create dummies for remaining columns with their name as prefix which will help in identifying the field.

In [None]:
## Creating dummies
# One-hot encode the remaining categorical columns; get_dummies prefixes each
# new column with the source column name (e.g. "degree_t_...").
dummy_source_columns = ['hsc_s', 'degree_t', 'specialisation']

# dtype=int is explicit on purpose: recent pandas returns boolean dummies by
# default, and a design matrix that mixes bool with float columns is converted
# to object dtype, which statsmodels refuses to fit.
dummies = pd.get_dummies(recruitment_processed[dummy_source_columns], dtype=int)

recruitment_final = pd.concat(
    [recruitment_processed.drop(columns=dummy_source_columns), dummies],
    axis=1,
)
recruitment_final.head()

### 6. Train-Test Split

In [None]:
# Target is the placement status. `salary` is removed from the features together
# with the target.
target_column = 'status'
X = recruitment_final.drop(columns=[target_column, 'salary'])
y = recruitment_final[target_column]

In [None]:
from sklearn.model_selection import train_test_split

# 70/30 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=100
)

# Shapes in the order: X_train, y_train, X_test, y_test.
for split_part in (X_train, y_train, X_test, y_test):
    print(split_part.shape)

### 7. Model Building

#### 7A. Logistic Regression

In [None]:
# Standardise the continuous percentage columns of the training set.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Columns to scale; the encoded and dummy columns are left as they are.
varlist = ['ssc_p', 'hsc_p', 'degree_p', 'etest_p', 'mba_p']

# Assign into an explicit copy so the write does not depend on whether the
# split returned a view or a copy of X.
X_train = X_train.copy()

# The scaler is fitted on the training rows only.
X_train[varlist] = scaler.fit_transform(X_train[varlist])

In [None]:

import statsmodels.api as sm

# First logistic regression: a binomial GLM on every training feature plus an
# intercept column, fitted only to inspect the summary table.
logm1 = sm.GLM(
    endog=y_train,
    exog=sm.add_constant(X_train),
    family=sm.families.Binomial(),
)
logm1.fit().summary()

### Using RFE for feature selection

In [None]:
# scikit-learn logistic regression with default settings; used below as the
# estimator inside RFE.
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

In [None]:
from sklearn.feature_selection import RFE

# Recursive feature elimination down to 10 features. n_features_to_select is
# passed by keyword because current scikit-learn rejects it as a positional
# argument.
rfe = RFE(estimator=logreg, n_features_to_select=10)
rfe = rfe.fit(X_train, y_train)

In [None]:
# Names of the training columns that RFE kept (rfe.support_ is the selection mask).
col = X_train.columns[rfe.support_]
col

In [None]:
# Second model: binomial GLM restricted to the RFE-selected columns plus an
# intercept.
X_train_sm = sm.add_constant(X_train[col])
logm2 = sm.GLM(
    endog=y_train,
    exog=X_train_sm,
    family=sm.families.Binomial(),
)
res = logm2.fit()
res.summary()

In [None]:
# Variance inflation factor of every currently selected feature.
from statsmodels.stats.outliers_influence import variance_inflation_factor

selected_features = X_train[col]
feature_matrix = selected_features.values

# One row per feature: its name and its VIF, rounded to 2 decimals and sorted
# from the most to the least collinear.
vif = pd.DataFrame({
    'Features': selected_features.columns,
    'VIF': [
        variance_inflation_factor(feature_matrix, position)
        for position in range(feature_matrix.shape[1])
    ],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

In [None]:
## Drop the column whose VIF is more than 5 from the selected feature names
## (the data frames themselves are not modified here).
col =col.drop(['degree_t_Comm&Mgmt'])
col

In [None]:
# Third model: refit the binomial GLM on the reduced feature list plus an
# intercept.
X_train_sm = sm.add_constant(X_train[col])
logm3 = sm.GLM(
    endog=y_train,
    exog=X_train_sm,
    family=sm.families.Binomial(),
)
res2 = logm3.fit()
res2.summary()

In [None]:
# Re-check the variance inflation factors for the reduced feature list.
from statsmodels.stats.outliers_influence import variance_inflation_factor

remaining_features = X_train[col]
remaining_matrix = remaining_features.values
vif_values = [
    variance_inflation_factor(remaining_matrix, position)
    for position in range(remaining_matrix.shape[1])
]

# Feature name next to its VIF (2 decimals), highest first.
vif = pd.DataFrame({'Features': remaining_features.columns, 'VIF': vif_values})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

The p-values for all the features are less than 0.5 and the VIFs are less than 3, which is the ideal case. Hence we will use this model for predictions.

In [None]:

# Predictions of the final fitted model (res2) on the training design matrix;
# show the first ten values.
y_train_pred = res2.predict(X_train_sm)
y_train_pred[:10]

In [None]:

# Convert the predictions to a flat 1-D NumPy array (the index is discarded).
# Not idempotent: re-running this cell fails because an ndarray has no `.values`.
y_train_pred = y_train_pred.values.reshape(-1)
y_train_pred[:10]

Creating a dataframe with the actual placement status and the predicted probabilities

In [None]:
# Actual status, predicted probability and the original row label side by side.
y_train_pred_final = pd.DataFrame({
    'status': y_train.values,
    'status_Prob': y_train_pred,
    'ID': y_train.index,
})
y_train_pred_final.head()

In [None]:
# Classify as 1 when the predicted probability is strictly above 0.5, else 0.
above_half = y_train_pred_final['status_Prob'] > 0.5
y_train_pred_final['Status_predicted'] = above_half.astype(int)

# First 20 rows for a quick sanity check.
y_train_pred_final.head(20)

In [None]:

from sklearn import metrics
# Confusion matrix of actual status (rows) vs. the 0.5-cutoff prediction (columns).
confusion = metrics.confusion_matrix(y_train_pred_final.status, y_train_pred_final.Status_predicted )
print(confusion)

In [None]:
# Overall accuracy on the training set for the current Status_predicted column.
print(metrics.accuracy_score(y_train_pred_final.status, y_train_pred_final.Status_predicted))

In [None]:
# Unpack the 2x2 confusion matrix row by row:
#   row 0 (actual 0): true negatives, false positives
#   row 1 (actual 1): false negatives, true positives
(TN, FP), (FN, TP) = confusion

In [None]:
# Sensitivity (true positive rate) = TP / (TP + FN)
TP / (TP + FN)

In [None]:
# Specificity (true negative rate) = TN / (TN + FP)
TN / (TN + FP)

### Plotting the ROC Curve
An ROC curve demonstrates several things:
1. It shows the tradeoff between sensitivity and specificity (any increase in sensitivity will be accompanied by a decrease in specificity).
2. The closer the curve follows the left-hand border and then the top border of the ROC space, the more accurate the test.
3. The closer the curve comes to the 45-degree diagonal of the ROC space, the less accurate the test.

In [None]:
def draw_roc(actual, probs):
    """Plot the ROC curve for a set of predicted probabilities.

    Parameters
    ----------
    actual : true binary labels, passed straight to ``metrics.roc_curve``.
    probs : predicted scores/probabilities for the positive class.

    Draws a 5x5 inch figure with the ROC curve (AUC shown in the legend) and
    the dashed chance diagonal, shows it, and returns ``None``.
    """
    false_positive_rate, true_positive_rate, _ = metrics.roc_curve(
        actual, probs, drop_intermediate=False
    )
    area_under_curve = metrics.roc_auc_score(actual, probs)

    plt.figure(figsize=(5, 5))
    curve_label = 'ROC curve (area = %0.2f)' % area_under_curve
    plt.plot(false_positive_rate, true_positive_rate, label=curve_label)
    # Dashed diagonal = performance of a random classifier.
    plt.plot([0, 1], [0, 1], 'k--')

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

In [None]:
# ROC points (false positive rate, true positive rate) and their thresholds for
# the training predictions; drop_intermediate=False keeps every threshold.
fpr, tpr, thresholds = metrics.roc_curve( y_train_pred_final.status, y_train_pred_final.status_Prob, drop_intermediate = False )

In [None]:
# Plot the ROC curve of the training predictions.
draw_roc(y_train_pred_final.status, y_train_pred_final.status_Prob)

### Finding Optimal Cutoff Point

In [None]:

# Add one 0/1 prediction column per probability cutoff 0.0, 0.1, ..., 0.9.
# Each column is named by its (float) cutoff value.
numbers = [step / 10 for step in range(10)]
for cutoff in numbers:
    exceeds_cutoff = y_train_pred_final.status_Prob > cutoff
    y_train_pred_final[cutoff] = exceeds_cutoff.astype(int)
y_train_pred_final.head()

In [None]:
 #Now let's calculate accuracy sensitivity and specificity for various probability cutoffs.
# Accuracy, sensitivity and specificity of the training predictions at every
# probability cutoff, one row per cutoff (the row label is the cutoff itself).
from sklearn.metrics import confusion_matrix

num = [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]

cutoff_records = []
for cutoff in num:
    # labels=[0, 1] pins the matrix to 2x2 even if one class is absent at a
    # cutoff, so the unpacking below is always valid.
    (tn, fp), (fn, tp) = metrics.confusion_matrix(
        y_train_pred_final.status, y_train_pred_final[cutoff], labels=[0, 1]
    )
    cutoff_records.append({
        'prob': cutoff,
        'accuracy': (tn + tp) / (tn + fp + fn + tp),
        'sensi': tp / (fn + tp),
        'speci': tn / (tn + fp),
    })

# Build the frame once from the collected rows instead of growing it row by row.
cutoff_df = pd.DataFrame(
    cutoff_records, columns=['prob','accuracy','sensi','speci'], index=num
)
print(cutoff_df)

In [None]:
# Plot accuracy, sensitivity and specificity against the probability cutoff to
# choose a cutoff visually.
cutoff_df.plot.line(x='prob', y=['accuracy','sensi','speci'])
plt.show()

#### From the curve above, 0.7 is the optimum point to take it as a cutoff probability

In [None]:
# Re-classify with the chosen cutoff: 1 when the probability is strictly above
# 0.7, else 0. This overwrites the earlier 0.5-cutoff prediction.
above_cutoff = y_train_pred_final['status_Prob'] > 0.7
y_train_pred_final['Status_predicted'] = above_cutoff.astype(int)

# Preview the updated predictions.
y_train_pred_final.head()

In [None]:
# Overall training accuracy for the current Status_predicted column.
metrics.accuracy_score(y_train_pred_final.status, y_train_pred_final.Status_predicted)

In [None]:
# Sensitivity of the model at the new cutoff.
# TP/TN/FP/FN still hold the counts from the confusion matrix built for the
# earlier 0.5 cutoff, so refresh them from the current Status_predicted column
# first. Otherwise this cell reports the old sensitivity.
confusion = metrics.confusion_matrix(y_train_pred_final.status, y_train_pred_final.Status_predicted)
TP = confusion[1,1] # true positives
TN = confusion[0,0] # true negatives
FP = confusion[0,1] # false positives
FN = confusion[1,0] # false negatives

TP / float(TP+FN)

In [None]:
from sklearn import metrics
# Confusion matrix of actual status (rows) vs. the current Status_predicted
# column (columns).
confusion = metrics.confusion_matrix(y_train_pred_final.status, y_train_pred_final.Status_predicted )
print(confusion)

From the ROC curve we can infer that our logistic model classifies the placed students correctly rather than producing false positives.
The closer the ROC curve (blue) lies to the top-left corner, the better our model is.
To improve positive prediction, we can choose 0.8 or 0.9 as the threshold value, which can give us more reliable positive predictions.