# Predicting whether or not a student will be placed (Logistic Regression)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
dataset = pd.read_excel('placement.xlsx')
dataset.head(10)

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,CGPA,degree_t,workex,etest_p,specialisation,Masters,status,salary
0,1,M,67.0,Others,91.0,Others,Commerce,58.0,Sci&Tech,No,55.0,Mkt&HR,58.8,Placed,270000.0
1,2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,3,M,65.0,Central,68.0,Central,Arts,64.0,Comm&Mgmt,No,75.0,Mkt&Fin,57.8,Placed,250000.0
3,5,M,85.8,Central,73.6,Central,Commerce,73.3,Comm&Mgmt,No,96.8,Mkt&Fin,55.5,Placed,425000.0
4,8,M,82.0,Central,64.0,Central,Science,66.0,Sci&Tech,Yes,67.0,Mkt&Fin,62.14,Placed,252000.0
5,9,M,73.0,Central,79.0,Central,Commerce,72.0,Comm&Mgmt,No,91.34,Mkt&Fin,61.29,Placed,231000.0
6,11,M,58.0,Central,61.0,Central,Commerce,60.0,Comm&Mgmt,Yes,62.0,Mkt&HR,60.85,Placed,260000.0
7,12,M,69.6,Central,68.4,Central,Commerce,78.3,Comm&Mgmt,Yes,60.0,Mkt&Fin,63.7,Placed,250000.0
8,14,F,77.0,Central,87.0,Central,Commerce,59.0,Comm&Mgmt,No,68.0,Mkt&Fin,68.63,Placed,218000.0
9,16,F,65.0,Central,75.0,Central,Commerce,69.0,Comm&Mgmt,Yes,72.0,Mkt&Fin,64.66,Placed,200000.0


In [5]:
dataset = dataset.drop('sl_no', axis=1)
dataset = dataset.drop('salary', axis=1)

In [14]:
dataset["gender"] = dataset["gender"].astype('category')
dataset["ssc_b"] = dataset["ssc_b"].astype('category')
dataset["hsc_b"] = dataset["hsc_b"].astype('category')
dataset["degree_t"] = dataset["degree_t"].astype('category')
dataset["workex"] = dataset["workex"].astype('category')
dataset["specialisation"] = dataset["specialisation"].astype('category')
dataset["status"] = dataset["status"].astype('category')
dataset["hsc_s"] = dataset["hsc_s"].astype('category')
dataset.dtypes

gender            category
ssc_p              float64
ssc_b             category
hsc_p              float64
hsc_b             category
hsc_s             category
CGPA               float64
degree_t          category
workex            category
etest_p            float64
specialisation    category
Masters            float64
status            category
dtype: object

In [15]:
dataset["gender"] = dataset["gender"].cat.codes
dataset["ssc_b"] = dataset["ssc_b"].cat.codes
dataset["hsc_b"] = dataset["hsc_b"].cat.codes
dataset["degree_t"] = dataset["degree_t"].cat.codes
dataset["workex"] = dataset["workex"].cat.codes
dataset["specialisation"] = dataset["specialisation"].cat.codes
dataset["status"] = dataset["status"].cat.codes
dataset["hsc_s"] = dataset["hsc_s"].cat.codes

dataset.head(20)

Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,CGPA,degree_t,workex,etest_p,specialisation,Masters,status
0,1,67.0,1,91.0,1,1,58.0,2,0,55.0,1,58.8,1
1,1,79.33,0,78.33,1,2,77.48,2,1,86.5,0,66.28,1
2,1,65.0,0,68.0,0,0,64.0,0,0,75.0,0,57.8,1
3,1,85.8,0,73.6,0,1,73.3,0,0,96.8,0,55.5,1
4,1,82.0,0,64.0,0,2,66.0,2,1,67.0,0,62.14,1
5,1,73.0,0,79.0,0,1,72.0,0,0,91.34,0,61.29,1
6,1,58.0,0,61.0,0,1,60.0,0,1,62.0,1,60.85,1
7,1,69.6,0,68.4,0,1,78.3,0,1,60.0,0,63.7,1
8,0,77.0,0,87.0,0,1,59.0,0,0,68.0,0,68.63,1
9,0,65.0,0,75.0,0,1,69.0,0,1,72.0,0,64.66,1


In [16]:
X = dataset.iloc[:, :-1]
Y = dataset.iloc[:, -1]

Y

0      1
1      1
2      1
3      1
4      1
      ..
210    0
211    0
212    0
213    0
214    0
Name: status, Length: 215, dtype: int8

In [20]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)

dataset.head()

Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,CGPA,degree_t,workex,etest_p,specialisation,Masters,status
0,1,67.0,1,91.0,1,1,58.0,2,0,55.0,1,58.8,1
1,1,79.33,0,78.33,1,2,77.48,2,1,86.5,0,66.28,1
2,1,65.0,0,68.0,0,0,64.0,0,0,75.0,0,57.8,1
3,1,85.8,0,73.6,0,1,73.3,0,0,96.8,0,55.5,1
4,1,82.0,0,64.0,0,2,66.0,2,1,67.0,0,62.14,1


In [22]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=0, solver='lbfgs', max_iter=1000).fit(X_train, Y_train)

clf.score(X_test, Y_test)

0.8837209302325582

In [23]:
Y_pred = clf.predict(X_test)
Y_pred

array([0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0],
      dtype=int8)

In [24]:
from sklearn.metrics import confusion_matrix, accuracy_score

print(confusion_matrix(Y_test, Y_pred))

print(accuracy_score(Y_test, Y_pred))

[[10  1]
 [ 4 28]]
0.8837209302325582
