In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

%matplotlib notebook
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np 
import pandas as pd 
import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)
        

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the placement CSV and discard the `sl_no` column in a single chained
# expression, then preview the first rows.
data = (
    pd.read_csv('/kaggle/input/factors-affecting-campus-placement/Placement_Data_Full_Class.csv')
    .drop(columns='sl_no')
)
data.head()

# Exploratory Data Analysis

In [None]:
# Dimensions of the frame as (rows, columns); `sl_no` has already been dropped.
data.shape

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# Class balance of the target: number of rows for each value of `status`.
data.status.value_counts()

**Effect of Gender on placements**

In [None]:
# Number of rows for each value of `gender`.
data.gender.value_counts()

In [None]:
# Placement status counts split by gender.
# The columns are passed by keyword: recent seaborn releases treat the first
# positional argument as `data`, so positional vectors no longer work.
plt.figure()
sns.countplot(x='gender', hue='status', data=data)
plt.show()

* We see that there are 139 males and 67 females in total and placement of female candidates is comparatively lower than that of male candidates


**Effect of secondary education board on placements**

In [None]:
# Number of rows for each value of `ssc_b`.
data.ssc_b.value_counts()

In [None]:
# Placement status counts for each value of `ssc_b`.
# `x` must be given by keyword: in recent seaborn releases the first positional
# parameter is `data`, so a positional column name collides with `data=data`.
sns.countplot(x='ssc_b', hue='status', data=data)
plt.show()

* We see that more students come from the central board than from all other boards, and that the board of education doesn't have a significant effect on the placement status

**Effect of Secondary Education Percentage on Placements**

In [None]:
# Overlay the `ssc_p` density of placed and not-placed rows on one set of axes.
outcomes = ['Placed', 'Not Placed']
for outcome in outcomes:
    sns.kdeplot(data.loc[data['status'] == outcome, 'ssc_p'])
plt.legend(outcomes)
plt.xlabel('Secondary Education Percentage')
plt.show()

* We see that students with higher secondary education percentage above 65 have better chances of getting placed
* Minimum percentage for a student to get placed is 50%

**Effect of Higher Education Percentage on Placements**

In [None]:
# Overlay the `hsc_p` density of placed and not-placed rows on one set of axes.
outcomes = ['Placed', 'Not Placed']
for outcome in outcomes:
    sns.kdeplot(data.loc[data['status'] == outcome, 'hsc_p'])
plt.legend(outcomes)
plt.xlabel('Higher Education Percentage')
plt.show()

* We see that students with better higher education percentage have better chances of getting placed
* Students having percentage above 70 get more placements while the number of students placed with percentage below 60 drops steeply 

**Effect of higher secondary education board on placements**

In [None]:
# Number of rows for each value of `hsc_b`.
data.hsc_b.value_counts()

In [None]:
# Placement status counts for each value of `hsc_b`.
# `x` must be given by keyword: in recent seaborn releases the first positional
# parameter is `data`, so a positional column name collides with `data=data`.
sns.countplot(x='hsc_b', hue='status', data=data)
plt.show()

* We see that most of the students are from other boards and that the board of higher secondary education doesn't affect the placements

**Effect of Specialization in higher secondary education on placements**

In [None]:
# Number of rows for each value of `hsc_s`.
data.hsc_s.value_counts()

In [None]:
# Placement status counts for each value of `hsc_s`.
# `x` must be given by keyword: in recent seaborn releases the first positional
# parameter is `data`, so a positional column name collides with `data=data`.
sns.countplot(x='hsc_s', hue='status', data=data)
plt.show()

* We see that the most common branch is Commerce and least common branch is Arts
* Approximately 69% of the total commerce students are placed
* Approximately 71% of the total science students are placed 
* Approximately 45% of the total arts students are NOT placed
* We conclude that science and commerce branch students have better chance at getting placed than arts students

**Effect of Degree Percentage on placements**

In [None]:
# Overlay the `degree_p` density of placed and not-placed rows on one set of axes.
outcomes = ['Placed', 'Not Placed']
for outcome in outcomes:
    sns.kdeplot(data.loc[data['status'] == outcome, 'degree_p'])
plt.legend(outcomes)
plt.xlabel('Degree Percentage')
plt.show()

* More placements for students with percentage above 65
* Students should have a minimum of 50% to get placed

**Effect of degree type on placements**

In [None]:
# Number of rows for each value of `degree_t`.
data.degree_t.value_counts()

In [None]:
# Placement status counts for each value of `degree_t`.
# `x` must be given by keyword: in recent seaborn releases the first positional
# parameter is `data`, so a positional column name collides with `data=data`.
sns.countplot(x='degree_t', hue='status', data=data)
plt.show()

* We see that most of the students have opted for Commerce and Management
* Students of Science and Commerce are more likely to be placed as compared to other branches

 **Effect of Work experience on placements**

In [None]:
# Number of rows for each value of `workex`.
data.workex.value_counts()

In [None]:
# Placement status counts for each value of `workex`.
# `x` must be given by keyword: in recent seaborn releases the first positional
# parameter is `data`, so a positional column name collides with `data=data`.
sns.countplot(x='workex', hue='status', data=data)
plt.show()

* Students with prior work experience are more likely to get placed

**Effect of Employability test percentage on placement**

In [None]:
# Overlay the `etest_p` density of placed and not-placed rows on one set of axes.
outcomes = ['Placed', 'Not Placed']
for outcome in outcomes:
    sns.kdeplot(data.loc[data['status'] == outcome, 'etest_p'])
plt.legend(outcomes)
plt.xlabel('Employability Test Percentage')
plt.show()

* Employability test percentage does not affect the placements much
* Students scoring 50-75% have a lower chance of getting placed

**Effect of Post Graduation Specialization**

In [None]:
# Number of rows for each value of `specialisation`.
data.specialisation.value_counts()

In [None]:
# Placement status counts for each value of `specialisation`.
# `x` must be given by keyword: in recent seaborn releases the first positional
# parameter is `data`, so a positional column name collides with `data=data`.
sns.countplot(x='specialisation', hue='status', data=data)
plt.show()

* The placements of Mkt&HR are lower compared to Mkt&Fin

**Effect of MBA Percentage**

In [None]:
# Overlay the `mba_p` density of placed and not-placed rows on one set of axes.
outcomes = ['Placed', 'Not Placed']
for outcome in outcomes:
    sns.kdeplot(data.loc[data['status'] == outcome, 'mba_p'])
plt.legend(outcomes)
plt.xlabel('MBA Percentage')
plt.show()

* A better MBA percentage does not guarantee placement

# Feature Encoding
We have gender, ssc_b, hsc_b, hsc_s, degree_t, workex, specialisation and status as categorical variables, so each of them is mapped to integer codes below.

In [None]:
# Integer codes for every categorical column, keyed by column name.
category_codes = {
    'gender': {'M': 0, 'F': 1},
    'hsc_s': {'Commerce': 0, 'Science': 1, 'Arts': 2},
    'degree_t': {'Comm&Mgmt': 0, 'Sci&Tech': 1, 'Others': 2},
    'workex': {'Yes': 0, 'No': 1},
    'specialisation': {'Mkt&HR': 0, 'Mkt&Fin': 1},
    'status': {'Placed': 1, 'Not Placed': 0},
    'ssc_b': {'Central': 0, 'Others': 1},
    'hsc_b': {'Central': 0, 'Others': 1},
}

# Replace each column's labels with its codes, overwriting the column in `data`.
# NOTE: `Series.map` turns any value missing from the mapping into NaN, so
# running this cell a second time without reloading `data` blanks these columns.
for column, codes in category_codes.items():
    data[column] = data[column].map(codes)

In [None]:
# Preview the first rows again to check the encoded columns.
data.head()

Correlation between the features

In [None]:
# Pairwise correlation between the columns of `data`, drawn as a heatmap with
# the coefficient written in every cell.
plt.figure(figsize=(14, 8))
cor = data.corr()
sns.heatmap(cor, annot=True)

In [None]:
from sklearn.metrics import accuracy_score,roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# The twelve predictor columns; `status` is the target.
features = [
    'gender', 'ssc_b', 'ssc_p', 'hsc_p', 'hsc_b', 'hsc_s',
    'degree_p', 'degree_t', 'workex', 'etest_p', 'specialisation', 'mba_p',
]
X = data[features]
y = data['status']

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_sc = scaler.transform(X_train)
X_test_sc = scaler.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings, scored by accuracy on the test rows.
clf = LogisticRegression().fit(X_train_sc, y_train)
predictions = clf.predict(X_test_sc)
score = accuracy_score(y_true=y_test, y_pred=predictions)
score

In [None]:
from sklearn.svm import SVC

# Support vector classifier with default settings, scored by accuracy on the test rows.
clf = SVC().fit(X_train_sc, y_train)
predictions = clf.predict(X_test_sc)
score = accuracy_score(y_true=y_test, y_pred=predictions)
score

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with a fixed seed, scored by accuracy on the test rows.
tree = DecisionTreeClassifier(random_state=0).fit(X_train_sc, y_train)
predictions = tree.predict(X_test_sc)
score = accuracy_score(y_true=y_test, y_pred=predictions)
score

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees with a fixed seed, scored by accuracy on the test rows.
rfclf = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train_sc, y_train)
predictions = rfclf.predict(X_test_sc)
score = accuracy_score(y_true=y_test, y_pred=predictions)
score

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with default settings, scored by accuracy on the test rows.
model = KNeighborsClassifier().fit(X_train_sc, y_train)
predictions = model.predict(X_test_sc)
score = accuracy_score(y_true=y_test, y_pred=predictions)
score

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings, scored by accuracy on the test rows.
model = GaussianNB().fit(X_train_sc, y_train)
predictions = model.predict(X_test_sc)
score = accuracy_score(y_true=y_test, y_pred=predictions)
score

We get a maximum accuracy of 86% using Logistic Regression