# Campus Recruitment (Placements)

## Importing Essential Libraries and Reading the Data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the placement records and discard the `sl_no` column in one step.
csv_path = '../input/factors-affecting-campus-placement/Placement_Data_Full_Class.csv'
data = pd.read_csv(csv_path).drop(columns='sl_no')

## Data Exploration

In [None]:
# Preview the first rows to see which columns are available.
data.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
data.shape

In [None]:
# Column dtypes and non-null counts.
data.info()

In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# Boolean mask marking the rows whose salary is missing.
mask = data['salary'].isna()

In [None]:
# Show up to 20 of the rows that have no salary recorded.
data.loc[mask].head(20)

#### Notes:
1. Only Salary column has null values(NaN) [Need to consider filling it later if required]
2. All the NaN comes from the students who are unemployed

## Feature wise Visualization
## Male vs Female
#### What questions can be asked:
1. Visualizing status of placement based on gender
2. How does salary differ based on gender?

In [None]:
# Placement outcome counts split by gender.
# `x` is passed by keyword: positional data-column arguments were deprecated in
# seaborn 0.11 and are rejected by later releases.
sns.countplot(x="gender", hue="status", data=data)
plt.xlabel('Gender')
plt.ylabel('Count')
fig = plt.gcf()
fig.set_size_inches(10, 7)

In [None]:
# Contingency table: rows are genders, columns are placement outcomes.
pd.crosstab(data['gender'], data['status'])

In [None]:
# Row-wise percentages: share of each placement outcome within a gender.
gender_counts = pd.crosstab(data['gender'], data['status'])
gender_counts.mul(100).div(gender_counts.sum(axis=1), axis=0)

In [None]:
# Swarm of individual salaries for each gender.
ax = sns.swarmplot(data=data, x='gender', y='salary')
ax.set_xlabel('Gender')
ax.set_ylabel('Salary')
fig = ax.get_figure()
fig.set_size_inches(8, 7)

In [None]:
# Salary distribution for male vs. female students.
# A KDE curve shows a probability density, not a count, so the y-axis is
# labelled accordingly.
for gender_code in ('M', 'F'):
    sns.kdeplot(data.loc[data['gender'] == gender_code, 'salary'])
plt.xlabel('Salary (100K)')
plt.ylabel('Density')
plt.legend(['Male', 'Female'])
fig = plt.gcf()
fig.set_size_inches(10, 7)

#### Notes:
1. Clearly there are more males than females 
2. The number of males placed is higher than the number of females placed
3. Males receive higher salaries than females

## Secondary School Marks and Board (ssc_p and ssc_b)
#### What questions can be asked:
1. How do ssc marks differ between the genders?
2. How do ssc marks affect the status of placement?
3. How do ssc marks affect the salary?
4. How does ssc board affect status of placement?
5. How does ssc board affect salary?

In [None]:
# Secondary school percentage distribution for male vs. female students.
# The y-axis of a KDE is a density rather than a count.
for gender_code in ('M', 'F'):
    sns.kdeplot(data.loc[data['gender'] == gender_code, 'ssc_p'])
plt.xlabel('Secondary School Percentage')
plt.ylabel('Density')
plt.legend(['Male', 'Female'])
fig = plt.gcf()
fig.set_size_inches(10, 7)

In [None]:
# Secondary school percentage distribution for placed vs. unplaced students.
# The y-axis of a KDE is a density rather than a count.
placement_labels = ['Placed', 'Not Placed']
for placement in placement_labels:
    sns.kdeplot(data.loc[data['status'] == placement, 'ssc_p'])
plt.xlabel('Secondary School Percentage')
plt.ylabel('Density')
plt.legend(placement_labels)
fig = plt.gcf()
fig.set_size_inches(10, 7)

In [None]:
# Salary distribution per secondary school board.
# The y-axis of a KDE is a density rather than a count.
board_labels = ['Central', 'Others']
for board in board_labels:
    sns.kdeplot(data.loc[data['ssc_b'] == board, 'salary'])
plt.xlabel('Salary (100K)')
plt.ylabel('Density')
plt.legend(board_labels)
fig = plt.gcf()
fig.set_size_inches(10, 7)

In [None]:
# Placement outcome counts per secondary school board.
# `x` is passed by keyword: positional data-column arguments were deprecated in
# seaborn 0.11 and are rejected by later releases.
sns.countplot(x='ssc_b', hue='status', data=data)
plt.xlabel('Secondary School Board')
plt.ylabel('Count')
fig = plt.gcf()
fig.set_size_inches(10, 7)

In [None]:
# Contingency table: rows are secondary boards, columns are placement outcomes.
pd.crosstab(data['ssc_b'], data['status'])

In [None]:
# Salary against secondary school percentage, one line per gender.
grid = sns.relplot(data=data, x='ssc_p', y='salary', hue='gender', kind='line')
grid.set_axis_labels('Secondary School Percentage', 'Salary')
fig = plt.gcf()
fig.set_size_inches(12, 6)

#### Notes:
1. Males have a generally lower secondary school percentage as compared to females
2. Students who have been placed have a higher secondary percentage
3. Secondary Board does not affect placement
4. There is no noticeable correlation between secondary school marks and salary earned
5. Boards do not have a significant impact on the salary

## Higher Secondary School Marks, Board and Stream (hsc_p, hsc_b ,hsc_s)
#### Questions to ask:
1. How does hsc marks differ between the genders?
2. How does hsc marks affect placement status?
3. How does hsc marks affect salary?
4. How does hsc board affect placement status?
5. How does hsc board affect salary?
6. How does hsc stream affect hsc marks?
7. How does hsc stream affect placement status?
8. How does hsc stream affect salary?
9. How does hsc stream differ between genders? (May not affect our results but interesting to know)

In [None]:
# Higher secondary percentage distribution for male vs. female students.
# The y-axis of a KDE is a density rather than a count.
for gender_code in ('M', 'F'):
    sns.kdeplot(data.loc[data['gender'] == gender_code, 'hsc_p'])
plt.xlabel("Higher Secondary Percentage")
plt.ylabel("Density")
plt.legend(["Male", "Female"])
fig = plt.gcf()
fig.set_size_inches(10, 7)

In [None]:
# Higher secondary percentage distribution for placed vs. unplaced students.
# The y-axis of a KDE is a density rather than a count.
placement_labels = ["Placed", "Not Placed"]
for placement in placement_labels:
    sns.kdeplot(data.loc[data['status'] == placement, 'hsc_p'])
plt.xlabel('Higher Secondary Percentage')
plt.ylabel('Density')
plt.legend(placement_labels)
fig = plt.gcf()
fig.set_size_inches(10, 7)

In [None]:
# Salary against higher secondary percentage, one line per stream.
grid = sns.relplot(data=data, x='hsc_p', y='salary', hue='hsc_s', kind='line')
grid.set_axis_labels('Higher Secondary Percentage', 'Salary')
fig = plt.gcf()
fig.set_size_inches(12, 6)

In [None]:
# Placement outcome counts per higher secondary board.
# `x` is passed by keyword: positional data-column arguments were deprecated in
# seaborn 0.11 and are rejected by later releases.
sns.countplot(x='hsc_b', hue='status', data=data)
plt.xlabel('Higher Secondary Board')
plt.ylabel('Count')
fig = plt.gcf()
fig.set_size_inches(10, 7)

In [None]:
# Salary distribution per higher secondary board.
# The y-axis of a KDE is a density rather than a count.
board_labels = ['Central', 'Others']
for board in board_labels:
    sns.kdeplot(data.loc[data['hsc_b'] == board, 'salary'])
plt.xlabel('Salary (100K)')
plt.ylabel('Density')
plt.legend(board_labels)
fig = plt.gcf()
fig.set_size_inches(10, 7)

In [None]:
# Number of male and female students in each higher secondary stream.
# `x` is passed by keyword: positional data-column arguments were deprecated in
# seaborn 0.11 and are rejected by later releases. The axis label typo
# ('Secindary') is corrected as well.
sns.countplot(x='hsc_s', hue='gender', data=data)
plt.xlabel('Higher Secondary Stream')
plt.ylabel('Count')
fig = plt.gcf()
fig.set_size_inches(10, 7)

The ratio of male to female students is 2:1 in all streams except Arts

In [None]:
# Contingency table: rows are higher secondary streams, columns are genders.
pd.crosstab(data['hsc_s'], data['gender'])

In [None]:
# Higher secondary percentage distribution, one curve per stream.
# The y-axis of a KDE is a density rather than a count; the x-label typo
# ('Seconary') is corrected as well.
stream = list(data['hsc_s'].unique())
for s in stream:
    sns.kdeplot(data.loc[data['hsc_s'] == s, 'hsc_p'])
plt.xlabel('Higher Secondary Percentage')
plt.ylabel('Density')
plt.legend(stream)  # curves were drawn in the same order as `stream`
fig = plt.gcf()
fig.set_size_inches(10, 7)

We see that Commerce students score slightly higher marks on average as compared to the other streams

In [None]:
# Placement outcome counts per higher secondary stream.
# `x` is passed by keyword: positional data-column arguments were deprecated in
# seaborn 0.11 and are rejected by later releases. The axis label typo
# ('Seconary') is corrected as well.
sns.countplot(x='hsc_s', hue='status', data=data)
plt.xlabel('Higher Secondary Stream')
plt.ylabel('Count')
fig = plt.gcf()
fig.set_size_inches(10, 7)

In [None]:
# Salary distribution, one curve per higher secondary stream.
# The y-axis of a KDE is a density rather than a count.
stream = list(data['hsc_s'].unique())
for s in stream:
    sns.kdeplot(data.loc[data['hsc_s'] == s, 'salary'])
plt.xlabel('Salary (100K)')
plt.ylabel('Density')
plt.legend(stream)  # curves were drawn in the same order as `stream`
fig = plt.gcf()
fig.set_size_inches(10, 7)

#### Notes:
1. No significant difference between the male students and female students wrt marks distribution
2. Those students who have been placed have a slightly higher HSC score than the unplaced students
3. There is no significant relation between salary and HSC marks
4. Arts students have received lower salaries as compared to commerce or science students.
5. HSC board does not affect placement status (about 66% placement wrt each board)
6. HSC board does not affect salary 
7. HSC Stream does not seem to affect placement status

## Degree Marks and Stream (degree_p, degree_t)
#### Questions to ask:
1. How does degree marks affect placement status?
2. How does degree marks affect salary?
3. How does stream affect placement status?
4. How does stream affect salary?

In [None]:
# Degree percentage distribution for placed vs. unplaced students.
# The y-axis of a KDE is a density, so the former 'COunt' label is replaced.
placement_labels = ['Placed', 'Not Placed']
for placement in placement_labels:
    sns.kdeplot(data.loc[data['status'] == placement, 'degree_p'])
plt.legend(placement_labels)
plt.xlabel('Degree Percentage')
plt.ylabel('Density')
fig = plt.gcf()
fig.set_size_inches(10, 7)

In [None]:
# Salary against degree percentage, one line per degree type.
grid = sns.relplot(data=data, x='degree_p', y='salary', hue='degree_t', kind='line')
grid.set_axis_labels('Degree Percentage', 'Salary (100K)')
fig = plt.gcf()
fig.set_size_inches(12, 6)

In [None]:
# Placement outcome counts per degree type.
# `x` is passed by keyword: positional data-column arguments were deprecated in
# seaborn 0.11 and are rejected by later releases.
sns.countplot(x='degree_t', hue='status', data=data)
plt.xlabel('Degree Type')
plt.ylabel('Count')
fig = plt.gcf()
fig.set_size_inches(10, 7)

In [None]:
# Row-wise fractions (0-1): share of each placement outcome within a degree type.
degree_counts = pd.crosstab(data['degree_t'], data['status'])
degree_counts.div(degree_counts.sum(axis=1), axis=0)

In [None]:
# Salary distribution, one curve per degree type.
# The y-axis of a KDE is a density rather than a count.
degree_types = data['degree_t'].unique()
for i in degree_types:
    sns.kdeplot(data.loc[data['degree_t'] == i, 'salary'])
plt.legend(degree_types)  # curves were drawn in the same order as `degree_types`
plt.xlabel('Salary (100K)')
plt.ylabel('Density')
fig = plt.gcf()
fig.set_size_inches(10, 7)

#### Notes:
1. Again as expected students who have been placed have a higher degree score
2. There is no significant correlation between the marks and salaries offered.
3. Although some of the highest salaries were offered to Comm&Mgmt students
4. As we can see, approximately 70% of students from Sci&Tech and Comm&Mgmt have been placed, while there are too few students in the Others category to judge properly.

## Work Experience (workex)
#### Questions to ask:
1. How does work experience affect placement status?
2. How does work experience affect salaries offered?

In [None]:
# Placement outcome counts for students with and without work experience.
# `x` is passed by keyword: positional data-column arguments were deprecated in
# seaborn 0.11 and are rejected by later releases.
sns.countplot(x='workex', hue='status', data=data)
plt.xlabel('Work Experience')
plt.ylabel('Count')
fig = plt.gcf()
fig.set_size_inches(10, 7)

In [None]:
# Salary distribution for students with vs. without work experience.
# The y-axis of a KDE is a density rather than a count.
workex_labels = ['Yes', 'No']
for has_experience in workex_labels:
    sns.kdeplot(data.loc[data['workex'] == has_experience, 'salary'])
plt.legend(workex_labels)
plt.xlabel('Salary (100K)')
plt.ylabel('Density')
fig = plt.gcf()
fig.set_size_inches(10, 7)

#### Notes:
1. We see that most students with work experience are placed
2. Students with work experience are getting the top salaries.

## Employability Test (etest_p)
#### Questions to ask:
1. Finding the variation of employability test wrt the secondary, higher secondary and degree marks
2. Does the test affect placement status?
3. Does the test affect salary?

In [None]:
# Employability test score against secondary school percentage, with a
# regression fit per secondary board. The x-label typo ('Seconary') is corrected.
grid = sns.lmplot(data=data, x='ssc_p', y='etest_p', hue='ssc_b')
grid.set_axis_labels('Secondary School Percentage', 'Employability Test')
fig = plt.gcf()
fig.set_size_inches(10, 7)

In [None]:
# Employability test score against higher secondary percentage, with a
# regression fit per stream. The x-label typo ('Seconary') is corrected.
grid = sns.lmplot(data=data, x='hsc_p', y='etest_p', hue='hsc_s')
grid.set_axis_labels('Higher Secondary Percentage', 'Employability Test')
fig = plt.gcf()
fig.set_size_inches(10, 7)

In [None]:
# Employability test score against degree percentage, with a regression fit
# per degree type.
grid = sns.lmplot(data=data, x='degree_p', y='etest_p', hue='degree_t')
grid.set_axis_labels('Degree Percentage', 'Employability Test')
fig = plt.gcf()
fig.set_size_inches(10, 7)

In [None]:
# Employability test score against MBA percentage, with a regression fit
# per specialisation.
grid = sns.lmplot(data=data, x='mba_p', y='etest_p', hue='specialisation')
grid.set_axis_labels('MBA Percentage', 'Employability Test')
fig = plt.gcf()
fig.set_size_inches(10, 7)

In [None]:
# Employability test score distribution for placed vs. unplaced students.
# The y-axis of a KDE is a density rather than a count; the x-label typo
# ('Employabilty') is corrected as well.
placement_labels = ['Placed', 'Not Placed']
for placement in placement_labels:
    sns.kdeplot(data.loc[data['status'] == placement, 'etest_p'])
plt.legend(placement_labels)
plt.xlabel('Employability Test')
plt.ylabel('Density')
fig = plt.gcf()
fig.set_size_inches(10, 7)

In [None]:
# Salary against employability test score as a single line.
grid = sns.relplot(data=data, x='etest_p', y='salary', kind='line')
grid.set_axis_labels('Employability Test', 'Salary')
fig = plt.gcf()
fig.set_size_inches(12, 6)

#### Notes:
1. Students who have been placed have scored higher in their test, but not a lot of difference.
2. The test score has almost no effect on the salary

## Specialisation and MBA Marks (specialisation and mba_p)
#### Questions to ask:
1. How does MBA score vary with specialisation?
2. How does specialisation affect placement status?
3. How does specialisation affect Salary?
4. How does MBA score affect placement status?
5. How does MBA score affect salary?

In [None]:
# MBA percentage distribution per specialisation.
# The y-axis of a KDE is a density rather than a count.
specialisation_labels = ['Mkt&HR', 'Mkt&Fin']
for track in specialisation_labels:
    sns.kdeplot(data.loc[data['specialisation'] == track, 'mba_p'])
plt.legend(specialisation_labels)
plt.xlabel('MBA Percentage')
plt.ylabel('Density')
fig = plt.gcf()
fig.set_size_inches(10, 7)

In [None]:
# Horizontal box plot of MBA percentage for each specialisation.
# `x` and `y` are passed by keyword: positional data-column arguments were
# deprecated in seaborn 0.11 and are rejected by later releases.
sns.boxplot(x='mba_p', y='specialisation', data=data)
plt.xlabel('MBA Percentage')
plt.ylabel('Specialisation')
fig = plt.gcf()
fig.set_size_inches(10, 7)

In [None]:
# Placement outcome counts per MBA specialisation.
# `x` is passed by keyword: positional data-column arguments were deprecated in
# seaborn 0.11 and are rejected by later releases.
sns.countplot(x='specialisation', hue='status', data=data)
plt.xlabel('Specialisation')
plt.ylabel('Count')
fig = plt.gcf()
fig.set_size_inches(10, 7)

In [None]:
# Row-wise percentages: within each placement outcome, the share of students
# coming from each specialisation.
status_counts = pd.crosstab(data['status'], data['specialisation'])
status_counts.mul(100).div(status_counts.sum(axis=1), axis=0)

In [None]:
# Row-wise percentages: within each specialisation, the share of each
# placement outcome.
specialisation_counts = pd.crosstab(data['specialisation'], data['status'])
specialisation_counts.mul(100).div(specialisation_counts.sum(axis=1), axis=0)

In [None]:
# Salary distribution per specialisation.
# The y-axis of a KDE is a density rather than a count.
specialisation_labels = ['Mkt&HR', 'Mkt&Fin']
for track in specialisation_labels:
    sns.kdeplot(data.loc[data['specialisation'] == track, 'salary'])
plt.legend(specialisation_labels)
plt.xlabel('Salary (100K)')
plt.ylabel('Density')
fig = plt.gcf()
fig.set_size_inches(10, 7)

In [None]:
# MBA percentage distribution for placed vs. unplaced students.
# The y-axis of a KDE is a density rather than a count.
placement_labels = ['Placed', 'Not Placed']
for placement in placement_labels:
    sns.kdeplot(data.loc[data['status'] == placement, 'mba_p'])
plt.legend(placement_labels)
plt.xlabel('MBA Percentage')
plt.ylabel('Density')
fig = plt.gcf()
fig.set_size_inches(10, 7)

In [None]:
# Salary against MBA percentage, one line per specialisation.
grid = sns.relplot(data=data, x='mba_p', y='salary', hue='specialisation', kind='line')
grid.set_axis_labels('MBA Percentage', 'Salary')
fig = plt.gcf()
fig.set_size_inches(18, 6)

#### Notes:
1. Mkt&Fin students have scored slightly higher as compared to Mkt&HR
2. **Of all the students placed** a large number of them came from Marketing and Finance.
3. More students within Marketing and Finance specialisation were placed as compared to Marketing and HR
4. Clearly students with Mkt&Fin specialisation are placed with the highest salaries while generally the salary distribution is almost the same
5. Placement status does not vary much with MBA Score

## Data Preprocessing

In [None]:
# Remove the columns that are not used for classification:
#   - salary: not required for predicting placement status
#   - ssc_b / hsc_b: the boards were not an important factor for placement
unused_columns = ['salary', 'ssc_b', 'hsc_b']
data.drop(columns=unused_columns, inplace=True)
data.head()

In [None]:
# Replace each two-valued categorical column with 0/1 codes.
binary_codes = {
    'gender': {'M': 0, 'F': 1},
    'workex': {'No': 0, 'Yes': 1},
    'specialisation': {'Mkt&Fin': 0, 'Mkt&HR': 1},
    'status': {'Not Placed': 0, 'Placed': 1},
}
for column_name, codes in binary_codes.items():
    data[column_name] = data[column_name].map(codes)

In [None]:
# One-hot encode the multi-valued categoricals and append the indicator
# columns (named after the category values) to the frame.
for column in ['hsc_s', 'degree_t']:
    dummies = pd.get_dummies(data[column])
    for category in dummies.columns:
        data[category] = dummies[category]

In [None]:
# The original categorical columns are redundant once their dummies exist.
data.drop(columns=['hsc_s', 'degree_t'], inplace=True)

In [None]:
# Drop one indicator column per one-hot encoded feature ('Arts' from hsc_s and
# 'Others' from degree_t) to avoid the dummy variable trap.
reference_categories = ['Arts', 'Others']
data.drop(columns=reference_categories, inplace=True)
data.head()

In [None]:
# Separate the target (placement status) from the predictor columns.
y = data['status']
X = data.drop(columns='status')

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from X, then apply them to X.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the *unscaled* features first and learn the scaling statistics from the
# training rows only. Fitting the scaler on the full data set before splitting
# lets test-set statistics leak into the training features.
# The split itself (test_size, random_state) is unchanged.
# NOTE: the accuracies quoted in the results summary were obtained with
# full-data scaling and may shift slightly.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 3)

scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

## Building our Model

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Gaussian Naive Bayes: fit on the training split, score on the test split.
gnb = GaussianNB().fit(X_train, y_train)
y_pred = gnb.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Logistic regression with default settings.
lr = LogisticRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Decision tree; the seed is fixed so the result is repeatable.
dt = tree.DecisionTreeClassifier(random_state = 1).fit(X_train, y_train)
y_pred = dt.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# K-nearest-neighbours classifier with default settings.
knn = KNeighborsClassifier().fit(X_train, y_train)
y_pred = knn.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Random forest; the seed is fixed so the result is repeatable.
rf = RandomForestClassifier(random_state = 1).fit(X_train, y_train)
y_pred = rf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Support vector classifier. `probability=True` enables predict_proba, which the
# soft-voting ensemble needs. The probability calibration is fitted with an
# internal randomised cross-validation, so the seed is fixed to keep those
# probabilities reproducible across runs.
svc = SVC(probability = True, random_state = 1)
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
# Confusion matrix for the support vector classifier.
# Predictions are recomputed from `svc` instead of being read from `y_pred`,
# which every model cell overwrites, so the matrix cannot silently describe a
# different model when cells are re-run out of order.
print(confusion_matrix(y_test, svc.predict(X_test)))

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees; the seed is fixed so the result is repeatable.
xgb = XGBClassifier(random_state = 1)
y_pred = xgb.fit(X_train, y_train).predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble over six of the classifiers defined in earlier cells
# (the decision tree is not included).
base_models = [
    ('lr', lr),
    ('knn', knn),
    ('rf', rf),
    ('gnb', gnb),
    ('svc', svc),
    ('xgb', xgb),
]
voting_clf = VotingClassifier(estimators = base_models, voting = 'soft')
voting_clf.fit(X_train, y_train)
y_pred = voting_clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

### Our Results are as follows:
* Gaussian Naive Bayes: 85.2%
* Logistic Regression: 90.7%
* Decision Tree: 75.9%
* K Nearest Neighbor: 87.03%
* Random Forest: 85.1%
* Support Vector Classifier: 92.5%
* Xtreme Gradient Boosting: 85.1%
* Soft Voting Classifier - All Models: 88.8%

### Thanks for reading. This is my first notebook! Feel free to upvote if you liked it!