# Campus Recruitment

# Importing Essentials

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score,confusion_matrix

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory,
# one per line, in os.walk traversal order.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Meeting the data

In [None]:
# Load the placement CSV from the Kaggle input directory into `df`,
# then show its (rows, columns) shape as the cell output.
DATA_PATH = '/kaggle/input/factors-affecting-campus-placement/Placement_Data_Full_Class.csv'
df = pd.read_csv(DATA_PATH)
df.shape

In [None]:
# Preview the first five rows (the default for head()).
df.head(n=5)

In [None]:
# Print column names, non-null counts and dtypes to spot missing values.
df.info()

The salary column contains null values, mostly for students whose status is "Not Placed".

In [None]:
# Summary statistics, transposed so each described column becomes a row.
df.describe().transpose()

# Exploratory Data Analysis

# Looking for relationships

In [None]:
# Bar chart of each numeric column's correlation with salary, sorted ascending.
plt.figure(figsize=(10, 6))
(
    # Restrict to numeric columns first: DataFrame.corr() raises on string
    # columns in pandas >= 2.0 instead of silently skipping them.
    df.select_dtypes(include='number')
    .corr()['salary']
    # Remove salary's self-correlation by label rather than by position, so
    # the result does not depend on salary being the last column.
    .drop('salary')
    .sort_values()
    .plot.bar()
)

The higher the MBA and etest percentages, the higher the salary.

In [None]:
# Scatter plot with a fitted regression line: MBA percentage vs salary.
# x/y are passed by keyword because seaborn >= 0.12 no longer accepts them
# positionally.
sns.lmplot(x='mba_p', y='salary', data=df)

In [None]:
# Scatter plot with a fitted regression line: etest percentage vs salary.
# x/y are passed by keyword because seaborn >= 0.12 no longer accepts them
# positionally.
sns.lmplot(x='etest_p', y='salary', data=df)

# Heatmap

In [None]:
# Annotated heatmap of pairwise correlations between the numeric columns.
plt.figure(figsize=(10, 6))
# Select numeric columns explicitly: DataFrame.corr() raises on string
# columns in pandas >= 2.0 instead of silently skipping them.
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True, cmap='viridis')

## Placed vs Unplaced

In [None]:
# Count of rows per placement status.
# The column is named via x= because seaborn >= 0.12 treats the first
# positional argument as `data`, not as the x variable.
sns.countplot(x='status', data=df)

## Distribution of salary

In [None]:
# Histogram of salary with a KDE overlay on a density scale.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is the
# documented replacement. Rows without a salary are dropped explicitly.
sns.histplot(df['salary'].dropna(), kde=True, stat='density')

Most salaries lie between 2,00,000 and 4,00,000. Anything above that can be considered an outlier.

# Male vs Female

In [None]:
# Count of rows per gender.
# The column is named via x= because seaborn >= 0.12 treats the first
# positional argument as `data`, not as the x variable.
sns.countplot(x='gender', data=df)

In [None]:
# Gender counts split by placement status.
# The column is named via x= because seaborn >= 0.12 treats the first
# positional argument as `data`, not as the x variable.
sns.countplot(x='gender', hue='status', data=df)

In [None]:
# Gender counts split by work experience.
# The column is named via x= because seaborn >= 0.12 treats the first
# positional argument as `data`, not as the x variable.
sns.countplot(x='gender', hue='workex', data=df)

# SSC

## Distribution

In [None]:
# Histogram of SSC percentage using 50 bins.
df['ssc_p'].plot(kind='hist', bins=50)

Most percentages lie between 60 and 70.

## What about the SSC Board?

In [None]:
# 2x2 grid of SSC-board counts: overall, then split by gender, work
# experience and placement status (filled row by row).
fig, ax = plt.subplots(2, 2, figsize=(16, 8))
# x/hue are passed by keyword: seaborn >= 0.12 treats the first positional
# argument as `data`, not as the x variable.
for panel, hue_col in zip(ax.flat, [None, 'gender', 'workex', 'status']):
    sns.countplot(x='ssc_b', hue=hue_col, data=df, ax=panel)

* More people opt for central board.
* Both Central and Others have higher number of male students.
* Central board students have more work experience.
* Central board has more students placed.

In [None]:
# Salary against SSC percentage, one line per SSC board.
plt.figure(figsize=(12, 5))
# x/y are passed by keyword because seaborn >= 0.12 no longer accepts them
# positionally.
sns.lineplot(x='ssc_p', y='salary', hue='ssc_b', data=df)

Other boards have a more or less normal distribution, whereas the Central board shows sudden peaks at 60 and 70.

# HSC

## Distribution

In [None]:
# Histogram of HSC percentage using 60 bins.
df['hsc_p'].plot(kind='hist', bins=60)

The distribution is similar to that of SSC: most percentages lie between 60 and 70.

## What about the HSC Board?

In [None]:
# 2x2 grid of HSC-board counts: overall, then split by gender, work
# experience and placement status (filled row by row).
fig, ax = plt.subplots(2, 2, figsize=(16, 8))
# x/hue are passed by keyword: seaborn >= 0.12 treats the first positional
# argument as `data`, not as the x variable.
for panel, hue_col in zip(ax.flat, [None, 'gender', 'workex', 'status']):
    sns.countplot(x='hsc_b', hue=hue_col, data=df, ax=panel)

* More people opt for other board.
* Both Central and Others have higher number of male students.
* Other board students have more work experience.
* Other board students have more students placed.

## HSC Stream

In [None]:
# Row of three HSC-stream count plots: overall, split by HSC board, and
# split by placement status.
fig, ax = plt.subplots(1, 3, figsize=(16, 5))
# x/hue are passed by keyword: seaborn >= 0.12 treats the first positional
# argument as `data`, not as the x variable.
for panel, hue_col in zip(ax, [None, 'hsc_b', 'status']):
    sns.countplot(x='hsc_s', hue=hue_col, data=df, ax=panel)

Most students opt for Commerce, followed by Science and then Arts.

In [None]:
# Salary against HSC percentage, one line per HSC board.
plt.figure(figsize=(12, 5))
# x/y are passed by keyword because seaborn >= 0.12 no longer accepts them
# positionally.
sns.lineplot(x='hsc_p', y='salary', hue='hsc_b', data=df)

# Degree

## Distribution 

In [None]:
# Histogram of degree percentage using 60 bins.
df['degree_p'].plot(kind='hist', bins=60)

Most percentages lie between 65 and 70.

## Stream

In [None]:
# 2x2 grid of degree-type counts: overall, then split by gender, work
# experience and placement status (filled row by row).
fig, ax = plt.subplots(2, 2, figsize=(16, 8))
# x/hue are passed by keyword: seaborn >= 0.12 treats the first positional
# argument as `data`, not as the x variable.
for panel, hue_col in zip(ax.flat, [None, 'gender', 'workex', 'status']):
    sns.countplot(x='degree_t', hue=hue_col, data=df, ax=panel)

* More people opt for Commerce and Management.
* Others have higher number of female students.

In [None]:
# Salary against degree percentage, one line per degree type.
plt.figure(figsize=(12, 5))
# x/y are passed by keyword because seaborn >= 0.12 no longer accepts them
# positionally.
sns.lineplot(x='degree_p', y='salary', hue='degree_t', data=df)

# Etest

## Distribution

In [None]:
# Histogram of etest percentage using 30 bins.
df['etest_p'].plot(kind='hist', bins=30)

In [None]:
# Salary against etest percentage, one line per work-experience group.
plt.figure(figsize=(12, 5))
# x/y are passed by keyword because seaborn >= 0.12 no longer accepts them
# positionally.
sns.lineplot(x='etest_p', y='salary', hue='workex', data=df)

It's apparent that people with work experience have better salaries.

# MBA

## Distribution

In [None]:
# Histogram of MBA percentage using 60 bins.
df['mba_p'].plot(kind='hist', bins=60)

## MBA Specialisation

In [None]:
# 2x2 grid of MBA-specialisation counts: overall, then split by gender,
# work experience and placement status (filled row by row).
fig, ax = plt.subplots(2, 2, figsize=(16, 8))
# x/hue are passed by keyword: seaborn >= 0.12 treats the first positional
# argument as `data`, not as the x variable.
for panel, hue_col in zip(ax.flat, [None, 'gender', 'workex', 'status']):
    sns.countplot(x='specialisation', hue=hue_col, data=df, ax=panel)

* More people opt for marketing and finance.
* Both have more male participants.
* Mkt and HR participants have very low work experience.
* Mkt and Fin have more participants placed.

In [None]:
# Salary against MBA percentage, one line per work-experience group.
plt.figure(figsize=(12, 5))
# x/y are passed by keyword because seaborn >= 0.12 no longer accepts them
# positionally.
sns.lineplot(x='mba_p', y='salary', hue='workex', data=df)

# Feature Engineering

In [None]:
# Remove, in place, the columns that are not used as model features.
# NOTE(review): salary is presumably dropped because it is only known for
# placed students and would leak the target - confirm.
unused_columns = ['sl_no', 'ssc_b', 'hsc_b', 'salary']
df.drop(columns=unused_columns, inplace=True)

## Encoding strings as integers

In [None]:
# Replace each categorical string column with integer codes.
# Series.map turns any value missing from its mapping into NaN.
category_codes = {
    'gender': {'M': 1, 'F': 0},
    'hsc_s': {'Commerce': 1, 'Science': 0, 'Arts': 2},
    'degree_t': {'Sci&Tech': 0, 'Comm&Mgmt': 1, 'Others': 2},
    'workex': {'Yes': 1, 'No': 0},
    'specialisation': {'Mkt&HR': 0, 'Mkt&Fin': 1},
    'status': {'Placed': 1, 'Not Placed': 0},
}
for column, codes in category_codes.items():
    df[column] = df[column].map(codes)

## Dummy variables

In [None]:
# One-hot encode the two three-level columns; drop_first=True removes the
# first level of each so the remaining indicators are not redundant.
multi_level_columns = ['hsc_s', 'degree_t']
df = pd.get_dummies(df, columns=multi_level_columns, drop_first=True)

In [None]:
# Target is the encoded placement status; every other column is a feature.
y = df['status']
X = df.drop(columns='status')

## Scaling the data

In [None]:
scaler=StandardScaler()
X=scaler.fit_transform(X)

## Train test split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Logistic Regression

In [None]:
# Fit logistic regression with default settings, then report hold-out
# accuracy and the confusion matrix.
regressor = LogisticRegression().fit(X_train, y_train)
prediction = regressor.predict(X_test)
acc_lr = accuracy_score(y_true=y_test, y_pred=prediction)
print('Accuracy score is {}'.format(acc_lr))
print(confusion_matrix(y_true=y_test, y_pred=prediction))

# KNeighborsClassifier

In [None]:
# Fit k-nearest-neighbours with default settings, then report hold-out
# accuracy and the confusion matrix.
classifier_knn = KNeighborsClassifier().fit(X_train, y_train)
prediction = classifier_knn.predict(X_test)
acc_knn = accuracy_score(y_true=y_test, y_pred=prediction)
print('Accuracy score is {}'.format(acc_knn))
print(confusion_matrix(y_true=y_test, y_pred=prediction))

# DecisionTreeClassifier

In [None]:
# Fit a decision tree, then report hold-out accuracy and the confusion matrix.
# random_state is fixed (same seed as the train/test split) so the tree, and
# therefore the score in the comparison table, is the same on every run.
classifier_dtr=DecisionTreeClassifier(random_state=42)
classifier_dtr.fit(X_train,y_train)
prediction=classifier_dtr.predict(X_test)
acc_dtr=accuracy_score(y_test,prediction)
print('Accuracy score is {}'.format(acc_dtr))
print(confusion_matrix(y_test,prediction))

# RandomForestClassifier

In [None]:
# Fit a random forest, then report hold-out accuracy and the confusion matrix.
# random_state is fixed (same seed as the train/test split) so the forest,
# and therefore the score in the comparison table, is the same on every run.
classifier_rfc=RandomForestClassifier(random_state=42)
classifier_rfc.fit(X_train,y_train)
prediction=classifier_rfc.predict(X_test)
acc_rfc=accuracy_score(y_test,prediction)
print('Accuracy score is {}'.format(acc_rfc))
print(confusion_matrix(y_test,prediction))

# XGBClassifier

In [None]:
# Fit gradient-boosted trees (XGBoost) with default settings, then report
# hold-out accuracy and the confusion matrix.
classifier_xgbc = XGBClassifier()
classifier_xgbc.fit(X_train, y_train)
prediction = classifier_xgbc.predict(X_test)
acc_xgbc = accuracy_score(y_true=y_test, y_pred=prediction)
print('Accuracy score is {}'.format(acc_xgbc))
print(confusion_matrix(y_true=y_test, y_pred=prediction))

# GaussianNB

In [None]:
# Fit Gaussian naive Bayes with default settings, then report hold-out
# accuracy and the confusion matrix.
classifier_gnb = GaussianNB().fit(X_train, y_train)
prediction = classifier_gnb.predict(X_test)
acc_gnb = accuracy_score(y_true=y_test, y_pred=prediction)
print('Accuracy score is {}'.format(acc_gnb))
print(confusion_matrix(y_true=y_test, y_pred=prediction))

# Support Vector Machine

In [None]:
# Fit a support-vector classifier with probability estimates enabled, then
# report hold-out accuracy and the confusion matrix.
classifier_svc = SVC(probability=True).fit(X_train, y_train)
prediction = classifier_svc.predict(X_test)
acc_svc = accuracy_score(y_true=y_test, y_pred=prediction)
print('Accuracy score is {}'.format(acc_svc))
print(confusion_matrix(y_true=y_test, y_pred=prediction))

# Sorting models by accuracy score

In [None]:
# Collect every model's hold-out accuracy in one table and show it with the
# best score first. Keeping each name next to its score in a single mapping
# stops the two lists from drifting out of step.
scores_by_model = {
    'LogisticRegression': acc_lr,
    'KNeighborsClassifier': acc_knn,
    'RandomForestClassifier': acc_rfc,
    'DecisionTreeClassifier': acc_dtr,
    'SVM': acc_svc,
    'GaussianNB': acc_gnb,
    'XGBClassifier': acc_xgbc,
}
model = pd.DataFrame({
    'Model': list(scores_by_model.keys()),
    'Score': list(scores_by_model.values()),
})
model.sort_values('Score', ascending=False)

# Conclusion

* There are more male participants.
* More people opt for Central board for SSC and Others for HSC
* More people choose Commerce and Management stream for degree
* More people choose Mkt&Fin stream for MBA
* Work experience tends to land you a better salary.
* MBA percentage and etest percentage affect salary.


# Like my work??

#### Please do upvote and leave a comment if you have any suggestions