In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file shipped with the notebook's input datasets,
# printing one full path per line.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# Any results you write to the current directory are saved as output.

In [None]:
# Read the placement dataset, using the `sl_no` column as the row index.
csv_path = '/kaggle/input/factors-affecting-campus-placement/Placement_Data_Full_Class.csv'
data = pd.read_csv(csv_path, index_col='sl_no')

# Preview the first rows as the cell output.
data.head()

In [None]:
# Print column dtypes and non-null counts to spot columns with missing values.
data.info()

* Salary has missing data values for students who have not been placed
* All other columns have no missing values

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Set the global seaborn look once (white grid, muted palette, fonts scaled by 1.1)
# so that every plot created afterwards shares it.
sns.set(style='whitegrid', palette='muted', font_scale=1.1)

# Heat Map


In [None]:
# Work on a copy so the raw `data` frame keeps its original string labels.
data2 = data.copy()
# Encode the placement outcome as 1 (placed) / 0 (not placed) so that it can
# take part in the correlation matrix.
data2['status'] = data2['status'].map({'Placed':1, 'Not Placed': 0}).astype(int)
plt.figure(figsize=(14,7))
plt.title('Heatmap')
# `salary` is excluded from the matrix. The remaining frame still contains
# string columns; pandas >= 2.0 raises on those in DataFrame.corr() instead of
# silently dropping them, so select the numeric columns explicitly.
numeric_features = data2.drop('salary', axis=1).select_dtypes(include='number')
sns.heatmap(data=numeric_features.corr(), annot=True)

* High correlation between status and (ssc_p, hsc_p, degree_p)
* Low correlation between status and (etest_p, mba_p)

# Relation between different marks


In [None]:
# Pairwise regression plots of the five percentage columns, coloured by
# placement status.
score_columns = ['ssc_p', 'hsc_p', 'degree_p', 'etest_p', 'mba_p']
sns.pairplot(data, vars=score_columns, hue='status', kind='reg')

* For the pairs (ssc_p vs. hsc_p), (ssc_p vs. degree_p) and (hsc_p vs. degree_p), it can be established that good performance on all three fronts relates to a higher chance of getting placed
* The other two columns do not point to any such correlation. The marks obtained in them seem to have no influence on the placement

# Gender

In [None]:
# Split the students by placement outcome.
data_placed=data.loc[data.status=="Placed"]
data_unplaced=data.loc[data.status=="Not Placed"]

# Placed head-count per gender. Males are obtained by subtraction, which
# assumes `gender` only takes the values "M" and "F".
temp_df=data_placed.loc[data_placed.gender=="F"]
Female_placed=temp_df.shape[0]
Male_placed=data_placed.shape[0]-Female_placed

# Unplaced head-count per gender (females by subtraction, same assumption).
temp_df=data_unplaced.loc[data_unplaced.gender=="M"]
Male_unplaced=temp_df.shape[0]
Female_unplaced=data_unplaced.shape[0]-Male_unplaced

# Denominators: students per gender and students overall.
t_male=(Male_placed+Male_unplaced)
t_female=(Female_unplaced+Female_placed)
t_student=t_male+t_female

# Summary table: raw count, share within the student's own gender, and share
# of the whole class, for each (gender, outcome) combination.
data_of_gender={"Particular": [" Males Placed","Female Placed","Male Unplaced","Female Unplaced"] ,
                "No. of Student" : [Male_placed,Female_placed,Male_unplaced,Female_unplaced],
                "Student per Gender":[Male_placed/t_male,Female_placed/t_female,Male_unplaced/t_male,Female_unplaced/t_female],
                "Student per student":[Male_placed/t_student,Female_placed/t_student,Male_unplaced/t_student,Female_unplaced/t_student]
               }
gender_df=pd.DataFrame(data_of_gender,columns=["Particular","No. of Student","Student per Gender","Student per student"])

# Create the figure explicitly: assigning to `plt.figsize` merely sets an
# unused attribute on the pyplot module and never changes the plot size.
plt.figure(figsize=(12,6))
sns.swarmplot(x=data["gender"],y=data["etest_p"],hue=data['status'])
plt.title("Placed students etest vs Gender")

In [None]:
# Bar chart of the within-gender share of each (gender, outcome) row of `gender_df`.
# Create the figure explicitly: assigning to `plt.figsize` merely sets an
# unused attribute on the pyplot module and never changes the plot size.
plt.figure(figsize=(12,6))
sns.barplot(x=gender_df["Particular"],y=gender_df["Student per Gender"])

In [None]:

# Salary against gender, one point per student.
sns.scatterplot(data=data, x='gender', y='salary')

* Males have a slightly higher probability of getting placed compared to females
* 71.9% males get placed compared to 63% female


# Senior Secondary

In [None]:
# Secondary-school percentage by placement status, coloured by school board.
plt.figure(figsize=(12, 6))
sns.swarmplot(data=data, x='status', y='ssc_p', hue='ssc_b')

In [None]:
# Mean of the 0/1 encoded `status` per secondary-school board.
data2.groupby('ssc_b', as_index=False)[['status']].mean()

* No major difference between the Central board and other boards w.r.t. placement probability
* On an average, more percentage score in ssc increases chances of placement
* ssc_p > 80 are always placed
* ssc_p < 50 are not placed

# High School

In [None]:
# High-school percentage by placement status, coloured by high-school board.
plt.figure(figsize=(12, 6))
sns.swarmplot(data=data, x='status', y='hsc_p', hue='hsc_b')
# Cell output: summary statistics of the students who were not placed.
data_unplaced.describe()

In [None]:
# High-school percentage by high-school stream, coloured by placement status.
plt.figure(figsize=(12, 6))
sns.swarmplot(data=data, x='hsc_s', y='hsc_p', hue='status')
# Cell output: summary statistics of the students who were not placed.
data_unplaced.describe()

In [None]:
# Restrict to students whose high-school board is "Others".
temp_df = data.loc[data['hsc_b'] == "Others"]
plt.figure(figsize=(12, 6))
# Salary against high-school percentage, coloured by gender.
sns.scatterplot(data=temp_df, x='hsc_p', y='salary', hue='gender')
# Cell output: summary statistics for this subset.
temp_df.describe()

In [None]:
# Restrict to students whose high-school board is "Central".
temp_df = data.loc[data['hsc_b'] == "Central"]
plt.figure(figsize=(12, 6))
# Salary against high-school percentage, coloured by gender.
sns.scatterplot(data=temp_df, x='hsc_p', y='salary', hue='gender')
# Cell output: summary statistics for this subset.
temp_df.describe()

In [None]:
# Salary against high-school percentage for all students, coloured by board.
plt.figure(figsize=(12, 6))
sns.scatterplot(data=data, x='hsc_p', y='salary', hue='hsc_b')

* The maximum high school percentage (hsc_p) in the Others board is greater than in the Central board
* The percentage of students placed is almost equal, at about 70%, irrespective of their high school board (hsc_b)
* Huge gender disparity in salary in Other board vs Not so much in Central board. 
* Clear from the visual data that lower % in high school(<55%) means no placement or very little chance of placement.
* Central board > 70% guarantees placement while in Other board >80% guarantees placement.
* Above >85% in science stream means guaranteed placement and <=55% means no placement.
* While >75% in Commerce guarantees placement
* Data too small for any comment on arts stream

# Bachelor's Degree

In [None]:
# Degree percentage by placement status, coloured by degree type.
plt.figure(figsize=(12, 6))
sns.swarmplot(data=data, x='status', y='degree_p', hue='degree_t')

In [None]:
# Mean of the 0/1 encoded `status` per degree type.
data2.groupby('degree_t', as_index=False)[['status']].mean()

* No major difference between Comm&Mgmt and Sci&Tech w.r.t. placement probability. Others has fewer placements, but there are very few data points, so we treat it as inconclusive. 
* On an average, more percentage score in bachelors degree increases chances of placement
* degree_p > 80 are always placed
* degree_p < 55 are not placed

In [None]:
# Number of students with / without work experience, split by placement status.
plt.figure(figsize=(12, 6))
sns.countplot(x=data["workex"], hue=data["status"])

In [None]:
# Salary by work experience, coloured by MBA specialisation.
plt.figure(figsize=(12, 6))
sns.swarmplot(data=data, x='workex', y='salary', hue='specialisation')

* Workex vs Salary has no relation 
* If student has workex, chances of placement are very high.
* Surprisingly, students who do not have workex still have a 60% chance of placement.

# Employability Test

In [None]:
# Employability-test percentage by placement status.
plt.figure(figsize=(12, 6))
sns.swarmplot(data=data, x='status', y='etest_p')

In [None]:
# Scatter plus fitted regression line: salary on x, employability-test
# percentage on y.
plt.figure(figsize=(12, 6))
sns.regplot(data=data, x='salary', y='etest_p')

* employability test score does not impact placement.
* Salary seems to increase with increase in employability test score.

# MBA

In [None]:
# MBA percentage by placement status.
plt.figure(figsize=(12, 6))
sns.swarmplot(data=data, x='status', y='mba_p')


In [None]:
# Mean of the 0/1 encoded `status` per MBA specialisation.
data2.groupby('specialisation', as_index=False)[['status']].mean()

In [None]:
# Median salary per MBA specialisation.
data2.groupby('specialisation', as_index=False)[['salary']].median()


* No conclusive relation between mba_p and placement
* Mkt&Fin professionals have a higher chance of getting a placement than Mkt&HR. In fact, 79% of Mkt&Fin get placed compared to just 55% of Mkt&HR
* Mkt&Fin have a slightly higher median salary than Mkt&HR

In [None]:
# Point plot of the 0/1 encoded `status` per degree type, split by
# specialisation (hue) and work experience (one column per value).
# Only the 'Sci&Tech' and 'Comm&Mgmt' degree types are kept.
# `x` and `y` are passed by keyword: seaborn 0.11 deprecated positional
# x/y for catplot and 0.12 made them keyword-only.
sns.catplot(x='degree_t', y='status', hue='specialisation', col='workex',
            data=data2.loc[(data2['degree_t'] == 'Sci&Tech') | (data2['degree_t'] == 'Comm&Mgmt')],
            kind='point')

* Mkt&Fin has better placements
* Students with workex have better placements
* Mkt&Fin coupled with Comm&Mgmt degree and prior work experience has almost 100% placement probability
* For others, the data points are too few, so we cannot really analyze anything from them.

# Model Building and Hosting
Please follow the link to the model building notebook
https://www.kaggle.com/aakashg1999/placement-prediction-model-building

Please use the link below to reach the web app where the model is hosted
https://intense-castle-03852.herokuapp.com/