In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
import io
import matplotlib.pyplot as plt
# Silence every warning category for the rest of the session, then apply the
# notebook-wide seaborn theme (tick style, reversed inferno palette, larger fonts).
warnings.filterwarnings(action='ignore')
sns.set(
    style="ticks",
    palette="inferno_r",
    font_scale=1.2,
)

In [None]:
# Load the campus placement dataset from the Kaggle input directory.
campus_data = pd.read_csv(
    filepath_or_buffer='../input/factors-affecting-campus-placement/Placement_Data_Full_Class.csv'
)

In [None]:
# Preview the first five rows (five is also the default for head()).
campus_data.head(n=5)

In [None]:
# Print a concise summary of the raw frame: columns, non-null counts and dtypes.
campus_data.info()

1. There are 14 features. 
2. Status is the target variable that indicates if the student is placed or not.
3. There are 7 categorical variables (besides the target) and 7 numeric variables (including `sl_no`)

In [None]:
# (number of rows, number of columns) of the raw frame.
campus_data.shape

There are 215 rows and 15 columns

In [None]:
# Summary statistics of the raw frame.
campus_data.describe()

1. Average 10th grade percentage is 65% and the highest is 89.4%.
2. Average 12th grade percentage is 66% and the highest is 91%.
3. Aggregate degree percentage is 65% and the highest is 89.4%
4. Aggregate MBA percentage is 62% and the highest is 77.89%
5. Average salary an MBA student gets is 2.65 LPA 
6. Highest salary for an MBA student is 9.4 LPA 
7. Lowest is 2 LPA


In [None]:
# Number of missing values in each column (isna is the same check as isnull).
campus_data.isna().sum()

There are 67 missing values in salary. We'll check if there's any way to impute values.

In [None]:
# Histogram with a KDE overlay of the salary column, on an explicitly sized axes.
salary_fig, salary_ax = plt.subplots(figsize=(8, 6))
sns.distplot(campus_data['salary'], kde=True, color='green', ax=salary_ax)
plt.show()

We can see that most MBA graduates have CTC around 2LPA to 3.5 LPA. 

In [None]:
# Overlay the salary density of male ('M') and female ('F') students on one axes.
plt.figure(figsize=(8, 6))
for gender_code, line_color in (('M', 'blue'), ('F', 'red')):
    gender_mask = campus_data['gender'] == gender_code
    sns.kdeplot(campus_data.loc[gender_mask, 'salary'], color=line_color)
# Legend entries follow the drawing order above: male first, then female.
plt.legend(['Male', 'Female'])
plt.show()

According to this graph, women are paid comparatively less than men.

Let's see if there's any reason for missing values


In [None]:
# Non-null salary entries per placement status.
campus_data.groupby('status')['salary'].count()

We can see there is a reason for missing values. There are missing values only for observations where the status is 'Not Placed'

In [None]:
# Work on a copy so the raw frame (campus_data) keeps its original NaNs.
campus = campus_data.copy()
# A missing salary is recorded as 0. The filled column is assigned back to the
# frame instead of using chained indexing (campus['salary'].loc[...] = 0),
# which triggers SettingWithCopyWarning and does not write through to `campus`
# when pandas copy-on-write is active.
campus['salary'] = campus['salary'].fillna(0)
# Sanity check: number of nulls left in salary (expected 0).
campus['salary'].isnull().sum()

We'll impute null values in the salary feature with 0 because these students are not placed. We cannot remove these observations because there are 67 null values (out of 215 rows), and removing them would affect the data during modelling.

In [None]:
# Summary statistics of salary after the missing values were replaced by 0.
campus.loc[:, 'salary'].describe()

In [None]:
# Split the column labels by dtype: object columns vs. everything else.
cat = campus.select_dtypes(include=['object']).columns
cont = campus.select_dtypes(exclude=['object']).columns
print(cat, cont, sep='\n')

CATEGORICAL FEATURES-['gender', 'ssc_b', 'hsc_b', 'hsc_s', 'degree_t', 'workex', 'specialisation', 'status']

CONTINUOUS FEATURES- ['sl_no', 'ssc_p', 'hsc_p', 'degree_p', 'etest_p', 'mba_p', 'salary']

In [None]:
# Placed / Not Placed counts within each gender.
campus_data.groupby('gender')['status'].value_counts()

1. As we can see, there are total of 76 female students and 139 male students
2. 48 out of 76 female students are placed, i.e. the placement ratio for women is 63%
3. 100 out of 139 male students are placed, i.e. the placement ratio for men is 72%

In [None]:
# Placed / Not Placed counts within each 10th grade board (ssc_b).
campus_data.groupby('ssc_b')['status'].value_counts()

1. 78 out of 116 students from Central Secondary Boards are placed
2. 70 out of 99 students from Other Secondary Boards(State,ICSE) are placed

In [None]:
# Placed / Not Placed counts within each 12th grade board (hsc_b).
campus_data.groupby('hsc_b')['status'].value_counts()

1. 57 out of 84 12th grade Central board students are placed
2. 91 out of 131 12th grade Other board students are placed

In [None]:
# Placed / Not Placed counts within each 12th grade stream (hsc_s).
campus_data.groupby('hsc_s')['status'].value_counts()

1. 79 out of 113 students who took commerce in their 12th grade are placed
2. 63 out of 91 students who took science in their 12th grade are placed
3. 6 out of 11 students who took Arts in their 12th grade are placed (215 − 113 − 91 = 11 Arts students; 148 − 79 − 63 = 6 of them placed).


In [None]:
# Placed / Not Placed counts within each undergraduate degree type (degree_t).
campus_data.groupby('degree_t')['status'].value_counts()

1. UG Commerce students have 70% placement ratio
2. UG Science and Tech students have 69% placement ratio

In [None]:
# Placement counts (hue = status) for every categorical predictor, drawn on a
# 2x4 grid of axes.
fig, axes = plt.subplots(2, 4, figsize=(20, 10))
categorical_cols = ['gender', 'ssc_b', 'hsc_b', 'hsc_s',
                    'degree_t', 'workex', 'specialisation']
# `x` is passed by keyword: from seaborn 0.12 on, `data` is the only positional
# parameter, so countplot('gender', ..., data=campus) fails there. The keyword
# form works on older versions too.
# axes.flat walks the grid row by row, matching the original panel order.
for ax, col in zip(axes.flat, categorical_cols):
    sns.countplot(x=col, hue='status', data=campus, ax=ax)
# Seven plots on eight axes: remove the unused bottom-right panel.
fig.delaxes(axes[1, 3])
plt.show()



1.   Comparatively, male students have a better placement record than female students.
2.   Students from Other school boards have a 70% placement percentage, whereas Central Board students have a 67% placement percentage [10th grade].
3.   Students who studied under the Central board [67%] and Other boards [69%] in 12th grade have almost the same placement percentage, with a slight edge for Other Board students.
4.   12th grade Commerce students and 12th grade Science students have almost the same placement percentage [69%].
5.   UG Commerce students and UG Science students have almost the same placement percentage [69%]. In the graph it looks like Commerce students are placed more, but that is because there are more Commerce students.
6.   According to the counts, people with no work experience appear more likely to get placed. One possible inference is that students who took an academic break after their UG are less likely to get placed, though this is just a simple assumption. Note that the bars show raw counts, so this should be checked against the placement percentage within each group.
7.   Marketing and Finance students are more likely to be placed.





> # GENDER

In [None]:
# Head-count of students per gender on an 8x6 inch figure.
plt.figure(figsize=(8, 6))
sns.countplot(data=campus, x='gender')

There are 139 Men and 76 women


We'll now check how many men and women are placed

In [None]:
# Set the default size (width, height in inches) for figures created from here on.
import matplotlib as mpt
mpt.rcParams.update({'figure.figsize': (8.0, 6.0)})

In [None]:
# Students per gender, split by work experience.
# `x` is passed by keyword: from seaborn 0.12 on, `data` is the only positional
# parameter, so countplot('gender', ...) fails there.
sns.countplot(x='gender', hue='workex', data=campus)

Most of the students don't have work experience:
37% of men have work experience,
28% of women have work experience


In [None]:
# Work-experience (Yes / No) counts within each gender.
campus.groupby('gender')['workex'].value_counts()

In [None]:
# Students per MBA specialisation, split by gender.
# `x` is passed by keyword: from seaborn 0.12 on, `data` is the only positional
# parameter, so countplot('specialisation', ...) fails there.
sns.countplot(x='specialisation', hue='gender', data=campus)

In [None]:
# Gender counts within each MBA specialisation.
campus.groupby('specialisation')['gender'].value_counts()

1. There are more men in Marketing and Finance . 
2. Women to men ratio in Marketing and HR is better than in Marketing and Finance

In [None]:
# Placed / Not Placed counts for every (gender, specialisation) combination.
campus.groupby(['gender', 'specialisation'])['status'].value_counts()

In [None]:
# Students per gender, split by degree type, with one panel per placement status.
# `x` is passed by keyword: from seaborn 0.12 on, `data` is the only positional
# parameter, so catplot('gender', ...) fails there.
sns.catplot(x='gender', hue='degree_t', data=campus, col='status', kind='count')

Most of the students are from commerce background


In [None]:
# MBA specialisation counts within each 12th grade board.
campus.groupby('hsc_b')['specialisation'].value_counts()

In [None]:
# Count plot: 12th grade board on the x-axis, coloured by MBA specialisation.
sns.catplot(data=campus, kind='count', x='hsc_b', hue='specialisation')

12th grade Other Board students are more likely to choose Marketing and Finance in MBA

In [None]:
# Count plot: degree type on the x-axis, coloured by specialisation,
# with one panel per placement status.
sns.catplot(
    data=campus,
    kind='count',
    x='degree_t',
    hue='specialisation',
    col='status',
)

1. Most of the MBA students are from Commerce Background
2. And most of the Commerce students opt for the Marketing and Finance specialisation
3. Marketing and Finance has a better placement record compared to other specialisations.

In [None]:
# Count plot: degree type on the x-axis, coloured by work experience.
sns.catplot(data=campus, kind='count', x='degree_t', hue='workex')

Most of the commerce students don't have work experience

In [None]:
# Work-experience (Yes / No) counts within each MBA specialisation.
campus.groupby('specialisation')['workex'].value_counts()

In [None]:
# Count plot: MBA specialisation on the x-axis, coloured by work experience.
sns.catplot(data=campus, kind='count', x='specialisation', hue='workex')

Most of the marketing and HR students don't have work experience [75%]

In [None]:
# Scatter of 10th grade (ssc_p) against 12th grade (hsc_p) percentage,
# coloured by placement status.
sns.relplot(data=campus, x='ssc_p', y='hsc_p', hue='status')

Students who scored more than 60% in their 10th grade are more likely to get placed

In [None]:
# Scatter of degree (degree_p) against 12th grade (hsc_p) percentage,
# coloured by placement status.
sns.relplot(data=campus, x='degree_p', y='hsc_p', hue='status')

Students with UG Aggregate more than 60% and 12th grade percentage with more than 60% are more likely to get placed


In [None]:
# Pairwise correlation matrix of the numeric columns, without sl_no.
# DataFrame.corr() raises on object (string) columns in pandas >= 2.0, where
# numeric_only defaults to False, so the frame is restricted to numeric dtypes
# first. Dropping sl_no before corr() gives the same matrix as dropping its
# row and column afterwards, and avoids the two in-place drops.
campus_cor = (
    campus
    .select_dtypes(include='number')
    .drop(columns='sl_no')
    .corr()
)

In [None]:
# Display the correlation matrix.
campus_cor

In [None]:
# Column labels that remain in the correlation matrix.
campus_cor.columns

In [None]:
# Percentage and salary columns plus the status label, used as pair plot input.
pairplot_columns = ['ssc_p', 'hsc_p', 'degree_p', 'etest_p', 'mba_p', 'salary', 'status']
campus_pairplot = campus.loc[:, pairplot_columns]

In [None]:
# Pairwise scatter / distribution plots of the selected columns, coloured by
# placement status.
# pairplot is a figure-level function that builds its own figure, so the former
# plt.figure(figsize=(25,35)) call only produced an extra empty figure and did
# not size the grid; it has been removed. The returned grid is kept in `g`.
g = sns.pairplot(campus_pairplot, hue='status', palette='husl')



1.   From the first column of the plot we can infer that students who scored about 70% or more in their school board exam (`ssc_p`) are more likely to be placed.
2.   From the second column of the plot we can infer that students who scored about 70% or more in their high school board exam (`hsc_p`) are more likely to be placed.
3.   From the third column of the plot we can infer that students who scored about 65% or more in their degree exam (`degree_p`) are more likely to be placed.


In [None]:
# Swarm plot of 10th grade percentage per gender, coloured by placement status.
sns.catplot(data=campus, kind='swarm', x='gender', y='ssc_p', hue='status')

Female students who have less than 60% in 10th grade % are not placed

# Let's analyse Salary

In [None]:
# Split the imputed frame into placed and not-placed students.
campus_DF_placed = campus.loc[campus['status'].eq('Placed')]
campus_DF_not_placed = campus.loc[campus['status'].eq('Not Placed')]

In [None]:
# Shaded density curve of the salaries of placed students.
placed_fig, placed_ax = plt.subplots(figsize=(10, 5))
sns.kdeplot(
    campus_DF_placed['salary'],
    color='orange',
    shade=True,
    ax=placed_ax,
)
plt.show()

The lowest salary offered to a student is 2LPA. The highest salary is 9.4LPA . The average salary of the placed students is 2.8LPA. 

In [None]:
# Letter-value (boxen) plot of placed students' salary for each gender.
gender_fig, gender_ax = plt.subplots(figsize=(10, 5))
sns.boxenplot(data=campus_DF_placed, x='salary', y='gender',
              linewidth=2.2, ax=gender_ax)
plt.show()

In [None]:
# Salary summary statistics of placed students per gender
# (the column is selected before describe() instead of after).
campus_DF_placed.groupby('gender')['salary'].describe()

The male students have a higher average salary than female students. The middle 50% for the female students has a higher range when compared to male students. Highest salary offered to a female student is ₹6,50,000 whereas it is ₹9,40,000 for a male student. Male students have more outliers, and thus there are more males that have been offered higher salaries when compared to the majority of the students.

In [None]:
# Letter-value (boxen) plot of placed students' salary for each degree type.
degree_fig, degree_ax = plt.subplots(figsize=(10, 5))
sns.boxenplot(data=campus_DF_placed, x='salary', y='degree_t',
              linewidth=2.2, ax=degree_ax)
plt.show()

In [None]:
# Salary summary statistics of placed students per degree type
# (the column is selected before describe() instead of after).
campus_DF_placed.groupby('degree_t')['salary'].describe()

Science & Technology students have the highest average, in terms of income. However, Commerce & Management students were able to procure more highly paid jobs. 

In [None]:
# Letter-value (boxen) plot of placed students' salary for each MBA specialisation.
spec_fig, spec_ax = plt.subplots(figsize=(10, 5))
sns.boxenplot(data=campus_DF_placed, x='salary', y='specialisation',
              linewidth=2.2, ax=spec_ax)
plt.show()

In [None]:
# Salary summary statistics of placed students per MBA specialisation
# (the column is selected before describe() instead of after).
campus_DF_placed.groupby('specialisation')['salary'].describe()

Marketing & Finance students have a higher average salary. The maximum salary offered to a Marketing & Finance student (₹9,40,000) is significantly larger when compared to the maximum salary of a Marketing & HR student (₹4,50,000). There are more outliers in Marketing & Finance, which means that there are more opportunities for higher paid jobs in this specialisation.

## Future Developments
* A deeper analysis of all the features
* Predicting the placement of a student