### 1. Importing Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

In [36]:
df = pd.read_csv('dataset.csv')
df.head()

Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,etest_p,mba_p,specialisation,workex,status
0,M,73.976975,Central,77.392691,Others,Science,73.623151,Sci&Tech,84.49831,67.207006,Mkt&HR,Yes,Placed
1,M,62.0,Others,62.0,Others,Commerce,60.0,Comm&Mgmt,63.0,52.38,Mkt&HR,Yes,Placed
2,M,80.965242,Others,67.104275,Others,Science,72.482621,Sci&Tech,91.343588,71.392313,Mkt&Fin,No,Placed
3,M,52.0,Others,65.0,Others,Arts,57.0,Others,75.0,59.81,Mkt&Fin,Yes,Not Placed
4,F,69.0,Central,62.0,Central,Science,66.0,Sci&Tech,75.0,67.99,Mkt&HR,No,Not Placed


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 670 entries, 0 to 669
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   gender          670 non-null    object 
 1   ssc_p           670 non-null    float64
 2   ssc_b           670 non-null    object 
 3   hsc_p           670 non-null    float64
 4   hsc_b           670 non-null    object 
 5   hsc_s           670 non-null    object 
 6   degree_p        670 non-null    float64
 7   degree_t        670 non-null    object 
 8   etest_p         670 non-null    float64
 9   mba_p           670 non-null    float64
 10  specialisation  670 non-null    object 
 11  workex          670 non-null    object 
 12  status          670 non-null    object 
dtypes: float64(5), object(8)
memory usage: 68.2+ KB


In [37]:
df.corr()

Unnamed: 0,ssc_p,hsc_p,degree_p,etest_p,mba_p
ssc_p,1.0,0.553111,0.56378,0.314477,0.46591
hsc_p,0.553111,1.0,0.528373,0.240976,0.442922
degree_p,0.56378,0.528373,1.0,0.259491,0.473506
etest_p,0.314477,0.240976,0.259491,1.0,0.289706
mba_p,0.46591,0.442922,0.473506,0.289706,1.0


### 2. Missing Data Analysis

In [38]:
# Checking missing values in each variable
df.isnull().sum()

gender            0
ssc_p             0
ssc_b             0
hsc_p             0
hsc_b             0
hsc_s             0
degree_p          0
degree_t          0
etest_p           0
mba_p             0
specialisation    0
workex            0
status            0
dtype: int64

### 3. Descriptive Analysis
##### 3.1 Data Description

In [39]:
# Understanding the statistics of the data
df.describe()

Unnamed: 0,ssc_p,hsc_p,degree_p,etest_p,mba_p
count,670.0,670.0,670.0,670.0,670.0
mean,68.960017,65.203006,65.229034,73.740025,62.993108
std,10.182563,11.222577,7.157608,12.10184,5.126855
min,42.864579,37.106246,50.0,50.0,51.29
25%,61.619318,60.43142,60.0,64.199327,59.431996
50%,68.332563,64.75327,64.906329,72.677821,62.481656
75%,77.130019,71.390402,70.271632,83.955186,66.864202
max,89.4,97.7,91.0,97.938047,77.89


In [56]:
df.nunique()

gender              2
ssc_p             594
ssc_b               2
hsc_p             595
hsc_b               2
hsc_s               3
degree_p          579
degree_t            3
etest_p           593
mba_p             666
specialisation      2
workex              2
status              2
dtype: int64

In [46]:
# Understanding the distribution of categorical variables
print(df.gender.value_counts(normalize=True))
print(df.ssc_b.value_counts(normalize=True))
print(df.hsc_b.value_counts(normalize=True))
print(df.hsc_s.value_counts(normalize=True))
print(df.degree_t.value_counts(normalize=True))
print(df.workex.value_counts(normalize=True))
print(df.specialisation.value_counts(normalize=True))
print(df.status.value_counts(normalize=True))

M    0.708955
F    0.291045
Name: gender, dtype: float64
Central    0.568657
Others     0.431343
Name: ssc_b, dtype: float64
Others     0.644776
Central    0.355224
Name: hsc_b, dtype: float64
Science     0.656716
Commerce    0.234328
Arts        0.108955
Name: hsc_s, dtype: float64
Sci&Tech     0.383582
Comm&Mgmt    0.311940
Others       0.304478
Name: degree_t, dtype: float64
Yes    0.513433
No     0.486567
Name: workex, dtype: float64
Mkt&HR     0.5
Mkt&Fin    0.5
Name: specialisation, dtype: float64
Placed        0.655224
Not Placed    0.344776
Name: status, dtype: float64


In [54]:
# Understanding percentage of placed and unplaced students within gender category
df.groupby('gender')['status'].value_counts(normalize=True)

gender  status    
F       Placed        0.815385
        Not Placed    0.184615
M       Placed        0.589474
        Not Placed    0.410526
Name: status, dtype: float64

This shows that majority of the females students got placed, i.e., 58% of male students got placed whereas the percentage of placed females are is 81%, this shows that 'Gender' could be one of the key variables in determining the placement status

In [55]:
# Understanding percentage of placed and unplaced students within education board category
df.groupby('ssc_b')['status'].value_counts(normalize=True)

ssc_b    status    
Central  Placed        0.742782
         Not Placed    0.257218
Others   Placed        0.539792
         Not Placed    0.460208
Name: status, dtype: float64

### 3.2 Data Visualisation

In [7]:
columns = df.columns.tolist()
print(columns)

['gender', 'ssc_p', 'ssc_b', 'hsc_p', 'hsc_b', 'hsc_s', 'degree_p', 'degree_t', 'etest_p', 'mba_p', 'specialisation', 'workex', 'status']


In [53]:
px.pie(data_frame=df, names='gender', title='Gender Distribution')

In [9]:
px.pie(data_frame=df ,names='ssc_b', title='Distribution of Board in Senior Secondary Education')

In [10]:
px.pie(data_frame=df ,names='hsc_b', title='Distribution of Board in Higher Secondary Education')

In [11]:
px.pie(data_frame=df ,names='hsc_s', title='Distribution of Subject in Higher Secondary Education')

In [13]:
px.pie(data_frame=df ,names='degree_t', title='Distribution of Degree')

In [14]:
px.pie(data_frame=df ,names='specialisation', title='Distribution of Mba Specialization')

In [15]:
px.pie(data_frame=df ,names='workex', title='Distribution of Work Experience')

In [16]:
px.pie(data_frame=df ,names='status', title='Status')

In [35]:
gender_df = df.groupby('gender')['ssc_p','hsc_p'].mean().reset_index(drop=False)
gender_df


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



Unnamed: 0,gender,ssc_p,hsc_p
0,F,71.177262,68.900232
1,M,68.049779,63.685197


In [20]:
px.bar(gender_df, 'gender', 'ssc_p')

In [21]:
px.bar(gender_df, 'gender', 'hsc_p')

In [22]:
degree_df = df.groupby('degree_t')['ssc_p','hsc_p'].mean().reset_index(drop=False)
degree_df


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



Unnamed: 0,degree_t,ssc_p,hsc_p
0,Comm&Mgmt,65.029704,66.909229
1,Others,65.76195,61.073243
2,Sci&Tech,74.694806,67.093554


In [23]:
px.bar(degree_df, 'degree_t', 'ssc_p')

In [24]:
px.bar(degree_df, 'degree_t', 'hsc_p')

In [25]:
specialisation_df = df.groupby('specialisation')['ssc_p','hsc_p'].mean().reset_index(drop=False)
specialisation_df


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



Unnamed: 0,specialisation,ssc_p,hsc_p
0,Mkt&Fin,72.912102,69.302723
1,Mkt&HR,65.007932,61.103289


In [26]:
px.bar(specialisation_df, 'specialisation', 'ssc_p')

In [27]:
px.bar(specialisation_df, 'specialisation', 'hsc_p')

In [28]:
gender_df = df.groupby('gender')['etest_p','mba_p'].mean().reset_index(drop=False)
gender_df


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



Unnamed: 0,gender,etest_p,mba_p
0,F,70.87387,64.449709
1,M,74.916657,62.395134


In [29]:
px.bar(gender_df, 'gender', 'etest_p')

In [30]:
px.bar(gender_df, 'gender', 'mba_p')