---
<h1 align='center'>Study of Early Student Attrition</h1>

---

<b>Goals</b>

1. <b>Identify key drivers of early student attrition</b>
2. <b>Build a predictive model to identify students with higher early attrition risk</b>
3. <b>Recommend appropriate interventions based on the analysis</b>

In [1]:
#importing necessary libraries 
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as pex
%matplotlib inline

In [2]:
# Load the "student" sheet of the Excel workbook into a DataFrame
data = pd.read_excel(
    io="student.xlsx",
    sheet_name="student",
)

---

<b>Data Wrangling</b>
---

In [3]:
# Preview the first five rows of the dataset
data.head(n=5)

Unnamed: 0,STUDENT IDENTIFIER,STDNT_AGE,STDNT_GENDER,STDNT_BACKGROUND,IN_STATE_FLAG,INTERNATIONAL_STS,STDNT_MAJOR,STDNT_MINOR,STDNT_TEST_ENTRANCE1,STDNT_TEST_ENTRANCE2,...,DEGREE_GROUP_CD,DEGREE_GROUP_DESC,FIRST_TERM_ATTEMPT_HRS,FIRST_TERM_EARNED_HRS,SECOND_TERM_ATTEMPT_HRS,SECOND_TERM_EARNED_HRS,GROSS_FIN_NEED,COST_OF_ATTEND,EST_FAM_CONTRIBUTION,UNMET_NEED
0,7808615,18,F,BGD 1,Y,N,Undeclared,N,,1150.0,...,B,Bachelors,16,16,14.0,14.0,0,0,0,0.0
1,7830063,19,F,BGD 1,N,N,Undeclared,N,26.0,,...,B,Bachelors,18,18,18.0,18.0,570000,1355760,785760,459300.0
2,7847538,18,M,BGD 1,Y,N,Mathematics,N,,1020.0,...,B,Bachelors,15,15,14.0,14.0,0,0,0,0.0
3,8006429,18,M,BGD 1,Y,N,Undeclared,N,,1210.0,...,B,Bachelors,13,13,14.0,14.0,0,0,0,0.0
4,7962680,18,F,BGD 1,Y,N,Art,N,,1200.0,...,B,Bachelors,12,12,12.0,12.0,835920,1355760,519840,278340.0


In [4]:
# Preview the last five rows of the dataset
data.tail(n=5)

Unnamed: 0,STUDENT IDENTIFIER,STDNT_AGE,STDNT_GENDER,STDNT_BACKGROUND,IN_STATE_FLAG,INTERNATIONAL_STS,STDNT_MAJOR,STDNT_MINOR,STDNT_TEST_ENTRANCE1,STDNT_TEST_ENTRANCE2,...,DEGREE_GROUP_CD,DEGREE_GROUP_DESC,FIRST_TERM_ATTEMPT_HRS,FIRST_TERM_EARNED_HRS,SECOND_TERM_ATTEMPT_HRS,SECOND_TERM_EARNED_HRS,GROSS_FIN_NEED,COST_OF_ATTEND,EST_FAM_CONTRIBUTION,UNMET_NEED
3395,7948112,18,F,BGD 4,N,N,Music Performance,N,,800.0,...,B,Bachelors,18,18,16.0,13.0,0,0,0,0.0
3396,8023055,18,F,BGD 1,Y,N,Biology,N,20.0,,...,B,Bachelors,13,9,,,0,0,0,0.0
3397,7926915,18,F,BGD 4,Y,N,Biology,N,,1020.0,...,B,Bachelors,14,6,,,0,945840,1288980,0.0
3398,7877332,18,F,BGD 3,Y,N,Joint Enrollment - Accel,N,21.0,,...,B,Bachelors,12,9,,,0,269100,0,-231720.0
3399,7928405,18,M,BGD 2,Y,N,Art,Creative Writing,,890.0,...,B,Bachelors,12,9,13.0,6.0,0,0,0,0.0


In [5]:
# Dimensions of the dataset as a (rows, columns) tuple
data.shape

(3400, 56)

In [6]:
# Summary of the frame: column names, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3400 entries, 0 to 3399
Data columns (total 56 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   STUDENT IDENTIFIER        3400 non-null   int64  
 1   STDNT_AGE                 3400 non-null   int64  
 2   STDNT_GENDER              3400 non-null   object 
 3   STDNT_BACKGROUND          3400 non-null   object 
 4   IN_STATE_FLAG             3400 non-null   object 
 5   INTERNATIONAL_STS         3400 non-null   object 
 6   STDNT_MAJOR               3400 non-null   object 
 7   STDNT_MINOR               3400 non-null   object 
 8   STDNT_TEST_ENTRANCE1      1106 non-null   float64
 9   STDNT_TEST_ENTRANCE2      2492 non-null   float64
 10  STDNT_TEST_ENTRANCE_COMB  2882 non-null   float64
 11  FIRST_TERM                3400 non-null   int64  
 12  CORE_COURSE_NAME_1_F      3400 non-null   object 
 13  CORE_COURSE_GRADE_1_F     3400 non-null   object 
 14  CORE_COU

In [7]:
# Number of missing values in each column
data.isna().sum()

STUDENT IDENTIFIER             0
STDNT_AGE                      0
STDNT_GENDER                   0
STDNT_BACKGROUND               0
IN_STATE_FLAG                  0
INTERNATIONAL_STS              0
STDNT_MAJOR                    0
STDNT_MINOR                    0
STDNT_TEST_ENTRANCE1        2294
STDNT_TEST_ENTRANCE2         908
STDNT_TEST_ENTRANCE_COMB     518
FIRST_TERM                     0
CORE_COURSE_NAME_1_F           0
CORE_COURSE_GRADE_1_F          0
CORE_COURSE_NAME_2_F          99
CORE_COURSE_GRADE_2_F         99
CORE_COURSE_NAME_3_F         565
CORE_COURSE_GRADE_3_F        565
CORE_COURSE_NAME_4_F        1597
CORE_COURSE_GRADE_4_F       1597
CORE_COURSE_NAME_5_F        2755
CORE_COURSE_GRADE_5_F       2755
CORE_COURSE_NAME_6_F        3272
CORE_COURSE_GRADE_6_F       3272
SECOND_TERM                    0
CORE_COURSE_NAME_1_S         157
CORE_COURSE_GRADE_1_S        232
CORE_COURSE_NAME_2_S         439
CORE_COURSE_GRADE_2_S        439
CORE_COURSE_NAME_3_S        1038
CORE_COURS

In [8]:
# Percentage of missing values per column.
# isna() yields booleans, so the column-wise mean is the fraction of rows that
# are missing; multiplying by 100 turns it into a percentage.
# The Series is left as the cell's last expression so the notebook renders it
# as the cell output instead of routing it through print().
data.isna().mean() * 100

STUDENT IDENTIFIER           0.000000
STDNT_AGE                    0.000000
STDNT_GENDER                 0.000000
STDNT_BACKGROUND             0.000000
IN_STATE_FLAG                0.000000
INTERNATIONAL_STS            0.000000
STDNT_MAJOR                  0.000000
STDNT_MINOR                  0.000000
STDNT_TEST_ENTRANCE1        67.470588
STDNT_TEST_ENTRANCE2        26.705882
STDNT_TEST_ENTRANCE_COMB    15.235294
FIRST_TERM                   0.000000
CORE_COURSE_NAME_1_F         0.000000
CORE_COURSE_GRADE_1_F        0.000000
CORE_COURSE_NAME_2_F         2.911765
CORE_COURSE_GRADE_2_F        2.911765
CORE_COURSE_NAME_3_F        16.617647
CORE_COURSE_GRADE_3_F       16.617647
CORE_COURSE_NAME_4_F        46.970588
CORE_COURSE_GRADE_4_F       46.970588
CORE_COURSE_NAME_5_F        81.029412
CORE_COURSE_GRADE_5_F       81.029412
CORE_COURSE_NAME_6_F        96.235294
CORE_COURSE_GRADE_6_F       96.235294
SECOND_TERM                  0.000000
CORE_COURSE_NAME_1_S         4.617647
CORE_COURSE_

---
<b> Data Dictionary </b>

STUDENT IDENTIFIER	Student Identifier
STDNT_AGE	Age of the Student Enrolled
STDNT_GENDER	Gender of the student
STDNT_BACKGROUND	Background of Student
IN_STATE_FLAG	Indicator of whether Student is in same state as university
INTERNATIONAL_STS	Indicator of whether Student is an International Student
STDNT_MAJOR	Student's Major course in University
STDNT_MINOR	Student's Minor course in University
STDNT_TEST_ENTRANCE1	Student's Entrance 1 score
STDNT_TEST_ENTRANCE2	Student's Entrance 2 score
STDNT_TEST_ENTRANCE_COMB	Student's combined score, calculated from both the Entrance 1 and Entrance 2 scores
FIRST_TERM	First semester year
CORE_COURSE_NAME_1_F	Core course 1 opted in First semester
CORE_COURSE_GRADE_1_F	Grade in Core course 1 opted in First semester
CORE_COURSE_NAME_2_F	Core course 2 opted in First semester
CORE_COURSE_GRADE_2_F	Grade in Core course 2 opted in First semester
CORE_COURSE_NAME_3_F	Core course 3 opted in First semester
CORE_COURSE_GRADE_3_F	Grade in Core course 3 opted in First semester
CORE_COURSE_NAME_4_F	Core course 4 opted in First semester
CORE_COURSE_GRADE_4_F	Grade in Core course 4 opted in First semester
CORE_COURSE_NAME_5_F	Core course 5 opted in First semester
CORE_COURSE_GRADE_5_F	Grade in Core course 5 opted in First semester
CORE_COURSE_NAME_6_F	Core course 6 opted in First semester
CORE_COURSE_GRADE_6_F	Grade in Core course 6 opted in First semester
SECOND_TERM	Second semester year
CORE_COURSE_NAME_1_S	Core course 1 opted in Second semester
CORE_COURSE_GRADE_1_S	Grade in Core course 1 opted in Second semester
CORE_COURSE_NAME_2_S	Core course 2 opted in Second semester
CORE_COURSE_GRADE_2_S	Grade in Core course 2 opted in Second semester
CORE_COURSE_NAME_3_S	Core course 3 opted in Second semester
CORE_COURSE_GRADE_3_S	Grade in Core course 3 opted in Second semester
CORE_COURSE_NAME_4_S	Core course 4 opted in Second semester
CORE_COURSE_GRADE_4_S	Grade in Core course 4 opted in Second semester
CORE_COURSE_NAME_5_S	Core course 5 opted in Second semester
CORE_COURSE_GRADE_5_S	Grade in Core course 5 opted in Second semester
CORE_COURSE_NAME_6_S	Core course 6 opted in Second semester
CORE_COURSE_GRADE_6_S	Grade in Core course 6 opted in Second semester
HOUSING_STS	Indicator of whether the student is staying on campus or off campus
RETURNED_2ND_YR	Indicates whether the student returned for the first semester of the 2nd year
DISTANCE_FROM_HOME	Distance from the university to student's home
HIGH_SCHL_GPA	Student's High School GPA score
HIGH_SCHL_NAME	High School from where the student graduated
FATHER_HI_EDU_CD	Father's educational status code
FATHER_HI_EDU_DESC	Father's educational status
MOTHER_HI_EDU_CD	Mother's educational status code
MOTHER_HI_EDU_DESC	Mother's educational status
DEGREE_GROUP_CD	Degree code for which student has enrolled in university
DEGREE_GROUP_DESC	Degree for which student has enrolled in university
FIRST_TERM_ATTEMPT_HRS	Number of hours (or grade points) attempted by the student in the first semester
FIRST_TERM_EARNED_HRS	Number of hours (or grade points) earned by the student in the first semester
SECOND_TERM_ATTEMPT_HRS	Number of hours (or grade points) attempted by the student in the second semester
SECOND_TERM_EARNED_HRS	Number of hours (or grade points) earned by the student in the second semester
GROSS_FIN_NEED	Financial need of Student
COST_OF_ATTEND	Course Fees
EST_FAM_CONTRIBUTION	Estimated Family contribution towards course fees
UNMET_NEED	Unmet financial need of the student

In [9]:
# Distinct ages that occur among the students
pd.unique(data['STDNT_AGE'])

array([18, 19, 17, 20, 16, 21, 23, 25, 22, 26, 24], dtype=int64)

In [None]:
# Interactive histogram showing how student ages are distributed
fig1 = pex.histogram(data_frame=data, x='STDNT_AGE')
fig1.show()

In [None]:
# Distinct background categories that occur among the students
pd.unique(data['STDNT_BACKGROUND'])

In [None]:
# Find the dominating student background: bar chart of the number of
# students in each STDNT_BACKGROUND category.
plt.figure(figsize=(5,5))

# The column is selected with data=/x= keywords: recent seaborn releases
# (0.12+) treat the only positional argument as `data`, so a Series passed
# positionally is no longer interpreted as the x variable.
sns.countplot(data=data, x='STDNT_BACKGROUND')
plt.title("Backgroud of Student",fontsize=15,color='red')
plt.ylabel("Count",fontsize=15)
plt.xlabel("Backgroud of Student",fontsize=15)
plt.show()

In [None]:
# Compare in-state and out-of-state students: bar chart of the number of
# students for each IN_STATE_FLAG value.

plt.figure(figsize=(5,5))
# The column is selected with data=/x= keywords: recent seaborn releases
# (0.12+) treat the only positional argument as `data`, so a Series passed
# positionally is no longer interpreted as the x variable.
sns.countplot(data=data, x='IN_STATE_FLAG')
plt.title("Indicator of whether Student is in same state as university",fontsize=15,color='red')
plt.ylabel("Count",fontsize=15)
plt.xlabel("In state",fontsize=15)
plt.show()

In [None]:
# Compare international and domestic students: bar chart of the number of
# students for each INTERNATIONAL_STS value.

plt.figure(figsize=(5,5))

# The column is selected with data=/x= keywords: recent seaborn releases
# (0.12+) treat the only positional argument as `data`, so a Series passed
# positionally is no longer interpreted as the x variable.
sns.countplot(data=data, x='INTERNATIONAL_STS')
plt.title("Indicator of whether Student is international",fontsize=15,color='red')
plt.ylabel("Count",fontsize=15)
plt.xlabel("International",fontsize=15)
plt.show()