<a href="https://colab.research.google.com/github/saadkhalidabbasi/EDA-Projects/blob/main/Student_Dropout_Analysis_and_Prediction_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Importing Libraries**

In [3]:
import warnings

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from scipy import stats

warnings.filterwarnings('ignore')

### **Data Loading**

In [4]:
# Read the raw CSV into a DataFrame and preview its first rows.
csv_path = 'student dropout.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,School,Gender,Age,Address,Family_Size,Parental_Status,Mother_Education,Father_Education,Mother_Job,Father_Job,...,Free_Time,Going_Out,Weekend_Alcohol_Consumption,Weekday_Alcohol_Consumption,Health_Status,Number_of_Absences,Grade_1,Grade_2,Final_Grade,Dropped_Out
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,3,4,1,1,3,4,0,11,11,False
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,3,3,1,1,3,2,9,11,11,False
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,3,2,2,3,3,6,12,13,12,False
3,GP,F,15,U,GT3,T,4,2,health,services,...,2,2,1,1,5,0,14,14,14,False
4,GP,F,16,U,GT3,T,3,3,other,other,...,3,2,1,2,5,0,11,13,13,False


In [5]:
# Print a summary of the frame: row range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 649 entries, 0 to 648
Data columns (total 34 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   School                       649 non-null    object
 1   Gender                       649 non-null    object
 2   Age                          649 non-null    int64 
 3   Address                      649 non-null    object
 4   Family_Size                  649 non-null    object
 5   Parental_Status              649 non-null    object
 6   Mother_Education             649 non-null    int64 
 7   Father_Education             649 non-null    int64 
 8   Mother_Job                   649 non-null    object
 9   Father_Job                   649 non-null    object
 10  Reason_for_Choosing_School   649 non-null    object
 11  Guardian                     649 non-null    object
 12  Travel_Time                  649 non-null    int64 
 13  Study_Time                   649 no

In [6]:
# Count missing values per column.
df.isna().sum()

Unnamed: 0,0
School,0
Gender,0
Age,0
Address,0
Family_Size,0
Parental_Status,0
Mother_Education,0
Father_Education,0
Mother_Job,0
Father_Job,0


In [7]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

In [8]:
# Class balance of the target column: number of rows per Dropped_Out value.
df['Dropped_Out'].value_counts()

Unnamed: 0_level_0,count
Dropped_Out,Unnamed: 1_level_1
False,549
True,100


In [9]:
# Dtype of every column. Left as the cell's last expression so the notebook
# renders it with rich display instead of plain `print` text.
data_types = df.dtypes
data_types

School                         object
Gender                         object
Age                             int64
Address                        object
Family_Size                    object
Parental_Status                object
Mother_Education                int64
Father_Education                int64
Mother_Job                     object
Father_Job                     object
Reason_for_Choosing_School     object
Guardian                       object
Travel_Time                     int64
Study_Time                      int64
Number_of_Failures              int64
School_Support                 object
Family_Support                 object
Extra_Paid_Class               object
Extra_Curricular_Activities    object
Attended_Nursery               object
Wants_Higher_Education         object
Internet_Access                object
In_Relationship                object
Family_Relationship             int64
Free_Time                       int64
Going_Out                       int64
Weekend_Alco

In [10]:
# Summary statistics of the numeric columns, one row per column.
df.describe().T


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,649.0,16.744222,1.218138,15.0,16.0,17.0,18.0,22.0
Mother_Education,649.0,2.514638,1.134552,0.0,2.0,2.0,4.0,4.0
Father_Education,649.0,2.306626,1.099931,0.0,1.0,2.0,3.0,4.0
Travel_Time,649.0,1.568567,0.74866,1.0,1.0,1.0,2.0,4.0
Study_Time,649.0,1.930663,0.82951,1.0,1.0,2.0,2.0,4.0
Number_of_Failures,649.0,0.22188,0.593235,0.0,0.0,0.0,0.0,3.0
Family_Relationship,649.0,3.930663,0.955717,1.0,4.0,4.0,5.0,5.0
Free_Time,649.0,3.180277,1.051093,1.0,3.0,3.0,4.0,5.0
Going_Out,649.0,3.1849,1.175766,1.0,2.0,3.0,4.0,5.0
Weekend_Alcohol_Consumption,649.0,1.502311,0.924834,1.0,1.0,1.0,2.0,5.0


In [11]:
# List all column labels of the frame.
df.columns

Index(['School', 'Gender', 'Age', 'Address', 'Family_Size', 'Parental_Status',
       'Mother_Education', 'Father_Education', 'Mother_Job', 'Father_Job',
       'Reason_for_Choosing_School', 'Guardian', 'Travel_Time', 'Study_Time',
       'Number_of_Failures', 'School_Support', 'Family_Support',
       'Extra_Paid_Class', 'Extra_Curricular_Activities', 'Attended_Nursery',
       'Wants_Higher_Education', 'Internet_Access', 'In_Relationship',
       'Family_Relationship', 'Free_Time', 'Going_Out',
       'Weekend_Alcohol_Consumption', 'Weekday_Alcohol_Consumption',
       'Health_Status', 'Number_of_Absences', 'Grade_1', 'Grade_2',
       'Final_Grade', 'Dropped_Out'],
      dtype='object')

In [12]:
# A row is flagged as an outlier when its value in ANY numeric column lies more
# than Z_THRESHOLD standard deviations from that column's mean.
Z_THRESHOLD = 3

# Z-score every numeric column. `include='number'` selects all integer/float
# widths, not only the 64-bit ones, so no numeric column is silently skipped.
z_scores = stats.zscore(df.select_dtypes(include='number'))

# Rows with at least one |z| > Z_THRESHOLD
df_outliers = df[(np.abs(z_scores) > Z_THRESHOLD).any(axis=1)]

In [13]:
# Remove exactly the rows flagged in `df_outliers`, so the two frames partition `df`.
# Re-testing the z-scores with strict `< 3` / `> -3` is not the complement of the
# `> 3` / `< -3` flag: a row whose z-score is exactly +/-3 or NaN would be dropped
# here even though it was never flagged as an outlier.
df_no_outliers = df.drop(index=df_outliers.index)

# Sanity check: every row is either an outlier or kept, never both or neither.
assert len(df_outliers) + len(df_no_outliers) == len(df)

## **Student Dropout Detailed Analysis**

#### **Distribution of Dropout Status**

In [14]:
# Aggregate to one row per class before plotting. Passing the raw frame to
# px.bar with only `x` set draws one stacked bar segment per student row
# rather than a single bar per class.
dropout_counts = (
    df['Dropped_Out']
    .value_counts()
    .rename_axis('Dropped_Out')      # name the index so reset_index yields a stable column name
    .reset_index(name='count')
)

fig = px.bar(
    dropout_counts,
    x='Dropped_Out',
    y='count',
    text_auto=True,                  # print the count on each bar
    title='Distribution of Dropout Status',
)
fig.show()

#### **Distribution of Student Ages**

In [15]:
# Histogram of student ages.
fig = px.histogram(
    data_frame=df,
    x='Age',
    title='Age Distribution of Students',
)
fig.show()

#### **Final Grades Distribution**

In [16]:
# Histogram of final grades.
fig = px.histogram(
    data_frame=df,
    x='Final_Grade',
    title='Distribution of Final Grades',
)
fig.show()

#### **Correlation Matrix**

In [17]:
# Pairwise correlations between all numeric columns (`include='number'` covers
# every integer/float width, not only the 64-bit ones).
correlation_matrix = df.select_dtypes(include='number').corr()

fig = px.imshow(
    correlation_matrix,
    text_auto='.2f',                  # two decimals keep the cell labels legible
    zmin=-1,                          # pin the colour scale to the full correlation
    zmax=1,                           # range so that 0 sits at the neutral midpoint
    color_continuous_scale='RdBu_r',  # diverging palette: sign is visible at a glance
    title='Correlation Matrix',
    width=1000,
    height=1000,
)
fig.show()

#### **How Study Time Relates to Final Grades**

In [18]:
# Scatter of study time against final grade, coloured by dropout status.
fig = px.scatter(
    data_frame=df,
    x='Study_Time',
    y='Final_Grade',
    color='Dropped_Out',
    title='Study Time vs Final Grade',
)
fig.show()

#### **Distribution of Gender**

In [19]:
# Pie chart of the share of rows per gender.
fig = px.pie(
    data_frame=df,
    names='Gender',
    title='Gender Distribution',
)
fig.show()