In [34]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import f_oneway
from sklearn.metrics import mean_absolute_error 
import altair as alt

In [35]:
df = pd.read_csv('StudentPerformanceFactors.csv')

In [36]:
df = df.dropna() 

df = df[df['Tutoring_Sessions'] != 8]

In [37]:
sum_stats = df['Exam_Score'].describe() 

lower_bound = sum_stats['mean'] - 3 * sum_stats['std']
upper_bound = sum_stats['mean'] + 3 * sum_stats['std'] 
filtered_df = df[(df['Exam_Score'] > lower_bound) & (df['Exam_Score'] < upper_bound)] 

In [38]:
df = df.iloc[:5000]

In [39]:
categorical_columns = ['Parental_Involvement', 'Access_to_Resources', 'Extracurricular_Activities', 'Motivation_Level', 'Internet_Access', 'Family_Income', 'Teacher_Quality', 'School_Type', 'Peer_Influence', 'Learning_Disabilities', 'Parental_Education_Level', 'Distance_from_Home', 'Gender']

In [40]:
df

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,23,84,Low,High,No,7,73,Low,Yes,0,Low,Medium,Public,Positive,3,No,High School,Near,Male,67
1,19,64,Low,Medium,No,8,59,Low,Yes,2,Medium,Medium,Public,Negative,4,No,College,Moderate,Female,61
2,24,98,Medium,Medium,Yes,7,91,Medium,Yes,2,Medium,Medium,Public,Neutral,4,No,Postgraduate,Near,Male,74
3,29,89,Low,Medium,Yes,8,98,Medium,Yes,1,Medium,Medium,Public,Negative,4,No,High School,Moderate,Male,71
4,19,92,Medium,Medium,Yes,6,65,Medium,Yes,3,Medium,High,Public,Neutral,4,No,College,Near,Female,70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5171,12,74,Low,High,Yes,6,55,High,Yes,0,Medium,Low,Public,Positive,3,No,Postgraduate,Near,Male,63
5172,18,84,High,Low,No,6,64,Medium,Yes,2,Low,High,Public,Positive,2,No,High School,Near,Female,67
5173,14,82,Medium,Medium,Yes,4,67,Medium,Yes,1,High,Medium,Public,Neutral,4,No,High School,Moderate,Female,65
5174,23,76,Medium,Medium,No,7,66,Medium,Yes,2,Medium,Medium,Public,Neutral,3,No,College,Near,Male,67


In [41]:
slider = alt.binding_range(
    min=df['Exam_Score'].min(), 
    max=df['Exam_Score'].max(), 
    name="Exam Score", 
    step=1
)

selection = alt.selection_point(bind=slider, fields=['Exam_Score'])


chart1 = alt.Chart(df).transform_filter(
    selection
).mark_circle().encode(
    alt.X('Sleep_Hours:Q', scale=alt.Scale(domain=[4, 10]), title='Sleep Hours'),
    alt.Y('Exam_Score:Q', scale=alt.Scale(domain=[50, 101]), title='Exam Score'),
    color=alt.Color('Teacher_Quality:N', title='Teacher Quality'),
    tooltip=['Sleep_Hours', 'Exam_Score', 'Hours_Studied', 'Teacher_Quality']
).add_params(
    selection
).properties(title="Sleep Hours Vs Exam Scores Segmented by Teacher Quality")


chart2 = chart1.encode(x='Hours_Studied:Q').properties(title = \
                                                       "Hours Studied Compared To Exam Scores Labeled By Teacher Quality")

chart1 | chart2

In [42]:
selection = alt.selection_multi(on='mouseover')


chart = alt.Chart(df).mark_circle().encode(
    x=alt.X('Attendance', scale=alt.Scale(type='linear', domain=[60, 100])),
    y=alt.Y('Exam_Score', scale=alt.Scale(type='linear', domain=[50, 105]),  title="Exam Scores"), 
    color=alt.condition(selection, 'Access_to_Resources', alt.value('lightgray'), title="Access To Resources"),
    size=alt.value(100),
    tooltip=['Attendance', 'Exam_Score', 'Access_to_Resources', 'Family_Income']
).add_selection(
    selection
)



  selection = alt.selection_multi(on='mouseover')
  ).add_selection(


In [43]:
access_to_resources = ['High', 'Medium', 'Low'] 


rating_res = alt.binding_radio(
    options=access_to_resources, 
    name="Access to Resources"
)


rating_select = alt.selection_point(
    fields=['Access_to_Resources'], 
    bind=rating_res
)


rating_color_condition = alt.condition(
    rating_select,
    alt.Color('Access_to_Resources:N'),
    alt.value('lightgray')
)


opacity_condition = alt.condition(
    rating_select,
    alt.value(1),  
    alt.value(0)  
)

highlight_ratings = chart.add_params(
    rating_select
).encode(
    color=rating_color_condition,  
    opacity=opacity_condition  
).properties(title="Attendence Vs Exam Scores Segmented By Access To Resources") 


highlight_ratings