In [None]:
import numpy as np
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff


from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


# Directory prefix that is joined with the CSV file name when the dataset is loaded.
PATH = "/kaggle/input/predict-test-scores-of-students/"

# Loading and understanding the dataset
The dataset has some information about 2133 students. The given info contains:
* gender of the student
* the school the student attends,
* school type
* school setting
* classroom of the student,
* teaching method in the classroom,
* Number of students in the classroom
* pretest and posttest scores of the student

There is also a `lunch` column with two unique values: `Does not qualify` and `Qualifies for reduced/free lunch`.
I'm assuming that this qualification is assigned randomly and __before__ the post-test.

A student's performance on the test depends on a great many things, and this dataset, of course, can't include every single piece of information about the students, so we'll search for some usable indicators of the test scores.
The `school type` would presumably have an impact on the student's performance, since the quality of education differs between school types. It is also understandable that the `teaching methods` have an effect on the test scores, because the effectiveness of education mostly depends on qualified teachers and on an educational system with enough materials for students to learn. The number of students in the classroom would also affect the student's performance, since it is very difficult for a teacher to give attention to everyone in a crowded classroom.

Everything above is just a set of assumptions, and we need insights from the data to find out whether they are correct. So, let's start!

In [None]:
# Read the raw student records from the CSV file and preview the first rows.
csv_path = PATH + "test_scores.csv"
dataset = pd.read_csv(csv_path)
dataset.head()

There is a lot of information about the schools, so I'll build a separate dataframe for them.

In [None]:
# Build one summary row per school: its setting and type, the number of
# classrooms, and how many students are taught with each teaching method.
ds_school_group = dataset.groupby('school')  # student rows grouped by school name

# Schools in order of first appearance; the summary frame keeps this order.
schools = dataset['school'].unique()

# Student counts per (school, teaching method). Forcing both method columns to
# exist with fill_value=0 covers schools that use only one of the two methods,
# so no NaN values have to be patched afterwards and the counts stay integers.
method_counts = pd.crosstab(dataset['school'], dataset['teaching_method']).reindex(
    columns=['Experimental', 'Standard'], fill_value=0
)

schools_df = pd.DataFrame({
    # setting and type are taken from the first row of each school
    # (assumes they are constant within a school)
    'school_setting': ds_school_group['school_setting'].first(),
    'school_type': ds_school_group['school_type'].first(),
    'n_classroom': ds_school_group['classroom'].nunique(),
    'total_student': ds_school_group['teaching_method'].count(),
    'n_experimental': method_counts['Experimental'],
    'n_standard': method_counts['Standard'],
})

# groupby/crosstab sort by school name and name the index 'school'; restore the
# first-appearance order and an unnamed index.
schools_df = schools_df.reindex(schools).rename_axis(None)
schools_df.head()

In [None]:
# Two pie charts side by side: how many schools fall into each setting and each type.
school_setting = schools_df['school_setting'].value_counts()
school_type = schools_df['school_type'].value_counts()

fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=('School Setting distribution', 'School Type Distribution'),
    specs=[[{'type': 'pie'}, {'type': 'pie'}]],
)

# One pie per column, in the same left-to-right order as the subplot titles.
for pie_col, counts in enumerate((school_setting, school_type), start=1):
    fig.add_trace(go.Pie(labels=counts.index, values=counts.values), row=1, col=pie_col)

fig.show()

Most of the students are learning with standard educational methods, and some schools didn't try any experimental methods at all. There is also `FBUMG`, which used only the experimental method. In a few schools, more students learn with experimental methods than with standard methods.

In [None]:
# Bar chart per school showing how many students are taught with each method.
# `y` is a list of columns (wide-form input), so Plotly Express names the y axis
# "value" and the colour legend "variable"; those are the keys `labels` has to
# target for the custom axis/legend titles to be applied.
fig = px.bar(
    schools_df,
    x=schools_df.index,
    y=['n_experimental', 'n_standard'],
    title='# Student distribution',
    labels={'index': 'Schools', 'value': 'Number of students', 'variable': 'Teaching method'},
)
fig.show()

It is understandable that public schools have many more classrooms, since public schools are more accessible.

In [None]:
# Number of classrooms per school, coloured by school type.
classroom_bar_options = {
    'x': schools_df.index,
    'y': 'n_classroom',
    'title': '# Classrooms',
    'color': 'school_type',
    'labels': {'index': 'Schools', 'n_classroom': 'Number of classrooms'},
}
fig = px.bar(schools_df, **classroom_bar_options)
fig.show()

I think the usage of experimental and standard teaching methods is fairly similar across the school types. However, the ratio of students per teaching method is more stable in private schools. In public schools, it really is just an experimental method.

In [None]:
# Stacked bars of standard/experimental student counts per school, with
# non-public schools on the left and public schools on the right.
np_schools = schools_df[schools_df['school_type'] == 'Non-public']
p_schools = schools_df[schools_df['school_type'] == 'Public']

fig = make_subplots(rows=1, cols=2, subplot_titles=('Private Schools', 'Public Schools'))

# (legend label, schools_df column, bar colour). Fixed colours and legend groups
# keep each teaching method the same colour in both subplots.
method_styles = (
    ('Standard', 'n_standard', '#4C78A8'),
    ('Experimental', 'n_experimental', '#F58518'),
)

for subplot_col, frame in enumerate((np_schools, p_schools), start=1):
    for label, column, color in method_styles:
        fig.add_trace(
            go.Bar(
                name=label,
                x=frame.index,
                y=frame[column],
                marker_color=color,
                legendgroup=label,
                # list each method once in the legend instead of once per subplot
                showlegend=(subplot_col == 1),
            ),
            row=1,
            col=subplot_col,
        )

fig.update_layout(barmode='stack', title_text='Experimental/Standard Education Distribution by School Type')

fig.show()

There are `918` students who qualify for free or reduced-price lunch, while `1215` students do not. We can also see that the ratio of qualified to not-qualified students varies a lot between schools. But of course, this is understandable, because it wouldn't be a `prize` if everyone earned it.

In [None]:
# Grid of bar charts (one subplot per school) showing, for every classroom,
# how many students do and do not qualify for reduced/free lunch.
n_grid_cols = 5
# ceiling division: just enough rows for all schools, and at least one row
n_grid_rows = max(1, -(-len(schools) // n_grid_cols))

fig = make_subplots(rows=n_grid_rows, cols=n_grid_cols, subplot_titles=list(schools), shared_yaxes=True)

# (value in the `lunch` column, legend label, legend group, bar colour)
lunch_styles = (
    ('Qualifies for reduced/free lunch', 'Qualified', 'qualified', '#4C78A8'),
    ('Does not qualify', 'Not Qualified', 'not_qualified', '#F58518'),
)
lunch_values = [style[0] for style in lunch_styles]

for idx, school in enumerate(schools):
    school_data = dataset[dataset['school'] == school]

    # Students per (classroom, lunch status). Reindexing with fill_value=0 gives
    # every classroom an explicit count for both statuses, so schools where one
    # status never occurs need no special handling.
    counts = pd.crosstab(school_data['classroom'], school_data['lunch']).reindex(
        index=school_data['classroom'].unique(), columns=lunch_values, fill_value=0
    )

    grid_row, grid_col = divmod(idx, n_grid_cols)
    for lunch_value, label, group, color in lunch_styles:
        fig.add_trace(
            go.Bar(
                name=label,
                x=counts.index,
                y=counts[lunch_value],
                marker_color=color,
                legendgroup=group,
                # every subplot adds the same two traces; show them in the legend once
                showlegend=(idx == 0),
            ),
            row=grid_row + 1,
            col=grid_col + 1,
        )

fig.update_layout(height = 1200, width = 1200, title_text = "Distribution of qualified/not qualified students by classrooms")
fig.show()

There are a few outliers, but I guess I was wrong. I thought that the lunch prize would motivate students, but it doesn't seem to affect the students' performance that much. We can also say that the experimental teaching methods were more effective.

In [None]:
# Pretest/posttest score distributions of students, split by lunch qualification.
lunch_status = dataset['lunch']
qualified = dataset[lunch_status == "Qualifies for reduced/free lunch"]
not_qualified = dataset[lunch_status == "Does not qualify"]

group_labels = ['Qualified Students', 'Not Qualified Students']

fig = make_subplots(
    rows=2,
    cols=1,
    subplot_titles=[
        'Pretest Score Distribution of Qualified/Not Qualified Students',
        'Posttest Score Distribution of Qualified/Not Qualified Students',
    ],
)

# One distplot per test; their traces are copied into the shared figure below.
fig_a = ff.create_distplot([qualified['pretest'], not_qualified['pretest']], group_labels, bin_size=1)
fig_b = ff.create_distplot([qualified['posttest'], not_qualified['posttest']], group_labels, bin_size=1)

subplot_row = 0
for dist_fig in (fig_a, fig_b):
    subplot_row += 1
    for trace in dist_fig.data:
        fig.append_trace(trace, row=subplot_row, col=1)

# Traces share names across the distplots; keep only the first legend entry per name.
legend_names = set()


def _hide_repeated_legend(trace):
    if trace.name in legend_names:
        trace.update(showlegend=False)
    else:
        legend_names.add(trace.name)


fig.for_each_trace(_hide_repeated_legend)
fig.update_layout(title_text='Pretest/Posttest Score Distribution of Qualified/Not qualified students.')

fig.show()

In [None]:
# Pretest/posttest score distributions of students, split by teaching method.
is_experimental = dataset['teaching_method'] == "Experimental"
is_standard = dataset['teaching_method'] == "Standard"
exp_teaching = dataset[is_experimental]
sta_teaching = dataset[is_standard]

method_labels = ['Experimental', 'Standard']
subplot_titles = [
    'Pretest Score Distribution of Students by Teaching Methods',
    'Posttest Score Distribution of Students by Teaching Methods',
]
fig = make_subplots(rows=2, cols=1, subplot_titles=subplot_titles)

# Top row: pretest scores; bottom row: posttest scores.
fig_a = ff.create_distplot([exp_teaching['pretest'], sta_teaching['pretest']], method_labels, bin_size=1)
fig_b = ff.create_distplot([exp_teaching['posttest'], sta_teaching['posttest']], method_labels, bin_size=1)

for target_row, source_fig in zip((1, 2), (fig_a, fig_b)):
    for trace in source_fig.data:
        fig.append_trace(trace, row=target_row, col=1)

# Show each trace name in the legend only the first time it occurs.
legend_names = set()
for trace in fig.data:
    if trace.name in legend_names:
        trace.update(showlegend=False)
    else:
        legend_names.add(trace.name)

fig.update_layout(title_text='Pretest/Posttest Score Distribution of Students with Different Teaching Methods')

fig.show()

In [None]:
# Box plots of pretest/posttest scores per non-public school: qualified
# students in the top row, not-qualified students in the bottom row.
np_schools = dataset[dataset['school_type'] == 'Non-public']
p_schools = dataset[dataset['school_type'] == 'Public']

fig = make_subplots(rows = 2, cols = 1, subplot_titles = ('Pretest/Posttest score distribution of qualified students', 'Pretest/Posttest score distribution of not qualified students'))

# (score column, legend label, legend group, box colour)
score_styles = (
    ('pretest', 'Pretest Score', 'group1', '#4C78A8'),
    ('posttest', 'Posttest Score', 'group2', '#F58518'),
)

for subplot_row, lunch_value in enumerate(('Qualifies for reduced/free lunch', 'Does not qualify'), start=1):
    # x and y must come from the SAME filtered rows. Taking the school names from
    # the unfiltered frame pairs each score with the wrong school, because the
    # two sequences are matched by position.
    subset = np_schools[np_schools['lunch'] == lunch_value]
    for column, label, group, color in score_styles:
        fig.add_trace(
            go.Box(
                x=subset['school'],
                y=subset[column],
                name=label,
                legendgroup=group,
                marker_color=color,
                # both rows add the same two traces; list them in the legend once
                showlegend=(subplot_row == 1),
            ),
            row=subplot_row,
            col=1,
        )

fig.update_layout(height = 800, width = 1200, title_text = 'Non-Public Schools', boxmode = 'group')

fig.show()

In [None]:
# Box plots of pretest/posttest scores per public school: qualified students
# in the top row, not-qualified students in the bottom row.
# Uses `p_schools` (the public-school rows of `dataset`) from the previous cell.
fig = make_subplots(rows = 2, cols = 1, subplot_titles = ('Pretest/Posttest score distribution of qualified students', 'Pretest/Posttest score distribution of not qualified students'))

# (score column, legend label, legend group, box colour)
score_styles = (
    ('pretest', 'Pretest Score', 'group1', '#4C78A8'),
    ('posttest', 'Posttest Score', 'group2', '#F58518'),
)

for subplot_row, lunch_value in enumerate(('Qualifies for reduced/free lunch', 'Does not qualify'), start=1):
    # x and y must come from the SAME filtered rows. Taking the school names from
    # the unfiltered frame pairs each score with the wrong school, because the
    # two sequences are matched by position.
    subset = p_schools[p_schools['lunch'] == lunch_value]
    for column, label, group, color in score_styles:
        fig.add_trace(
            go.Box(
                x=subset['school'],
                y=subset[column],
                name=label,
                legendgroup=group,
                marker_color=color,
                # both rows add the same two traces; list them in the legend once
                showlegend=(subplot_row == 1),
            ),
            row=subplot_row,
            col=1,
        )

# This figure shows the public schools, so the title says so.
fig.update_layout(height = 800, width = 1200, title_text = 'Public Schools', boxmode = 'group')

fig.show()

Also it looks like there's a linear correlation between `pretest` and `posttest` scores

In [None]:
# Pretest vs. posttest score of every student, coloured by lunch qualification.
pretest_scores = dataset['pretest']
posttest_scores = dataset['posttest']
fig = px.scatter(dataset, x=pretest_scores, y=posttest_scores, color='lunch')
fig.show()

# Preparing the Dataset and Building a Model
In order to build a model, we should have a good understanding of the dataset. We can try to build a linear model because of the linear correlation between `pretest` and `posttest` scores. If it performs well, I won't try to dive deep into the model building process

In [None]:
# Build the feature matrix `data` and the target vector `target` for the model.
# `school`, `classroom` and `student_id` are left out of the features.
cat_feats = ['school_setting', 'school_type', 'teaching_method', 'gender', 'lunch']
num_feats = ['n_student', 'pretest']
excluded_cols = ['school', 'classroom', 'student_id']

target = dataset['posttest'].values

# Select the features by name instead of by position, so the target can never
# slip into the features if the CSV column order differs from what is expected.
data = dataset.drop(columns=excluded_cols + ['posttest'])

# One-hot encode the categorical features; the numeric columns stay in front.
data = pd.get_dummies(data, columns = cat_feats)

# Standardise the numeric features. Casting to float first keeps the write-back
# of the scaled values dtype-compatible even if a column was read as integers.
data[num_feats] = data[num_feats].astype(float)
data_mean = data[num_feats].mean().values
data_std = data[num_feats].std().values
data[num_feats] = (data[num_feats] - data_mean) / data_std

# Normalize the target using pretest's mean and std (index 1 of `num_feats`).
target = (target - data_mean[1]) / data_std[1]

data.head()

In [None]:
def adjusted_r2(y_true, y_pred, p=13):
    """Return the adjusted R^2 score of the predictions against the true targets.

    Adjusted R^2 rescales R^2 by the degrees of freedom, so it also penalizes
    additional independent variables in the training set:

        1 - (1 - R^2) * (N - 1) / (N - p - 1)

    Parameters
    ----------
    y_true : array-like with a ``shape`` attribute
        True target values; ``y_true.shape[0]`` is used as the sample count N.
    y_pred : array-like
        Predicted target values, passed to ``r2_score`` together with ``y_true``.
    p : int, default 13
        Number of independent variables used to predict the target.

    Returns
    -------
    float
        The adjusted R^2 score.

    Raises
    ------
    ValueError
        If ``N - p - 1`` is not positive, in which case the score is undefined.
    """
    N = y_true.shape[0]
    dof = N - p - 1
    if dof <= 0:
        raise ValueError(
            "adjusted R^2 needs more samples than predictors + 1 "
            "(got N=%d, p=%d)" % (N, p)
        )
    r2 = r2_score(y_true, y_pred)
    score = 1 - ((1 - r2) * (N - 1)) / dof
    return score

# Hold out 25% of the students for testing. The fixed seed makes the split, and
# therefore the reported metrics, reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(data, target, test_size = .25, random_state = 42)

# Fit an ordinary least-squares model and predict on both splits.
lr = LinearRegression().fit(x_train, y_train)
train_preds = lr.predict(x_train)
preds = lr.predict(x_test)


def _format_scores(split_name, y_true, y_pred):
    """Return one report line with MSE, MAE, R^2 and adjusted R^2 for a split."""
    return "%s: \t MSE: %f | MAE: %f | R^2: %f | Adjusted R^2: %f" % (
        split_name,
        mean_squared_error(y_true, y_pred),
        mean_absolute_error(y_true, y_pred),
        r2_score(y_true, y_pred),
        adjusted_r2(y_true, y_pred),
    )


print("Linear model's performance on the training and test data")
print(_format_scores("train", y_train, train_preds))
print(_format_scores("test", y_test, preds))

# Conclusion
Well, at first I thought that there wasn't enough data for the prediction. But with the given features, the linear model did a pretty good job.
We can say that the model didn't overfit the training data and that it's quite robust, since r^2 and adjusted r^2 don't differ too much.

Since I don't work on tabular data too much, I thought it would be a good exercise to do exploratory analysis on this dataset. If you read this far, I'm glad. 
Please upvote if you liked the notebook. Thank you very much for your time :)