In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Reference:** https://www.kaggle.com/marianaosborne/test-scores-visualizations-and-models

In [None]:
# Read the test-score CSV, discard the `student_id` column, and display
# summary statistics for all remaining columns.
df = pd.read_csv('/kaggle/input/predict-test-scores-of-students/test_scores.csv')
df = df.drop(columns='student_id')
df.describe(include='all')

In [None]:
# Overlay the kernel density estimates of the pre-test and post-test scores
# on a single figure so the two distributions can be compared directly.
plt.figure(figsize=(10, 5))
# `fill=True` is the replacement for the deprecated `shade=True` keyword.
sns.kdeplot(data=df['pretest'], fill=True, label='Pre-test')
sns.kdeplot(data=df['posttest'], fill=True, label='Post-test')
plt.title('Distribution of Pre Test and Post Test')
# Both series share the x-axis, so label it generically rather than with
# the name of a single column.
plt.xlabel('Score')
plt.legend()
plt.show()

The pre test and post test score distributions seem to be very similar, but offset by a certain amount. So most of the time, students improve on their pre test score when it comes to the post test.

In [None]:
# Wide figure with one box per classroom; the rows are sorted by post-test
# score before being handed to seaborn.
fig = plt.figure(figsize=(30, 10))
scores_sorted = df.sort_values('posttest')
sns.boxplot(data=scores_sorted, x='classroom', y='posttest')

The graph above shows the distribution of scores in each classroom. It shows that, in each class, there are rarely any "outstanding" scores, which means students perform similarly to their classmates. This means that the classroom's features should be looked at further because they seem to strongly affect the students' scores. Let's check the student features first through visualizations, then look at the classroom features.

In [None]:
# Side-by-side box plots of post-test score against each student-level
# feature: lunch status on the left, gender on the right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
lunch_ax, gender_ax = ax
sns.boxplot(data=df, x='lunch', y='posttest', ax=lunch_ax)
sns.boxplot(data=df, x='gender', y='posttest', ax=gender_ax)

The above graphs show how the student features (lunch and gender) affect post test scores. The general trend seems to be that students who do not qualify for subsidized lunches score better than those who do. This may reflect a difference in the quality of education due to socioeconomic status or location. However, there seems to be no correlation between gender and post test scores. Now let's take a look at the classroom features.

In [None]:
# Scatter plot with a fitted regression line: students per classroom vs. post-test score.
sns.regplot(x='n_student', y='posttest', data=df)

There is a negative linear relationship between post test scores and the number of students in the classroom. 

In [None]:
# Box plots of post-test score for each classroom-level categorical feature,
# drawn left to right in a single row of three panels.
f, axes = plt.subplots(nrows=1, ncols=3, figsize=(10, 5))
classroom_features = ('teaching_method', 'school_type', 'school_setting')
for panel, feature in zip(axes, classroom_features):
    sns.boxplot(data=df, x=feature, y='posttest', ax=panel)
plt.tight_layout()

We can see that all classroom features seem to influence post test scores of students. 

**Model Building**  
From our analysis, we found that basically all features have some sort of relationship with the post test scores *except gender*. Now, using the relevant features, we can build simple and multiple linear regression models and evaluate their performance to choose the best model.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [None]:
# Prepare for model building

# Utility function for evaluating model performance
def eval(y, y_hat):
    """Print and return regression metrics for a set of predictions.

    NOTE(review): this name shadows Python's built-in ``eval`` for the rest
    of the notebook. It is kept unchanged because other cells call
    ``eval(...)``.

    Parameters
    ----------
    y
        Observed target values; passed as the first argument to each metric.
    y_hat
        Predicted values; passed as the second argument to each metric.

    Returns
    -------
    tuple
        ``(MAE, MSE, r2)``: mean absolute error, mean squared error and
        R^2 score, in that order.
    """
    MAE = mean_absolute_error(y, y_hat)
    MSE = mean_squared_error(y, y_hat)
    r2 = r2_score(y, y_hat)
    # R^2 is a goodness-of-fit score (higher is better), not an error, so it
    # is labelled "Score" in the printed report.
    print(f'Mean Abs Error: {MAE:.2f}\nMean Square Error: {MSE:.2f}\nR^2 Score: {r2:.2f}')
    return (MAE, MSE, r2)

# Predictor columns (gender is deliberately left out) and the target column.
feature_columns = ['pretest', 'n_student', 'school_setting', 'school_type', 'teaching_method', 'lunch']
x = df.loc[:, feature_columns]
y = df.loc[:, ['posttest']]

# Encode categorical data as indicator (dummy) columns
x = pd.get_dummies(data=x)

**Simple Linear Regression**  
Try to predict post test scores solely from pre test scores.

In [None]:
# Single-feature design matrix: the pre-test score only.
x_simple = x.loc[:, ['pretest']]

# test_size=0.4 holds out 40% of the rows; random_state fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    x_simple, y, random_state=0, test_size=0.4
)

# Fit on the training split, then predict the held-out split.
model_linear = LinearRegression().fit(x_train, y_train)
y_hat = model_linear.predict(x_test)

eval(y_test, y_hat)

**Multiple Linear Regression**  
Predicting post test scores based on all features except gender (because we saw during analysis that gender didn't have much overall effect on score).

In [None]:
# Split the full encoded feature matrix (test_size=0.4, fixed random_state).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=0, test_size=0.4
)

# Refit the existing `model_linear` estimator on all features and predict
# the held-out rows.
y_hat = model_linear.fit(x_train, y_train).predict(x_test)

eval(y_test, y_hat)