In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the test-scores CSV into `data` and preview its first five rows.
TEST_SCORES_CSV = '/kaggle/input/predict-test-scores-of-students/test_scores.csv'
data = pd.read_csv(TEST_SCORES_CSV)
data.head()

In the table below, it's possible to see that <b>most of the students</b> in the dataset are from <b>public schools</b>.

In [None]:
# Share of each school type within every school setting (each row sums to 1).
pd.crosstab(
    index=data['school_setting'],
    columns=data['school_type'],
    normalize='index',
)

In [None]:
# Share of each lunch category within every school type (each row sums to 1).
pd.crosstab(
    index=data['school_type'],
    columns=data['lunch'],
    normalize='index',
)

<font size="3"> The table below shows some interesting information. For this dataset, the highest <b>scores</b> are seen for students from private schools that don't serve lunch. </font>

In [None]:
# Mean post-test score for every (school type, lunch) combination.
# The aggregation is named with the string 'mean' rather than the np.mean
# callable: recent pandas versions emit a FutureWarning when a NumPy callable
# is passed as aggfunc, while the string form yields the same table.
pd.pivot_table(data, values='posttest', index=['school_type'],
               columns=['lunch'], aggfunc='mean')

In [None]:

# One post-test box plot per categorical column, drawn side by side.
f, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 8))
for ax, column in zip(axes, ('teaching_method', 'lunch', 'school_type')):
    sns.boxplot(data=data, x=column, y='posttest', ax=ax)
plt.tight_layout()
plt.show()


In [None]:
# Post-test box plots split by school setting (left) and gender (right).
f, axes = plt.subplots(nrows=1, ncols=2, figsize=(9, 8))
for ax, column in zip(axes, ('school_setting', 'gender')):
    sns.boxplot(data=data, x=column, y='posttest', ax=ax)
plt.tight_layout()

In [None]:
# Summary statistics for both tests; the means are reused by the KDE plot cell.
pre_desc = data['pretest'].describe()
post_desc = data['posttest'].describe()

# Difference between the mean post-test and the mean pre-test score.
dif = post_desc.loc['mean'] - pre_desc.loc['mean']
dif

In [None]:

# Overlay the kernel density estimates of pre-test and post-test scores and
# mark each distribution's mean with a dashed vertical line.
# `fill=True` replaces the deprecated `shade=True` keyword of sns.kdeplot,
# which seaborn has scheduled for removal.
plt.figure(figsize=(10,5))
sns.kdeplot(data=data['pretest'], fill=True, label='Pre-test')
sns.kdeplot(data=data['posttest'], fill=True, label='Post-test')
plt.title('Distribution of Pre and Post Tests')
plt.axvline(x=post_desc['mean'], linestyle='--', color='orange',label='Post test mean')
plt.axvline(x=pre_desc['mean'], linestyle='--', color='#5D8BBA',label='Pre test mean')
plt.legend()
plt.show()

In [None]:

# Pre-test vs post-test score for every student, coloured by school setting.
plt.figure(figsize=(10,5))
sns.scatterplot(data=data, x='pretest',y='posttest', hue='school_setting', s=18)
# Give the figure a title like the other plots in this notebook.
plt.title('Pre-test vs post-test scores by school setting')
# End on plt.show() so the cell does not echo the returned Axes object
# as text next to the figure.
plt.show()

In [None]:
# Post-test scores per class size, with one box per school type.
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(data=data, x='n_student', y='posttest', hue='school_type', ax=ax)
ax.set_title('Number of students in class vs post test scores ')
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score


<font size="6"> Models for post test scores </font>

<font size="3"> Making a function to print and create a dictionary with <b>errors</b> and <b>r2 score</b>: </font>

In [None]:
def evaluating(y_act,y_pred):
    """Print and return the regression metrics of a set of predictions.

    Computes the mean absolute error, the mean squared error and the R2
    score of ``y_pred`` against ``y_act``, each rounded to three decimals.

    Parameters
    ----------
    y_act : actual target values.
    y_pred : predicted target values.

    Returns
    -------
    dict
        The rounded metrics under the keys ``'MAE'``, ``'MSE'`` and ``'r2'``.
    """
    # Metric name -> scoring function, evaluated in insertion order.
    scorers = {
        'MAE': mean_absolute_error,
        'MSE': mean_squared_error,
        'r2': r2_score,
    }
    eval_values = {
        name: round(scorer(y_act, y_pred), 3)
        for name, scorer in scorers.items()
    }

    print(
        'Results:\n Mean absolute error = ', eval_values['MAE'],
        '\n Mean squared error = ', eval_values['MSE'],
        '\n R2 = ', eval_values['r2'],
    )

    return eval_values

<font size="3"> The features below are the variables that appear to influence the post test values, judging from the different box plots. The qualitative variables are encoded as dummy variables.</font>

In [None]:
# Target: the post-test score, kept as a one-column DataFrame.
Y = data[['posttest']]

# Features: the selected columns, with the qualitative ones expanded into dummies.
feature_columns = [
    'pretest',
    'n_student',
    'school_setting',
    'school_type',
    'teaching_method',
    'lunch',
]
X = pd.get_dummies(data[feature_columns])
X.head(3)

<font size="5"> Linear Regression model </font>

<font size="3"> Using only the values of <b>pre test</b> to predict post test. </font>

In [None]:
# Single-feature model: predict the post-test score from the pre-test score only.
Xlin = X[['pretest']]

# Hold out 35% of the rows for evaluation; the seed is fixed for reproducibility.
x_adj, x_tst, y_adj, y_tst = train_test_split(
    Xlin, Y, test_size=0.35, random_state=5
)

model_linear = LinearRegression().fit(x_adj, y_adj)
y_pred = model_linear.predict(x_tst)

only_pret_eval = evaluating(y_tst, y_pred)

<font size="5"> Multiple Linear Regression model </font>

<font size="3"> Model using the two quantitative variables <b>pre test</b> and <b>number of students in class</b> to predict post tests.</font>

In [None]:
# Two-feature model: pre-test score plus the number of students in the class.
Xlin = X[['pretest', 'n_student']]

# Same 65/35 split and seed as the other regression cells.
x_adj, x_tst, y_adj, y_tst = train_test_split(
    Xlin, Y, test_size=0.35, random_state=5
)

model_linear = LinearRegression().fit(x_adj, y_adj)
y_pred = model_linear.predict(x_tst)

pret_n_eval = evaluating(y_tst, y_pred)


<font size="3"> Model using the <b>following variables</b>:
1. Pre test
2. Number of students in classroom
3. School Setting
4. School Type
5. Teaching Method
6. Lunch
    </font>

In [None]:
# Full model: every column of the dummy-encoded feature matrix X.
# Same 65/35 split and seed as the other regression cells.
x_adj, x_tst, y_adj, y_tst = train_test_split(
    X, Y, test_size=0.35, random_state=5
)

model_linear = LinearRegression().fit(x_adj, y_adj)
y_pred = model_linear.predict(x_tst)

multi_eval = evaluating(y_tst, y_pred)

<font size="5"> Thanks for reading =D </font>