In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
from scipy import stats
from sklearn.metrics import r2_score
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Test score analysis

Reading the dataset,

In [None]:
# Load the student test-score dataset and preview its first five rows.
csv_path = '../input/predict-test-scores-of-students/test_scores.csv'
test_scores = pd.read_csv(csv_path)
print(test_scores.head())

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the output.
test_scores.info()

Dropping the unnecessary columns,

In [None]:
# Drop the columns that are not used in the analysis below.
# errors='ignore' keeps this cell idempotent: re-running it after the columns are
# already gone no longer raises KeyError.
test_scores = test_scores.drop(columns=['classroom', 'n_student', 'student_id'], errors='ignore')

Analysing the effect of school setting on student grades,

In [None]:
# Plot the mean score per school setting. Passing the raw per-student columns to
# plt.bar draws one bar per row on top of each other, so only the tallest (maximum)
# score in each category would be visible.
mean_scores = test_scores.groupby('school_setting')[['pretest', 'posttest']].mean()

fig, axes = plt.subplots(1, 2, sharey=True, figsize=(10, 4))
for ax, exam in zip(axes, ['pretest', 'posttest']):
    ax.bar(mean_scores.index.astype(str), mean_scores[exam])
    ax.set(title=f'Mean {exam} score', xlabel='school_setting', ylabel='score')
plt.show()

In [None]:
def mad(scores):
    """Return the mean absolute deviation of a score column around its mean."""
    return (scores - scores.mean()).abs().mean()


# The built-in 'mad' aggregation was removed in pandas 2.0, so the helper above replaces
# it (its name keeps the 'mad' column label). String names for max/min avoid the
# FutureWarning pandas emits when builtin callables are passed to agg.
summary_stats = ['max', 'min', 'mean', mad]
test_scores.groupby('school_setting').agg({'pretest': summary_stats, 'posttest': summary_stats}).round(2)

**Thus, students in a suburban school setting do better than students in rural and urban settings**

Analysing the effect of school type on student grades,

In [None]:
# Plot the mean score per school type. Passing the raw per-student columns to
# plt.bar draws one bar per row on top of each other, so only the tallest (maximum)
# score in each category would be visible.
mean_scores = test_scores.groupby('school_type')[['pretest', 'posttest']].mean()

fig, axes = plt.subplots(1, 2, sharey=True, figsize=(10, 4))
for ax, exam in zip(axes, ['pretest', 'posttest']):
    ax.bar(mean_scores.index.astype(str), mean_scores[exam])
    ax.set(title=f'Mean {exam} score', xlabel='school_type', ylabel='score')
plt.show()

In [None]:
def mad(scores):
    """Return the mean absolute deviation of a score column around its mean."""
    return (scores - scores.mean()).abs().mean()


# The built-in 'mad' aggregation was removed in pandas 2.0, so the helper above replaces
# it (its name keeps the 'mad' column label). String names for max/min avoid the
# FutureWarning pandas emits when builtin callables are passed to agg.
summary_stats = ['max', 'min', 'mean', mad]
test_scores.groupby('school_type').agg({'pretest': summary_stats, 'posttest': summary_stats}).round(2)

**The average grade of students in non-public schools is higher than the average grade of students in public schools**

Analysing the effect of gender on student grades,

In [None]:
# Plot the mean score per gender. Passing the raw per-student columns to plt.bar
# draws one bar per row on top of each other, so only the tallest (maximum) score in
# each category would be visible.
mean_scores = test_scores.groupby('gender')[['pretest', 'posttest']].mean()

fig, axes = plt.subplots(1, 2, sharey=True, figsize=(10, 4))
for ax, exam in zip(axes, ['pretest', 'posttest']):
    ax.bar(mean_scores.index.astype(str), mean_scores[exam])
    ax.set(title=f'Mean {exam} score', xlabel='gender', ylabel='score')
plt.show()

In [None]:
def mad(scores):
    """Return the mean absolute deviation of a score column around its mean."""
    return (scores - scores.mean()).abs().mean()


# The built-in 'mad' aggregation was removed in pandas 2.0, so the helper above replaces
# it (its name keeps the 'mad' column label). String names for max/min avoid the
# FutureWarning pandas emits when builtin callables are passed to agg.
summary_stats = ['max', 'min', 'mean', mad]
test_scores.groupby('gender').agg({'pretest': summary_stats, 'posttest': summary_stats}).round(2)

**Gender does not have any effect on student grades**

Analysing the effect of teaching method on student grades,

In [None]:
# Plot the mean score per teaching method. Passing the raw per-student columns to
# plt.bar draws one bar per row on top of each other, so only the tallest (maximum)
# score in each category would be visible.
mean_scores = test_scores.groupby('teaching_method')[['pretest', 'posttest']].mean()

fig, axes = plt.subplots(1, 2, sharey=True, figsize=(10, 4))
for ax, exam in zip(axes, ['pretest', 'posttest']):
    ax.bar(mean_scores.index.astype(str), mean_scores[exam])
    ax.set(title=f'Mean {exam} score', xlabel='teaching_method', ylabel='score')
plt.show()

In [None]:
def mad(scores):
    """Return the mean absolute deviation of a score column around its mean."""
    return (scores - scores.mean()).abs().mean()


# The built-in 'mad' aggregation was removed in pandas 2.0, so the helper above replaces
# it (its name keeps the 'mad' column label). String names for max/min avoid the
# FutureWarning pandas emits when builtin callables are passed to agg.
summary_stats = ['max', 'min', 'mean', mad]
test_scores.groupby('teaching_method').agg({'pretest': summary_stats, 'posttest': summary_stats}).round(2)

**Students taught with the experimental method do better than students taught with the standard method**

Analysing the effect of affordability of lunch on student grades,

In [None]:
# Plot the mean score per lunch category. Passing the raw per-student columns to
# plt.bar draws one bar per row on top of each other, so only the tallest (maximum)
# score in each category would be visible.
mean_scores = test_scores.groupby('lunch')[['pretest', 'posttest']].mean()

fig, axes = plt.subplots(1, 2, sharey=True, figsize=(10, 4))
for ax, exam in zip(axes, ['pretest', 'posttest']):
    ax.bar(mean_scores.index.astype(str), mean_scores[exam])
    ax.set(title=f'Mean {exam} score', xlabel='lunch', ylabel='score')
    # The lunch category labels are long; rotate them so they do not overlap.
    ax.tick_params(axis='x', rotation=30)
plt.show()

In [None]:
def mad(scores):
    """Return the mean absolute deviation of a score column around its mean."""
    return (scores - scores.mean()).abs().mean()


# The built-in 'mad' aggregation was removed in pandas 2.0, so the helper above replaces
# it (its name keeps the 'mad' column label). String names for max/min avoid the
# FutureWarning pandas emits when builtin callables are passed to agg.
summary_stats = ['max', 'min', 'mean', mad]
test_scores.groupby('lunch').agg({'pretest': summary_stats, 'posttest': summary_stats}).round(2)

**Students who do not qualify for reduced-price/free lunch score higher than students who do qualify**

# Test score prediction

Finding relation between posttest score and pretest score

In [None]:
# Pretest is the predictor in the models fitted below, so it goes on the x-axis and
# posttest (the value being predicted) on the y-axis.
x = test_scores['pretest']
y = test_scores['posttest']
plt.scatter(x, y)
plt.xlabel('pretest score')
plt.ylabel('posttest score')
plt.show()

## Test-Train method

In [None]:
# Shuffle with a fixed seed before splitting so the split is reproducible and does not
# depend on the row order of the CSV (e.g. if rows happen to be grouped by school).
shuffled = test_scores.sample(frac=1, random_state=42)
x = shuffled['pretest']
y = shuffled['posttest']

# 80/20 split by position. The previous slices ([:80] for train, [20:] for test)
# overlapped on rows 20-79 and trained on only the first 80 rows of the dataset.
split_at = int(len(shuffled) * 0.8)

train_x, test_x = x.iloc[:split_at], x.iloc[split_at:]
train_y, test_y = y.iloc[:split_at], y.iloc[split_at:]

# Degree-1 polynomial (straight line) fitted to the training rows only.
train_pred = np.poly1d(np.polyfit(train_x, train_y, 1))

plt.scatter(train_x, train_y)
plt.xlabel('pretest score (train)')
plt.ylabel('posttest score (train)')
plt.show()

In [None]:
# Score the fitted line on the held-out test rows. Scoring on the training rows, as
# before, only measures how well the line fits data it has already seen.
relation = r2_score(test_y, train_pred(test_x))

print(relation)

**This method DOES NOT show a strong relation (R² ≈ 56.8%)**

## OLS Regression method

In [None]:
# Ordinary least squares fit of posttest on pretest via the statsmodels formula API.
ols_formula = 'posttest ~ pretest'
model = smf.ols(formula=ols_formula, data=test_scores)
results = model.fit()

# Print the full regression summary table for the fitted model.
print(results.summary())

**R-squared shows a strong relation (90.4%)**

Creating a function for predicting test score:

In [None]:
def OLS_pred(pretest, slope=0.9806, intercept=13.2131):
    """Predict posttest score(s) from pretest score(s) with a straight line.

    Args:
        pretest: Pretest score(s); a number or any object supporting arithmetic
            with floats (e.g. a NumPy array or pandas Series).
        slope: Slope of the line. Defaults to 0.9806, the previously hard-coded
            coefficient (presumably copied from the OLS summary; re-check it if
            the data changes).
        intercept: Intercept of the line. Defaults to 0.9806's companion value
            13.2131, the previously hard-coded constant.

    Returns:
        ``slope * pretest + intercept``, with the same shape as ``pretest``.
    """
    return slope * pretest + intercept

In [None]:
# Predicted vs. actual posttest scores for the OLS line; points close to the diagonal
# are well predicted.
x = OLS_pred(test_scores['pretest'])
y = test_scores['posttest']
plt.scatter(x, y, alpha=0.5, s=10)
plt.xlabel('predicted posttest score (OLS_pred)')
plt.ylabel('actual posttest score')
plt.show()

## Linear Regression method (Scipy stats)

In [None]:
x = test_scores['pretest']
y = test_scores['posttest']

# Least-squares line of posttest on pretest; r is the Pearson correlation coefficient.
slope, intercept, r, p, std_err = stats.linregress(x, y)


def linregress_pred(x):
    """Predict posttest score(s) from pretest score(s) using the fitted line."""
    return slope * x + intercept


# r is a correlation coefficient, not R-squared. Print r**2 as well so the result can be
# compared with the R-squared values reported for the other methods.
print(f"r = {r:.3f}, R-squared = {r**2:.3f}")

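**This shows a very strong relation (correlation coefficient r ≈ 0.950). Note that r is not R-squared: the comparable figure is r² ≈ 0.90, in line with the OLS result, because both methods fit the same least-squares line to the same data.**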

In [None]:
# Predicted vs. actual posttest scores for the linregress line; points close to the
# diagonal are well predicted.
x = linregress_pred(test_scores['pretest'])
y = test_scores['posttest']
plt.scatter(x, y, alpha=0.5, s=10)
plt.xlabel('predicted posttest score (linregress_pred)')
plt.ylabel('actual posttest score')
plt.show()

For prediction by train-test, call **train_pred** (R² ≈ 56.8%)  
For prediction by OLS Regression, call **OLS_pred** (R² ≈ 90.4%)  
For prediction by Linear Regression, call **linregress_pred** (r ≈ 95.0%, i.e. R² ≈ 90%; this is the same least-squares line as OLS, so the two are equally good)  