In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the test-score CSV from the Kaggle input directory into a DataFrame.
csv_path = '/kaggle/input/predict-test-scores-of-students/test_scores.csv'
data = pd.read_csv(csv_path)

# 1. Data visualization

In [None]:
# Print a summary of the frame: column names, dtypes, non-null counts and memory usage.
data.info()

In [None]:
# Display the last five rows as a quick sanity check of the loaded data.
data.tail()

In [None]:
# Summary statistics of n_student restricted to the rows where n_student equals 14.
data.loc[data['n_student'] == 14, 'n_student'].describe()

In [None]:
# Descriptive statistics (count, mean, std, quartiles, min/max) of the numeric columns.
data.describe()

## 1.1. General view (pre-test and post-test)

In [None]:
# Side-by-side histograms (with KDE overlay) of the pre-test and post-test scores.
fig, (ax_pre, ax_post) = plt.subplots(1, 2, figsize=(20, 5))

sns.histplot(x=data.pretest, kde=True, ax=ax_pre)
ax_pre.set_title('Pre-test score distribution')

sns.histplot(x=data.posttest, kde=True, ax=ax_post)
ax_post.set_title('Post-test score distribution')

plt.show()

In [None]:
# Overlaid density estimates of pre-test vs. post-test scores.
# `fill=True` replaces the deprecated `shade=True`, which newer seaborn releases no longer accept.
fig, ax = plt.subplots(figsize=(10, 5))
sns.kdeplot(data=data[['pretest', 'posttest']], fill=True, ax=ax)
ax.set_title('Test comparison')
plt.show()

## 1.2. Visualization by school

In [None]:
# Number of rows (students) per school.
fig, ax = plt.subplots(figsize=(20, 5))
sns.countplot(x=data['school'], ax=ax)
ax.set_title('School distribution')
plt.show()

In [None]:
# Average post-test score per school, best first.
# 'posttest' is selected *before* aggregating so that string columns never reach
# mean(); aggregating the whole frame raises a TypeError on pandas >= 2.0.
mean = data.groupby('school')['posttest'].mean().sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(20, 5))
# reset_index() hands seaborn a plain two-column frame; bars keep the sorted row order.
sns.barplot(data=mean.reset_index(), x='school', y='posttest', palette='coolwarm', ax=ax)
ax.set_title('Performance by average score')
ax.set_ylabel('Score')
plt.show()

## 1.3. Visualization by school setting and school type

In [None]:
# List the distinct values that appear in the school_setting column.
data['school_setting'].unique()

In [None]:
# Row counts per school setting (left) and per school type (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))

count_panels = [
    (ax1, 'school_setting', 'School setting distribution'),
    (ax2, 'school_type', 'School type distribution'),
]
for axis, column, title in count_panels:
    sns.countplot(x=data[column], ax=axis)
    axis.set_title(title)

plt.show()

In [None]:
# Mean pre-test (left) and post-test (right) score per school type, one line per
# school setting. Both panels use the same y-range so they can be compared directly.
f, ax = plt.subplots(1, 2, figsize=(17, 6))

score_panels = [
    ('pretest', 'Average pre-test score by school type and setting'),
    ('posttest', 'Average post-test score by school type and setting'),
]
for axis, (score_col, title) in zip(ax, score_panels):
    sns.pointplot(
        data=data,
        x='school_type',
        y=score_col,
        hue='school_setting',
        order=['Public', 'Non-public'],
        ax=axis,
    )
    axis.set_title(title)
    axis.set_ylim(ymin=40, ymax=80)

for axis in ax:
    axis.grid()
plt.show()

## 1.4. Visualization by classroom

In [None]:
# Mean pre-test and post-test score for every classroom (index: classroom).
mean = data.groupby('classroom')[['pretest', 'posttest']].mean()

## Score increase analysis

In [None]:
# Per-classroom improvement: average post-test score minus average pre-test score.
mean['increase'] = mean['posttest'] - mean['pretest']

### Top 10 improvements

In [None]:
# Rank classrooms once by score increase (largest first) and take both ends of the ranking.
ranked = mean.sort_values(by='increase', ascending=False)
top_10 = ranked.head(10)
worst_10 = ranked.tail(10)

f, ax = plt.subplots(2, 1, sharex=True, figsize=(10, 7))

# Draw directly on the axes returned by plt.subplots(). Calling plt.subplot(211/212)
# would bypass them (and their shared x-axis), and assigning the result of
# .set_title() to ax[i] would replace the Axes with a Text object.
sns.barplot(data=top_10.reset_index(), x='increase', y='classroom', ax=ax[0])
ax[0].set_title('Top 10 classroom improvements')

sns.barplot(data=worst_10.reset_index(), x='increase', y='classroom', ax=ax[1])
ax[1].set_title('10 worst classroom improvements')

plt.tight_layout()
plt.show()

## 1.5. Visualization by teaching method

In [None]:
# Mean of the numeric columns per teaching method, without the n_student column.
# numeric_only=True keeps string columns out of mean(); without it pandas >= 2.0
# raises a TypeError instead of silently dropping them.
mean = (
    data.groupby('teaching_method', as_index=False)
    .mean(numeric_only=True)
    .drop('n_student', axis=1)
)
mean


In [None]:
# Long-format table (teaching_method, type, score) of the mean pre-/post-test score
# per teaching method. The values are computed from `data` rather than copied by hand
# from an earlier cell's output, so they stay correct if the dataset changes.
table = (
    data.groupby('teaching_method')[['pretest', 'posttest']]
    .mean()
    .reset_index()
    # One row per (method, test): all pretest rows first, then all posttest rows.
    .melt(id_vars='teaching_method', var_name='type', value_name='score')
)

In [None]:
# Pre-test vs. post-test mean score, one line per teaching method.
plt.figure(figsize=(5, 7))
# Note: set_style changes seaborn's global style, so it also applies to later plots.
sns.set_style(style='whitegrid')
method_ax = sns.pointplot(data=table, x='type', y='score', hue='teaching_method')
method_ax.set_title('Teaching method performances')
plt.show()

## 1.6. Visualization by gender

In [None]:
# Mean of the numeric columns per gender.
# numeric_only=True keeps string columns out of mean(); without it pandas >= 2.0
# raises a TypeError instead of silently dropping them.
mean = data.groupby('gender').mean(numeric_only=True)
mean

In [None]:
# Number of students per gender.
# A count plot is used because the chart is titled as a distribution; plotting the
# per-gender mean of n_student would show an average of that column, not how many
# students of each gender there are.
fig, ax = plt.subplots(figsize=(4, 5))
sns.countplot(x=data['gender'], ax=ax)
ax.set_title('Gender distribution')
plt.show()

# 2. Data preprocessing

In [None]:
# Use student_id as the row index. The guard makes the cell safe to re-run: once
# student_id has become the index it is no longer a column, and an unconditional
# set_index would raise a KeyError.
if 'student_id' in data.columns:
    data = data.set_index('student_id')

# Features: every remaining column except the target. Target: the post-test score.
X = data.drop('posttest', axis=1)
Y = data['posttest']

In [None]:
# Display the first five rows of the frame.
data.head()

### Labeling categorical variables

In [None]:
from sklearn.preprocessing import LabelEncoder

# Categorical feature columns that are replaced by integer codes.
label_variables = [
    'school',
    'school_setting',
    'school_type',
    'classroom',
    'teaching_method',
    'gender',
    'lunch',
]

# Each column gets its own freshly fitted encoder; the column is overwritten in place.
for col in label_variables:
    label_encoder = LabelEncoder()
    label_encoder.fit(X[col])
    X[col] = label_encoder.transform(X[col])

# lblX refers to the same DataFrame object as X (no copy is made).
lblX = X

X

### Applying OneHotEncoder

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

onehotencoder = OneHotEncoder()

# One-hot encode the categorical columns and pass the remaining columns through.
# - Columns are addressed by name (label_variables) instead of hard-coded positions,
#   so the step does not depend on the column order of X and fails loudly if X is no
#   longer a DataFrame (e.g. when this cell is re-run).
# - sparse_threshold=0 makes the transformer always return a dense array, so no
#   .toarray() call is needed (that call breaks when the result is already dense).
col_transformer = ColumnTransformer(
    transformers=[('OneHot', onehotencoder, label_variables)],
    remainder='passthrough',
    sparse_threshold=0,
)

X = col_transformer.fit_transform(X)

### Splitting training and test data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state fixes the shuffle so the split is repeatable.
tX, tY = X, Y
train_x, test_x, train_y, test_y = train_test_split(
    tX,
    tY,
    test_size=0.2,
    random_state=1,
)

# Shapes of the four resulting pieces, in the order train_x, train_y, test_x, test_y.
tuple(part.shape for part in (train_x, train_y, test_x, test_y))

## Machine learning models

In [None]:
# Accumulators filled by the model cells below: one score and one MAE per model.
scores, maes = [], []

### 1. Linear regression

In [None]:
from sklearn.linear_model import LinearRegression

# Fit ordinary least squares on the training split and predict the held-out rows.
lr = LinearRegression()
lr.fit(X=train_x, y=train_y)
lrPred = lr.predict(test_x)

# Actual vs. predicted post-test score for the test rows.
lr_comparison = pd.DataFrame({'Value': test_y, 'Predict': lrPred.ravel()})
lr_comparison

In [None]:
# Score of the linear model on the test split (the value returned by lr.score), recorded for the overview.
s = lr.score(X=test_x, y=test_y)
scores += [s]
s

In [None]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the linear-regression predictions, recorded for the overview.
mae = mean_absolute_error(y_true=test_y, y_pred=lrPred)
maes += [mae]
mae

#### Linear regression
#### score: 0.96
#### mae: 2.29

### 2. Decision tree regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor

# random_state pins the tree's internal randomness so the reported score is
# reproducible across runs (same seed as the train/test split).
treeReg = DecisionTreeRegressor(random_state=1)
treeReg.fit(train_x, train_y)

# Score of the tree on the test split, recorded for the overview.
s = treeReg.score(test_x, test_y)
scores.append(s)
s

In [None]:
# Keep the tree's predictions under their own name: storing them in `lrPred` would
# silently overwrite the linear-regression predictions computed earlier.
treePred = treeReg.predict(test_x)

# Mean absolute error of the decision-tree predictions, recorded for the overview.
mae = mean_absolute_error(test_y, treePred)
maes.append(mae)
mae

#### Decision tree regressor
#### score: 0.93
#### mae: 3.04

### 3. Random forest regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# random_state pins the forest's bootstrap/feature sampling so the reported score is
# reproducible across runs (same seed as the train/test split).
rf = RandomForestRegressor(random_state=1)
rf.fit(train_x, train_y)

# Score of the forest on the test split, recorded for the overview.
s = rf.score(test_x, test_y)
scores.append(s)
s

In [None]:
# Keep the forest's predictions under their own name: storing them in `lrPred` would
# silently overwrite the linear-regression predictions computed earlier.
rfPred = rf.predict(test_x)

# Mean absolute error of the random-forest predictions, recorded for the overview.
mae = mean_absolute_error(test_y, rfPred)
maes.append(mae)
mae

#### Random forest regressor
#### score: 0.95
#### mae: 2.58

## Algorithms overview

In [None]:
# Summary table: one row per model, with its test-set score and mean absolute error.
# The metrics are recomputed from the fitted models instead of being read from the
# append-only `scores`/`maes` lists: re-running any model cell adds extra entries to
# those lists, which misaligns the rows or breaks the three-label index.
fitted_models = {
    'Linear Regression': lr,
    'Decision Tree Regressor': treeReg,
    'Random Forest Regressor': rf,
}
overview_rows = {
    model_name: {
        'Score': model.score(test_x, test_y),
        'MAE': mean_absolute_error(test_y, model.predict(test_x)),
    }
    for model_name, model in fitted_models.items()
}
pd.DataFrame.from_dict(overview_rows, orient='index')