In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found under it.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:

# import standard libraries


import pandas as pd
import os
import sys
import numpy as np
import re

# `display` and `HTML` are part of the public IPython.display module; importing
# `display` from IPython.core.display is deprecated and fails on recent IPython.
from IPython.display import display, HTML

# CSS override: keep <pre> text in output areas on one line instead of wrapping it.
display(HTML("<style>div.output_area pre {white-space: pre;}</style>"))
# %reload_ext sparksql_magic

# Do not truncate the number of rows pandas renders.
pd.set_option('display.max_rows', None)

In [None]:
!pip install pyspark

In [None]:
# coloured text for the section banners printed by later cells
from termcolor import colored, cprint
# import plotting libraries

import seaborn as sns
import matplotlib.pyplot as plt

# silence every warning for the rest of the session
import warnings
warnings.filterwarnings('ignore')

from pyspark.sql import SparkSession
from pyspark.sql.types import *
# NOTE(review): this wildcard import supplies the `col`, `avg`, `count`, `when` and
# `countDistinct` names used by later cells; it presumably also shadows Python builtins
# such as `sum`, `min`, `max` and `round` with the Spark column functions - confirm
# before relying on the builtins.
from pyspark.sql.functions import *
import pyspark
from pyspark.sql import SparkSession
# reuse the active Spark session if there is one, otherwise create it
spark = SparkSession.builder.getOrCreate()

In [None]:
# Never truncate long cell values when pandas renders a frame.
pd.options.display.max_colwidth = None

# Suppress all warnings for the remainder of the notebook.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# train = pd.read_csv('../input/predict-test-scores-of-students')
# Read the CSV data in the dataset folder with Spark; the first line holds the column names.
df = spark.read.csv('../input/predict-test-scores-of-students', header=True)
# Preview the first three rows without truncating cell values.
df.show(n=3, truncate=False)

In [None]:
# Cast the class size and the two test scores to integers, then confirm the schema.
for numeric_col in ('n_student', 'pretest', 'posttest'):
    df = df.withColumn(numeric_col, col(numeric_col).cast('int'))
df.printSchema()

## Data Exploration

In [None]:
# Print the number of distinct values of every column except the student id
# and the pretest / posttest scores.
skipped_cols = {'student_id', 'pretest', 'posttest'}
df_col_list = df.select([name for name in df.columns if name not in skipped_cols])
for column_name in df_col_list.columns:
    banner = '--Showing-- Column --> {}'.format(column_name)
    print(colored(banner, 'blue', attrs=['reverse', 'blink']))
    print('* number of unique value in', column_name)
    distinct_total = df.select(column_name).distinct().count()
    print(distinct_total)

In [None]:
# Per school setting: distinct students, distinct schools, students per school
# and the mean pretest / posttest scores (rounded to two decimals).
setting_totals = df.groupBy('school_setting').agg(
    countDistinct('student_id').alias('ttl_student'),
    countDistinct('school').alias('ttl_school'),
    avg('pretest').cast('decimal(12,2)').alias('avg_pre'),
    avg('posttest').cast('decimal(12,2)').alias('avg_post'),
)
students_per_school = (
    (col('ttl_student') / col('ttl_school')).cast('decimal(12,2)').alias('avg_std/sch')
)
setting_totals.select(
    'school_setting', 'ttl_student', 'ttl_school', students_per_school, 'avg_pre', 'avg_post'
).show(n=100, truncate=False)

In [None]:

# Per (school setting, school type): distinct students, distinct schools and
# students per school, largest student count first.
type_totals = df.groupBy('school_setting', 'school_type').agg(
    countDistinct('student_id').alias('ttl_student'),
    countDistinct('school').alias('ttl_school'),
)
students_per_school = (
    (col('ttl_student') / col('ttl_school')).cast('decimal(12,2)').alias('avg_std/sch')
)
(
    type_totals
    .select('school_setting', 'school_type', 'ttl_student', 'ttl_school', students_per_school)
    .orderBy(col('ttl_student').desc())
    .show(n=100, truncate=False)
)

In [None]:
# Per (school setting, school type): distinct students, schools and classrooms,
# largest student count first.
classroom_totals = df.groupBy('school_setting', 'school_type').agg(
    countDistinct('student_id').alias('ttl_student'),
    countDistinct('school').alias('ttl_school'),
    countDistinct('classroom').alias('ttl_clsrm'),
)
(
    classroom_totals
    .select('school_setting', 'school_type', 'ttl_student', 'ttl_school', 'ttl_clsrm')
    .orderBy(col('ttl_student').desc())
    .show(n=100, truncate=False)
)

In [None]:
# Per (school setting, lunch status): distinct students, distinct schools and
# mean pretest / posttest scores, largest student count first.
lunch_totals = df.groupBy('school_setting', 'lunch').agg(
    countDistinct('student_id').alias('ttl_student'),
    countDistinct('school').alias('ttl_school'),
    avg('pretest').cast('decimal(12,2)').alias('avg_pre'),
    avg('posttest').cast('decimal(12,2)').alias('avg_post'),
)
(
    lunch_totals
    .select('school_setting', 'lunch', 'ttl_student', 'ttl_school', 'avg_pre', 'avg_post')
    .orderBy(col('ttl_student').desc())
    .show(n=100, truncate=False)
)

In [None]:
# Number of distinct teaching methods used in each school, most varied schools first.
methods_per_school = df.groupBy('school').agg(
    countDistinct('teaching_method').alias('num_teachmethod')
)
methods_per_school.orderBy(col('num_teachmethod').desc()).show(n=100, truncate=False)


**Observation : Overall**

* Gender is split in roughly equal proportions across the dataset.
* Urban locations have more students and more schools than the other location types. However, Suburban has the highest average number of students per school (102 students/school).
* 83% of all schools use both the Experimental and the Standard teaching methodology; the exceptions are UAGPU, ANKYI and KZKKE, which use only Standard, and FBUMG, which uses only Experimental teaching.
* Public schools have a wider variety of classrooms than non-public schools, and 74% of students study in public schools.
* ~57% of students are not eligible for free lunch; 38% of them study in Suburban schools.

In [None]:
# Label every student by how the posttest compares with the pretest,
# then count the distinct students per label (largest group first).
score_change = (
    when(col('pretest') > col('posttest'), 'lower')
    .when(col('pretest') == col('posttest'), 'equal')
    .when(col('pretest') < col('posttest'), 'higher')
)
(
    df.withColumn('final', score_change)
    .groupBy('final')
    .agg(countDistinct('student_id').alias('ttl_std'))
    .orderBy(col('ttl_std').desc())
    .show()
)

In [None]:
# For each dimension show the row count, the mean pretest / posttest score and the
# relative growth between them, highest growth first.
col_list = ['school','school_setting','school_type','classroom','teaching_method','n_student','gender','lunch']
# Growth is derived from the two-decimal averages produced by the aggregation below.
relative_growth = (
    ((col('avg_post') - col('avg_pre')) / col('avg_pre')).cast('decimal(12,2)').alias('growth')
)
for dimension in col_list:
    per_group = df.groupBy(dimension).agg(
        count(dimension).cast('decimal(12,2)').alias('ttl'),
        avg('pretest').cast('decimal(12,2)').alias('avg_pre'),
        avg('posttest').cast('decimal(12,2)').alias('avg_post'),
    )
    (
        per_group
        .select(dimension, 'ttl', 'avg_pre', 'avg_post', relative_growth)
        .orderBy(col('growth').desc())
        .show(n=100, truncate=False)
    )


**Observation: pre-post score**
* 99% of students have a better score after study.
* Even though students in Suburban schools tend to have higher pretest and posttest scores, their average score improvement is only 19%, whereas the other two school settings improve by 24%.
* Students taught with the Experimental teaching methodology show a +9% greater score improvement than those taught with the Standard technique.
* While the other variables tend to indicate different effects on posttest score development, gender does not show a significant impact on score development.

**Visualisation**

In [None]:
sns.set_theme(style="darkgrid")

# Convert the Spark DataFrame to pandas for plotting and modelling.
# The isinstance guard keeps the cell re-runnable: after the first run `df` is already
# a pandas frame, which has no `.toPandas()` method.
if not isinstance(df, pd.DataFrame):
    df = df.toPandas()

# Drop "student_id", which the analysis treats as an identifier rather than a feature.
# `columns=` is used because pandas 2.x no longer accepts the axis as a positional argument.
corr_df = df.drop(columns=['student_id'])
corr_df.head()

In [None]:

# Create the correlation matrix between all the features except student id.
col_list = ['school','school_setting','school_type','classroom','teaching_method','n_student'
            ,'gender','lunch','pretest','posttest']
# These columns are real measurements and must keep their values: replacing them with
# category codes would turn scores into per-column ranks, which makes any later
# pretest-vs-posttest comparison (and the regression target) meaningless.
numeric_cols = {'n_student', 'pretest', 'posttest'}

for i in col_list:
    if i in numeric_cols:
        # make sure the column is numeric without altering its values
        corr_df[i] = pd.to_numeric(corr_df[i])
    else:
        # label-encode text columns so they can take part in the correlation
        corr_df[i] = corr_df[i].astype('category').cat.codes
print(colored(' Show correlation between variables ', 'blue', attrs=['reverse', 'blink']))
corr_df.corr()

In [None]:
# Pairwise plots between every pair of columns in corr_df,
# with a kernel density estimate on the diagonal.
sns.pairplot(corr_df, diag_kind='kde')

In [None]:

# Annotated heatmap of the correlation matrix.
# Author's observation: strong connection between pretest score and posttest score.
heatmap_title = ' Show correlation between variables - heatmap '
print(colored(heatmap_title, 'blue', attrs=['reverse', 'blink']))

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_df.corr(), annot=True, ax=ax)


**Observation : correlation between variables**
* Strong connection between pretest and posttest scores.
* School, school_setting, school_type, classroom, n_student, gender and lunch show similar correlation with pretest and with posttest. Only teaching method suggests a stronger connection to posttest than to pretest.

In [None]:

# Count of students per school setting, split by free-lunch eligibility.
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(data=df, x="school_setting", hue="lunch", ax=ax)

In [None]:

# Author's observation: non-public schools tend to have a higher pretest score.
# Scatter of pretest vs posttest, coloured by school_type.
sns.lmplot(x='pretest',y='posttest',data = df,hue ='school_type',height= 6)

In [None]:
# How the teaching method affects pretest and posttest scores:
# scatter of pretest vs posttest, coloured by teaching_method.
sns.lmplot(x='pretest',y='posttest',data = df,hue ='teaching_method',height= 8)

In [None]:
# Summarise, per student, how the posttest result compares with the pretest.
conditions = [
    (corr_df['pretest'] == corr_df['posttest']),
    (corr_df['pretest'] < corr_df['posttest']),
    (corr_df['pretest'] > corr_df['posttest'])]
choices = ['same','higher','lower']
# An explicit string default is required: np.select's implicit default is the integer 0,
# which current NumPy refuses to combine with string choices (TypeError). The default is
# only used for rows where none of the three conditions holds (e.g. a missing score).
corr_df['summary'] = np.select(conditions, choices, default='unknown')
corr_df.head()

From the information above, 99% of students have a better posttest score; hence, remove those whose posttest score is lower than or equal to their pretest score.



In [None]:
# Keep only the students whose posttest is higher than their pretest; the others are
# treated as outliers. The filtered frame is named "clean_df".
clean_df = corr_df[corr_df['summary']=='higher']
# Drop gender (the exploration showed no effect on posttest) and the helper column.
clean_df = clean_df.drop(['gender','summary'],axis=1)

# Apply the theme before the figure is created so it takes effect for this plot.
sns.set_theme(style="darkgrid")
fig, ax = plt.subplots(figsize=(12,8))
# histplot(kde=True, stat='density') replaces the deprecated sns.distplot.
sns.histplot(clean_df["posttest"], kde=True, stat='density', ax=ax)
# Place a tick every 5 units across the observed range rather than a hard-coded 0-80 span.
if not clean_df.empty:
    tick_start = 5 * (int(clean_df["posttest"].min()) // 5)
    tick_stop = int(clean_df["posttest"].max()) + 5
    ax.set_xticks(np.arange(tick_start, tick_stop, 5))

# Model fitting

In [None]:
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn import metrics

In [None]:
# Separate the target (posttest) from the predictors, then hold out 20% of the rows
# for testing with a fixed seed.
y = clean_df['posttest']
x = clean_df.drop(columns=['posttest'])

X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=123
)

**LinearRegression** As the pretest-posttest relationship looks like a straight line, first try linear regression.



In [None]:
#LinearRegression
# Fit on the training split (fit() returns the estimator) and predict the held-out rows.
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)

# score() on a regressor is the R^2 of the held-out predictions, printed as a percentage.
lr_score = lr.score(X_test, y_test)
print('Accuracy - LinearRegression: %.2f%%' % (lr_score * 100))

**GradientBoostingRegressor**

In [None]:
#GradientBoostingRegressor
# 300 boosting stages with a small learning rate and a fixed seed.
gradientBoost = GradientBoostingRegressor(
    n_estimators=300,
    learning_rate=0.03,
    random_state=1234,
)
gradientBoost.fit(X_train, y_train)

# score() on a regressor is the R^2 of the held-out predictions, printed as a percentage.
gb_score = gradientBoost.score(X_test, y_test)
print('Accuracy - GradientBoostingRegressor: %.2f%%' % (gb_score * 100))

**RandomForestRegression**

In [None]:

#random forest regression
# 300 trees, depth capped at 300, fixed seed.
regr = RandomForestRegressor(
    n_estimators=300,
    max_depth=300,
    random_state=0,
)
regr.fit(X_train, y_train)

# score() on a regressor is the R^2 of the held-out predictions, printed as a percentage.
rf_score = regr.score(X_test, y_test)
print('Accuracy - RandomForestRegression: %.2f%%' % (rf_score * 100))

**Multi-layer Perceptron regressor**

In [None]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.datasets import make_regression

#Multi-layer Perceptron regressor
# The model is trained on the notebook's own train/test split. No synthetic data is
# generated in this cell, so `y` keeps pointing at the real posttest target.
# random_state pins weight initialisation and batch shuffling so the reported score is
# reproducible between runs. learning_rate='adaptive' is not passed because
# scikit-learn only uses that schedule with solver='sgd'.
clf = MLPRegressor(hidden_layer_sizes=(1000,), activation='relu', solver='adam', alpha=0.0001
      , batch_size='auto', learning_rate_init=0.005, random_state=0)
clf.fit(X_train,y_train)

# score() on a regressor is the R^2 of the held-out predictions, printed as a percentage.
print('Accuracy - Multi-layer Perceptron regressor: %.2f%%'%(clf.score(X_test, y_test)*100))

In [None]:
# Predict posttest for the held-out rows with the estimator stored in `regr`.
ans = regr.predict(X_test)


In [None]:
# Predicted values as a one-column frame. Column names are given as dict keys / strings,
# never as a set: pandas rejects set-valued `columns` and a set has no defined order.
final_output = pd.DataFrame({"pred_posttest": ans})
# True values, re-indexed from 0 so they line up row-by-row with the predictions.
y_test_df = y_test.reset_index(drop=True).to_frame(name="posttest")
# Export predictions next to the true values.
pd.concat([final_output,y_test_df],axis=1).to_csv('./predict_testscore_output.csv')