In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from matplotlib import pyplot as plt
import plotly.express as px
import seaborn as sns
import warnings 
# Install a blanket "ignore" filter: warnings raised after this point are not shown in cell output.
warnings.simplefilter("ignore")

### Read Dataset

In [None]:
# Read the test-score CSV from the attached Kaggle dataset and preview the first rows.
dataset_df = pd.read_csv("/kaggle/input/predict-test-scores-of-students/test_scores.csv")
dataset_df.head()

### Data exploration

In [None]:
# Count of missing values per column.
dataset_df.isnull().sum()

In [None]:
# Number of rows that duplicate an earlier row.
dataset_df.duplicated().sum()

In [None]:
# Summary statistics (pandas describes the numeric columns by default).
dataset_df.describe()

In [None]:
# Column dtypes and non-null counts.
dataset_df.info()

In [None]:
# Column labels of the raw frame.
dataset_df.columns

In [None]:
# Column groups referenced by index/name in the exploration cells that follow.
categorical_columns = [
    'school',
    'school_setting',
    'school_type',
    'classroom',
    'teaching_method',
    'gender',
    'lunch',
]

numerical_columns = [
    'n_student',
    'pretest',
    'posttest',
]

In [None]:
# For each of the first five categorical columns, print its distinct values
# followed by the share of rows taken by every value.
# (The last two entries, gender and lunch, are printed by the cells further down.)
for column_name in categorical_columns[:5]:
    print(dataset_df[column_name].unique())
    print(dataset_df[column_name].value_counts()/len(dataset_df))

### Visualization

In [None]:
# Distinct values and per-value share of rows for the sixth and seventh
# categorical columns (gender and lunch).
for column_name in categorical_columns[5:7]:
    print(dataset_df[column_name].unique())
    print(dataset_df[column_name].value_counts()/len(dataset_df))

In [None]:
# Pairwise plots of the numeric columns, with points coloured by gender.
pairplot_columns = [*numerical_columns, "gender"]
sns.pairplot(dataset_df[pairplot_columns], hue="gender")

In [None]:
gender_df = dataset_df.groupby(["gender"])["posttest"].mean().reset_index()
gender_df.plot.bar(x="gender", y='posttest', title='Dependence of gender on the mean test result')

In [None]:
school_df = dataset_df.groupby(["school_type"])["posttest"].mean().reset_index()
school_df.plot.bar(x="school_type", y='posttest', title='Dependence of the type of school on the mean test result')

In [None]:
teaching_method_df = dataset_df.groupby(["teaching_method"])["posttest"].mean().reset_index()
teaching_method_df.plot.bar(x="teaching_method", y='posttest', title='Dependence of the teaching method on the mean test result')

In [None]:
# Pie chart of the summed posttest scores per school setting
# (slices show each setting's share of the total, labelled as percentages).
posttest_sum_by_setting = dataset_df.groupby(['school_setting'])["posttest"].sum()
posttest_sum_by_setting.plot(
    kind='pie',
    subplots=True,
    shadow=True,
    startangle=90,
    figsize=(15,10),
    autopct='%1.1f%%',
)

In [None]:
# Mean posttest score for every class size (n_student).
# Stored under its own name: the original reused `teaching_method_df`, which
# silently replaced the teaching-method means computed in an earlier cell.
class_size_df = dataset_df.groupby(["n_student"])["posttest"].mean().reset_index()

# Marker area and colour both follow the mean score. The original coloured the
# points with unseeded random numbers, so the figure changed on every run and
# the colours carried no information.
size = class_size_df["posttest"].values
plt.scatter(class_size_df["n_student"], class_size_df["posttest"], s=size, c=size)
plt.xlabel("n_student")
plt.ylabel("mean posttest")
plt.title("Mean posttest by class size");

In [None]:
# Re-display the column labels before building the encoded correlation frame.
dataset_df.columns

In [None]:
# Integer codes substituted for each categorical label so that .corr() can use these columns.
corr_columns = {
    'school_setting': {'Urban': 0, 'Suburban': 1, 'Rural': 2},
    'school_type': {'Public': 0, 'Non-public': 1},
    'teaching_method': {'Standard': 0, 'Experimental': 1},
    'lunch': {'Does not qualify': 0, 'Qualifies for reduced/free lunch': 1},
    'gender': {'Female': 0, 'Male': 1},
}

# Drop school, student_id and classroom, then replace the labels with their codes.
corr_df = dataset_df.drop(columns=['school', 'student_id', 'classroom'])
corr_df = corr_df.replace(corr_columns)

corr_df.head()

In [None]:
# Pairwise correlations of the encoded frame, drawn as a heatmap.
corr = corr_df.corr()
# Boolean mask that is True on the upper triangle (diagonal included); those cells are hidden.
mask = np.triu(np.ones_like(corr, dtype=bool))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
# vmax=.3 caps the colour scale at 0.3; center=0 puts the palette midpoint at zero.
heatmap_options = dict(
    mask=mask,
    cmap=cmap,
    vmax=.3,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
)
sns.heatmap(corr, **heatmap_options)

In [None]:
# Preview the raw frame once more before it is reduced for modelling.
dataset_df.head()

### Preparing Dataset for model

In [None]:
dataset_df = dataset_df.drop(["student_id", "school", "classroom"], axis=1)
dataset_df.head()

In [None]:
dataset_df.info()

In [None]:
# Features are every column except the last; the last column is the target.
# The previous slice `0:8` covered all eight columns left after the drop above
# (assuming the frame holds only the columns named in categorical_columns,
# numerical_columns and student_id), so the target itself was part of X and
# every model was trained on the answer it was supposed to predict.
X = dataset_df.iloc[:, :-1].values
# presumably posttest, given the column lists above — confirm with dataset_df.columns
y = dataset_df.iloc[:, -1].values

In [None]:
from sklearn.model_selection import train_test_split
# Shuffle and hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
    shuffle=True,
)

In [None]:
from sklearn.preprocessing import LabelEncoder

# One encoder per encoded feature position (columns 0, 1, 2, 4 and 5 —
# presumably the string-valued ones). They are fitted on the training rows
# only and kept under their own names so the test split can reuse them.
col0_LE, col1_LE, col2_LE, col4_LE, col5_LE = (LabelEncoder() for _ in range(5))

for column_index, encoder in ((0, col0_LE), (1, col1_LE), (2, col2_LE), (4, col4_LE), (5, col5_LE)):
    X_train[:, column_index] = encoder.fit_transform(X_train[:, column_index])

In [None]:
# Encode the same columns of the test split with the encoders fitted on the training split.
for column_index, encoder in ((0, col0_LE), (1, col1_LE), (2, col2_LE), (4, col4_LE), (5, col5_LE)):
    X_test[:, column_index] = encoder.transform(X_test[:, column_index])

In [None]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the training features only, then apply them to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')
y_train = y_train.reshape(-1,1).astype('float32')
y_test = y_test.reshape(-1,1).astype('float32')

In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

### Building the models

In [None]:
from sklearn.metrics import r2_score, explained_variance_score, mean_squared_error

def print_score(y_pred, y_real):
    """Print MSE, R^2 and explained variance of predictions against observed values.

    Parameters
    ----------
    y_pred : array-like
        Values predicted by a model.
    y_real : array-like
        Observed target values for the same samples.

    Notes
    -----
    scikit-learn's regression metrics are declared as ``metric(y_true, y_pred)``.
    MSE is symmetric in its arguments, but R^2 and explained variance are
    normalised by the variance of the first argument, so the observed values
    must be passed first. The previous argument order measured the ground
    truth against the predictions instead.
    """
    print("mean_squared_error:", mean_squared_error(y_real, y_pred))
    print("r2_score:", r2_score(y_real, y_pred))
    print("explained_variance_score:", explained_variance_score(y_real, y_pred))

In [None]:
from sklearn.model_selection import GridSearchCV

def get_trained_grid(model, grid_params, x_train, y_train, refit=True, cv=10, verbose=1):
    """Run a cross-validated grid search for ``model`` and return the fitted search.

    ``grid_params`` is handed to GridSearchCV as the parameter grid; ``refit``,
    ``cv`` and ``verbose`` are forwarded unchanged. The returned object is the
    GridSearchCV instance after ``fit(x_train, y_train)``.
    """
    search = GridSearchCV(
        estimator=model,
        param_grid=grid_params,
        refit=refit,
        cv=cv,
        verbose=verbose,
    )
    # fit() returns the search object itself.
    return search.fit(x_train, y_train)

In [None]:
def get_grid_best_params(grid):
    """Print the best parameter set and the best estimator of a fitted search.

    Despite the name, nothing is returned; both values are written to stdout,
    ``best_params_`` first and ``best_estimator_`` second.
    """
    for fitted_attribute in ("best_params_", "best_estimator_"):
        print(getattr(grid, fitted_attribute))

In [None]:
def print_grid_performance(grid, x_test, y_test):
    """Predict ``x_test`` with the fitted search and print its scores against ``y_test``."""
    print_score(grid.predict(x_test), y_test)

In [None]:
from sklearn.linear_model import SGDClassifier

%time
grid_params = { "loss": ["hinge", "log", "modified_huber"],
               "penalty": ["l1", "l2", "elasticnet"]   
}

grid = get_trained_grid(SGDClassifier(), grid_params, X_train, y_train)
get_grid_best_params(grid)

In [None]:
print_grid_performance(grid, X_test, y_test)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

%time
grid_params = { "n_neighbors": np.arange(1,50)}

grid = get_trained_grid(KNeighborsClassifier(), grid_params, X_train, y_train)
get_grid_best_params(grid)

In [None]:
print_grid_performance(grid, X_test, y_test)

In [None]:
from sklearn.naive_bayes import GaussianNB

%time
grid_params = { "var_smoothing": [1e-09] }

grid = get_trained_grid(GaussianNB(), grid_params, X_train, y_train)
get_grid_best_params(grid)

In [None]:
print_grid_performance(grid, X_test, y_test)

In [None]:
from sklearn.tree import DecisionTreeClassifier

%time
grid_params = {'criterion': ["gini", "entropy"], 
              'splitter': ['best', 'random'], 
              'max_depth': [3,4,None], 
              'min_samples_split':[2, 4, 6],
              'min_samples_leaf':[1,2,3]}

grid = get_trained_grid(DecisionTreeClassifier(), grid_params, X_train, y_train)
get_grid_best_params(grid)

In [None]:
print_grid_performance(grid, X_test, y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier

%time
grid_params = {'n_estimators': [10, 20, 50], 
              'max_features': ['auto', 'sqrt', 'log2'], 
              'bootstrap': [True, False], 
              'criterion':['entropy', 'gini']}

grid = get_trained_grid(RandomForestClassifier(), grid_params, X_train, y_train)
get_grid_best_params(grid)

In [None]:
print_grid_performance(grid, X_test, y_test)

In [None]:
from sklearn import svm

%time
grid_params = { "kernel": ["linear", "poly", "rbf", "sigmoid"],
               "degree": [1, 2 ,3, 4, 5, 6] }

grid = get_trained_grid(svm.SVR(), grid_params, X_train, y_train)
get_grid_best_params(grid)

In [None]:
print_grid_performance(grid, X_test, y_test)

In [None]:
from sklearn.naive_bayes import BernoulliNB

%time
grid_params = {'alpha': [0.25, 0.5, 1]}

grid = get_trained_grid(BernoulliNB(), grid_params, X_train, y_train)
get_grid_best_params(grid)

In [None]:
print_grid_performance(grid, X_test, y_test)

In [None]:
from xgboost import XGBClassifier

%time
grid_params = {'learning_rate': [0.01, 0.05, 0.1], 
              'eval_metric': ['error']}

grid = get_trained_grid(XGBClassifier(), grid_params, X_train, y_train)
get_grid_best_params(grid)

In [None]:
print_grid_performance(grid, X_test, y_test)

In [None]:
!pip install livelossplot

In [None]:
# Array shapes fed to the neural network below (build_model reads X_train.shape[1]).
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
from sklearn.model_selection import cross_val_score
from keras.models import Sequential
from keras.layers import Dense, Dropout, Input

def build_model(optimizer="adam"):
    """Create and compile the dense regression network.

    The input width is read from the global ``X_train`` (its second dimension).
    Architecture: Dense(32) -> Dropout(0.5) -> Dense(16) -> Dropout(0.5) ->
    Dense(8) -> Dense(1), with sigmoid activations on the hidden layers and a
    linear single-unit output. Compiled with mean squared error as both the
    loss and the reported metric.
    """
    n_features = X_train.shape[1]
    layers = [
        Dense(32, input_dim=n_features, activation="sigmoid"),
        Dropout(0.5),
        Dense(16, activation="sigmoid"),
        Dropout(0.5),
        Dense(8, activation="sigmoid"),
        Dense(1),
    ]
    network = Sequential(layers)
    network.compile(optimizer=optimizer, loss="mean_squared_error", metrics=["mse"])
    return network

In [None]:
# Build the network with the default optimizer and print its layer summary.
model = build_model()
model.summary()

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from livelossplot import PlotLossesKerasTF

batch_size = 16
epochs = 200

# save_best_only=True: the file is overwritten only when val_loss improves.
# Without it the checkpoint was rewritten after every epoch, so
# "Best_model_params.h5" simply held the last epoch.
checkpoint = ModelCheckpoint("Best_model_params.h5", monitor='val_loss', save_best_only=True)
# restore_best_weights=True: once training stops, the in-memory model is rolled
# back to its best epoch instead of staying `patience` epochs past it.
es = EarlyStopping(monitor='val_loss', mode='min', patience=10, restore_best_weights=True)
callbacks = [PlotLossesKerasTF(), es, checkpoint]

# Validation data is carved out of the training rows (Keras takes the last 20%).
# Monitoring the test set here let it drive early stopping and checkpointing,
# so the score printed on that same test set afterwards was optimistically biased.
history = model.fit(X_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    callbacks=callbacks,
                    validation_split=0.2,
                    verbose=1)

In [None]:
# Predict the test features with the trained network and print the regression metrics.
yhat = model.predict(X_test)
print_score(yhat, y_test)