In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from pylab import rcParams
import warnings
warnings.filterwarnings('ignore')

# Data Exploration

In [None]:
# Load the test-scores CSV; the path is relative to the notebook's working directory.
data = pd.read_csv('../input/predict-test-scores-of-students/test_scores.csv')

In [None]:
# Preview the first rows to see which columns are available.
data.head()

In [None]:
# Summary statistics for every column; include='all' also covers non-numeric columns.
data.describe(include='all')

It seems like there are no null values, since the count is the same for every column. Just in case, let's check explicitly:

In [None]:
# Count missing values per column.
data.isna().sum()

In [None]:
# Number of distinct values in each column (shows the cardinality of each feature).
data.nunique()

# **Data Visualization**

* *School*

> Shows how many students they have for each School within this data

In [None]:
# Number of rows per school, most frequent first.
data['school'].value_counts()

> The average posttest score for each school

In [None]:
# Average posttest score per school.
scores = data.groupby('school')['posttest'].mean()
plt.figure(figsize=(24,8))
# Take the labels from the grouped result itself: groupby() sorts its keys, whereas
# data['school'].unique() returns them in order of appearance, so pairing the two
# could attach a school's name to another school's average.
sns.barplot(x=scores.index, y=scores.values)

> The smallest school in this data has 41 students, which I considered enough for its average posttest score to be meaningful. <br> <br>The plot clearly shows that each school has its own distinct average posttest score, so school can be a factor that affects the posttest prediction. <br><br> Planning to do 'One-hot encoding' for this feature

* *School Setting & School Type Visualization*

> Ratio

In [None]:
# Count the school types within each school setting; the counts sit in a single
# 'Count' column indexed by (school_setting, school_type).
setting = (
    data.groupby('school_setting')['school_type']
    .value_counts()
    .to_frame(name='Count')
)
# Flatten both index levels into ordinary columns so plotly can use them as the path.
df1 = setting.reset_index()
fig = px.sunburst(
    df1,
    path=['school_setting', 'school_type'],
    values='Count',
    color_discrete_sequence=px.colors.sequential.Blackbody,
    height=400,
    width=500,
)
fig.show()

* *Other Categorical Features Visualization (Average posttest scores)*

> I thought the type of Classroom does not affect the posttest prediction, so I excluded from the categorical features list

In [None]:
# Low-cardinality categorical features whose average posttest score is compared below.
slist = ['school_setting', 'school_type', 'teaching_method', 'gender', 'lunch']

In [None]:
fig, ax = plt.subplots(2,3,figsize=(15,10))

def graph(x, i):
    """Draw the average posttest score per category of column `x` on axes `i`.

    Parameters
    ----------
    x : str
        Name of a categorical column in `data`.
    i : matplotlib.axes.Axes
        Axes to draw the bar chart on.
    """
    # Use the grouped result for both labels and heights. groupby() sorts its keys
    # while data[x].unique() keeps order of appearance, so mixing the two could
    # label a bar with the wrong category.
    means = data.groupby(x)['posttest'].mean()
    sns.barplot(x=means.index, y=means.values, ax=i, palette='nipy_spectral')
    i.set_title(x+' (Average Posttest Score)', fontsize=11, fontdict={"fontweight": "bold"})

    # Write the (truncated) bar height just above each bar.
    for p in i.patches:
        text = str(int(p.get_height()))
        i.annotate(text, (p.get_x()+p.get_width()/2, p.get_height()+1),
                   ha="center", va='center', fontsize=10, fontweight="bold")

# One panel per feature in slist, filling the grid row by row.
for feature, axis in zip(slist, ax.flat):
    graph(feature, axis)
# Five features on a 2x3 grid leave the last panel empty; remove it.
fig.delaxes(ax[1,2])

> Except for the 'gender' feature, the Average Posttest score is different for each column values, so I think these features are useful for posttest prediction

* *Number of Students in Class*

In [None]:
# Mean posttest score for each class size, computed once and reused for both axes.
mean_by_class_size = data.groupby('n_student')['posttest'].mean()
sns.lineplot(x=mean_by_class_size.index, y=mean_by_class_size)

> It is interesting to see that the average posttest score tends to decrease as the number of students in the class increases. We can infer from this graph that students in smaller classes tend to get higher posttest grades than students in larger classes, possibly because they can communicate more easily with the professor, get individual help or feedback on their assignments, and build a closer relationship with the professor.

* *Classroom Visualization*

In [None]:
# Average posttest score per classroom.
cl_scores = data.groupby('classroom')['posttest'].mean()
plt.figure(figsize=(24,8))
# Labels come from the grouped index so each bar is paired with its own classroom;
# data['classroom'].unique() is in order of appearance and does not match the
# sorted order produced by groupby().
sns.barplot(x=cl_scores.index, y=cl_scores.values)

> It looks like the classroom affects the posttest score, so I might need to include this feature to predict the posttest score. <br> <br> Since this feature has high cardinality, I am planning to apply TargetEncoder to it, which is designed for categorical features with high cardinality.

# **Data Preprocessing**

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from numpy import asarray
from sklearn.preprocessing import StandardScaler
from category_encoders import TargetEncoder

> Dropping **'student_id'**, and **'gender'** features. **'student_id'** works as an index and **'gender'** feature does not affect the posttest grade as we can see it from the graph above.

In [None]:
# Feature matrix: every column except the identifier, gender and the target.
X = data.drop(columns=['student_id', 'gender', 'posttest'])
# Target: the posttest score.
y = data.loc[:, 'posttest']
display(X.head(), y.head())

* *Frequency Encoding ('School', 'Classroom' features)*

In [None]:
# Replace each category by its relative frequency (rows in that category / total rows).
f_list = ['school', 'classroom']
for col in f_list:
    X[col] = X[col].map(X[col].value_counts() / len(X))

X.head()

* *Binary Encoding('school_setting', 'school_type', 'teaching_method', 'lunch')*

In [None]:
# Replace each listed categorical feature with its LabelBinarizer encoding.
# NOTE(review): LabelBinarizer returns one column per class when a feature has more
# than two classes, and that result cannot be assigned to a single column — confirm
# that every feature in b_list really has exactly two values.
b_list = ['school_setting', 'school_type', 'teaching_method', 'lunch']
b_encoder = preprocessing.LabelBinarizer()
for x in b_list:
    # The encoder is re-fitted for each feature, so one instance can be reused.
    a = b_encoder.fit_transform(X[x])
    X[x] = a
X.head()

* *Standard Scaling*

In [None]:
# Standardise each numeric feature independently (zero mean, unit variance).
scale_col = ['n_student', 'pretest']
for col in scale_col:
    # StandardScaler expects a 2-D input, so pass an (n, 1) array and flatten the result.
    X[col] = StandardScaler().fit_transform(X[[col]].to_numpy()).ravel()
X.head()

# **Machine Learning**

In [None]:
from sklearn.linear_model import LogisticRegression, SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVC
from xgboost import XGBRegressor
from sklearn.metrics import classification_report, confusion_matrix, mean_absolute_error, roc_curve, roc_auc_score, r2_score, mean_squared_error
from sklearn import metrics

In [None]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=41)

In [None]:
# Model 1: LogisticRegression fitted directly on the posttest scores.
# NOTE(review): LogisticRegression is a classifier, so every distinct posttest value
# is treated as a separate class here — confirm this is intended rather than a
# linear regressor.
model1 = LogisticRegression(max_iter=1000).fit(X_train, y_train)
pred1 = model1.predict(X_test)
acc1, mae1, mse1 = (
    metric(y_test, pred1)
    for metric in (r2_score, mean_absolute_error, mean_squared_error)
)
# "Accuracy" in the printout is the R^2 score shown as a percentage.
print('Accuracy: {:.2f} \nMAE: {:.2f} \nMSE: {:.2f}'.format(acc1*100, mae1, mse1))

In [None]:
# Model 2: k-nearest-neighbours regressor with default settings.
model2 = KNeighborsRegressor().fit(X_train, y_train)
pred2 = model2.predict(X_test)
acc2, mae2, mse2 = (
    metric(y_test, pred2)
    for metric in (r2_score, mean_absolute_error, mean_squared_error)
)
# "Accuracy" in the printout is the R^2 score shown as a percentage.
print('Accuracy: {:.2f} \nMAE: {:.2f} \nMSE: {:.2f}'.format(acc2*100, mae2, mse2))

In [None]:
# Model 3: decision tree limited to depth 10 with at least 15 samples per leaf.
model3 = DecisionTreeRegressor(max_depth=10, min_samples_leaf=15).fit(X_train, y_train)
pred3 = model3.predict(X_test)
acc3, mae3, mse3 = (
    metric(y_test, pred3)
    for metric in (r2_score, mean_absolute_error, mean_squared_error)
)
# "Accuracy" in the printout is the R^2 score shown as a percentage.
print('Accuracy: {:.2f} \nMAE: {:.2f} \nMSE: {:.2f}'.format(acc3*100, mae3, mse3))

In [None]:
# Model 4: random forest regressor.
# A fixed random_state makes the bootstrap samples, and therefore the reported
# metrics and the comparison tables, reproducible across Restart & Run All.
model4 = RandomForestRegressor(random_state=41)
model4.fit(X_train, y_train)
pred4 = model4.predict(X_test)
acc4 = r2_score(y_test, pred4)
mae4 = mean_absolute_error(y_test, pred4)
mse4 = mean_squared_error(y_test, pred4)
# "Accuracy" in the printout is the R^2 score shown as a percentage.
print('Accuracy: {:.2f} \nMAE: {:.2f} \nMSE: {:.2f}'.format(acc4*100, mae4, mse4))

In [None]:
# Model 5: support vector machine (SVC) with default settings.
model5 = SVC()
model5.fit(X_train, y_train)
# Predict with model5 itself. The original called model6, which is only created in
# the next cell: a NameError on a fresh run, and XGBoost's predictions otherwise.
pred5 = model5.predict(X_test)
acc5 = r2_score(y_test, pred5)
mae5 = mean_absolute_error(y_test, pred5)
mse5 = mean_squared_error(y_test, pred5)
# "Accuracy" in the printout is the R^2 score shown as a percentage.
print('Accuracy: {:.2f} \nMAE: {:.2f} \nMSE: {:.2f}'.format(acc5*100, mae5, mse5))

In [None]:
# Model 6: gradient-boosted trees (XGBoost) with default settings.
model6 = XGBRegressor().fit(X_train, y_train)
pred6 = model6.predict(X_test)
acc6, mae6, mse6 = (
    metric(y_test, pred6)
    for metric in (r2_score, mean_absolute_error, mean_squared_error)
)
# "Accuracy" in the printout is the R^2 score shown as a percentage.
print('Accuracy: {:.2f} \nMAE: {:.2f} \nMSE: {:.2f}'.format(acc6*100, mae6, mse6))

# **Model Comparison Table**

In [None]:
# One row per model with its R^2 score, best model first.
acc_table = pd.DataFrame(
    [
        ('Logistic Regression', acc1),
        ('KNN', acc2),
        ('Decision Tree', acc3),
        ('Random Forest Tree', acc4),
        ('SVC', acc5),
        ('XGB', acc6),
    ],
    columns=['Model', 'Accuracy Score'],
).sort_values(by='Accuracy Score', ascending=False)
acc_table.style.background_gradient(cmap='Blues')

In [None]:
# One row per model with its error metrics, lowest MSE first.
error_table = pd.DataFrame(
    [
        ('Logistic Regression', mse1, mae1),
        ('KNN', mse2, mae2),
        ('Decision Tree', mse3, mae3),
        ('Random Forest Tree', mse4, mae4),
        ('SVC', mse5, mae5),
        ('XGB', mse6, mae6),
    ],
    columns=['Model', 'MSE', 'MAE'],
).sort_values(by='MSE', ascending=True)
error_table.style.background_gradient(cmap='Blues')

# **Best Model Parameter Tuning (KNN)**

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import RepeatedKFold


# posttest is a numeric target scored with R^2, so use plain repeated K-fold
# (stratified splitting is designed for class labels).
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# Tune the same estimator that is evaluated elsewhere in the notebook. The original
# referenced KNeighborsClassifier, which is never imported and is not the regressor
# whose hyper-parameters are being searched.
knn = KNeighborsRegressor()

# Hyper-parameter grid explored exhaustively by GridSearchCV.
space = {
    'n_neighbors': [4,5,6,7,8,10],
    'weights': ['uniform', 'distance'],
    'leaf_size': [10,20,30,40,50],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
}

search = GridSearchCV(knn, space, scoring='r2', n_jobs=-1, cv=cv)
result = search.fit(X_train,y_train)
print('Best Score: %s' %result.best_score_)
print('Best HyperParameters: %s' %result.best_params_)

In [None]:
# KNN regressor with hand-picked hyper-parameters.
t_model = KNeighborsRegressor(leaf_size=45, n_neighbors=8, n_jobs=-1, p=1)
t_model.fit(X_train, y_train)
t_pred = t_model.predict(X_test)
t_acc = r2_score(y_test, t_pred)
t_mae = mean_absolute_error(y_test, t_pred)
t_mse = mean_squared_error(y_test, t_pred)
# "Accuracy" in the printout is the R^2 score shown as a percentage.
print('Accuracy: {:.2f} \nMAE: {:.2f} \nMSE: {:.2f}'.format(t_acc*100, t_mae, t_mse))
# Measure the improvement against the untuned KNN baseline (acc2/mae2/mse2).
# The original subtracted the Decision Tree's metrics (acc3/mae3/mse3).
print('Improvement \nAcc: {:.2f} \nMAE: {:.2f} \nMSE: {:.2f}'.format(t_acc*100-acc2*100, t_mae - mae2, t_mse - mse2))

> By changing parameters with the best parameters I got from the GridSearchCV, my model did not improve so I decided to change parameters on my own just in case. I do not know why my GridSearchCV did not work...

# **KNN Visualization**

> Since KNN cannot be visualized in multidimension, we need to find out two features that are correlated the most <br>

> Feature to Feature Correlations - A higher value indicates that the two features are similar (redundant). Therefore, the lower the value, the better.

> Feature to Outcome Correlations - Higher value indicates the importance of feature

In [None]:
# Encoded features plus the target, for one combined correlation matrix.
h_data = X.join(data['posttest'].to_frame())
# Compute the correlation matrix once and reuse it for both the mask and the heatmap.
corr = h_data.corr()
plt.figure(figsize=(15,8))
# Hide the upper triangle (including the diagonal), which only mirrors the lower one.
# The builtin `bool` replaces the `np.bool` alias, which was removed from NumPy.
mask = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(data=corr,annot=True,cmap='BrBG',mask=mask)

> I chose the features lunch and pretest because, even though their feature-to-feature correlation is -0.62, which is pretty high, both of their feature-to-outcome correlations are decently high (-0.6, 0.95)

In [None]:
from matplotlib.colors import ListedColormap
from sklearn.metrics import accuracy_score, classification_report
# filter warnings
warnings.filterwarnings("ignore")

def accuracy(k, ls, nj, p, X_train, y_train, X_test, y_test):
    """Fit a KNN regressor on the training split and return its R^2 on the test split.

    k, ls, nj and p are passed to KNeighborsRegressor as n_neighbors, leaf_size,
    n_jobs and p respectively.
    """
    regressor = KNeighborsRegressor(n_neighbors=k, leaf_size=ls, n_jobs=nj, p=p)
    predictions = regressor.fit(X_train, y_train).predict(X_test)
    return r2_score(y_test, predictions)

def classify_and_plot(X, y):
    '''
    Split the data, fit a two-feature KNN regressor, plot its prediction
    surface and print its R^2 score.

    The name is kept for existing callers; the target is numeric, so a
    regressor is fitted rather than a classifier.

    Parameters
    ----------
    X : numpy array with two columns (indexed as X[:, 0] and X[:, 1])
        First column is drawn on the x-axis, second on the y-axis.
    y : numpy array of target values, one per row of X
    '''
    # split data into training and testing set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

    # model hyper-parameters and plot resolution
    n_neighbors = 8
    leaf_size = 45
    n_jobs = -1
    p = 1
    h = .02  # step size in the mesh

    rcParams['figure.figsize'] = 5, 5

    # KNeighborsRegressor is what the notebook imports and what accuracy() scores;
    # the original referenced KNeighborsClassifier, which is never imported.
    reg = KNeighborsRegressor(n_neighbors=n_neighbors, leaf_size=leaf_size, n_jobs=n_jobs, p=p)
    reg.fit(X_train, y_train)

    # Predict on a regular mesh covering [x_min, x_max] x [y_min, y_max]
    # to visualise the fitted surface.
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = reg.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

    # One continuous colour scale shared by the surface and the observed points:
    # the target is numeric, so a two-colour class map would hide almost all of it.
    vmin, vmax = np.min(y), np.max(y)
    plt.figure()
    plt.pcolormesh(xx, yy, Z, cmap='viridis', vmin=vmin, vmax=vmax, alpha=0.4)

    # Overlay the observed points, coloured by their actual target value.
    points = plt.scatter(X[:, 0], X[:, 1], c=y, cmap='viridis', vmin=vmin, vmax=vmax,
                         edgecolor='k', s=20)
    plt.colorbar(points, label='target value')
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.title("KNN regression surface (k = %i)" % (n_neighbors))
    plt.show()

    # print results ("Accuracy" is the R^2 score shown as a percentage)
    print('----------------------------------------------------------------------')
    print('Accuracy = {:.2f}'.format(accuracy(n_neighbors, leaf_size, n_jobs, p, X_train, y_train, X_test, y_test)*100))
    print('----------------------------------------------------------------------')

# Keep only the two chosen features (lunch, pretest) for the 2-D KNN plot.
# Columns are selected by name and every row is used, instead of relying on
# hard-coded column positions and a hard-coded row count.
rows_nbr = h_data.shape[0]
X_prime  = h_data[['lunch', 'pretest']].to_numpy()
y_prime  = h_data['posttest'].to_numpy()

# The arrays get their own names so the full feature matrix `X` and target `y`
# are not overwritten; earlier cells can then still be re-run after this one.
# fit, evaluate and plot results
classify_and_plot(X_prime, y_prime)

> With only two features (lunch, pretest), the accuracy for KNN model was around 90% which is pretty high