In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the test-scores CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv("/kaggle/input/predict-test-scores-of-students/test_scores.csv")

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Shape after dropping every row that has a missing value; the author observed
# that nothing is dropped, i.e. df contains no missing values.
df.dropna().shape

In [None]:
# Bar chart of the number of distinct values per column.
df.nunique().plot.bar()

In [None]:
# Keep the teaching_method column under a short name for later inspection.
tm = df["teaching_method"]
# Number of distinct values in each column.
df.nunique()

In [None]:
# How many rows use each teaching method.
tm.value_counts()

In [None]:
# Number of rows per school.
df["school"].value_counts()

In [None]:
# Number of rows per value of the lunch column.
df["lunch"].value_counts()

In [None]:
# One-hot encode school_setting into school_setting_<value> indicator columns;
# all other columns are carried over unchanged.
df2 = pd.get_dummies(
    data=df,
    columns=["school_setting"],
    prefix=["school_setting"],
)
df2

In [None]:
# Names of the indicator columns created for school_setting.
list1 = [
    "school_setting_Rural",
    "school_setting_Suburban",
    "school_setting_Urban",
]

In [None]:
# Correlation of each school_setting indicator column with the posttest score.
posttest_scores = df2["posttest"]
for column in list1:
    print(df2[column].corr(posttest_scores))


In [None]:
# Highest score achieved on each test.
pretest, posttest = df["pretest"], df["posttest"]
pretest.max(), posttest.max()

In [None]:
# Mean score on each test, used below as the "above average" threshold.
avg_pre, avg_post = pretest.mean(), posttest.mean()

In [None]:
# Number of students scoring strictly above the mean on each test.
(pretest > avg_pre).sum(), (posttest > avg_post).sum()

In [None]:
# Number of students who achieved the top score on each test.
# The maxima are taken from the data rather than hard-coded (previously 93 and 100),
# so the counts stay correct if the dataset changes.
(pretest == pretest.max()).sum(), (posttest == posttest.max()).sum()

In [None]:
# Row of the (first) student with the highest pretest score, looked up by position.
df.iloc[pretest.argmax()]

In [None]:
# best posttest students: select the rows whose posttest equals the highest
# posttest score in the data. A boolean mask replaces the previous
# "drop every other row by index label" approach and the hard-coded 100.
df[df["posttest"] == df["posttest"].max()]

In [None]:
# Summary statistics for the numeric columns.
df.describe()


# **Visualisation**

In [None]:
# One count plot per categorical column, laid out row by row on a 2x2 grid.
# (`plt` is bound to the top-level matplotlib package in this notebook, hence `plt.pyplot`.)
list2 = [df.school_setting, df.school_type, df.teaching_method, df.gender]
fig, axes = plt.pyplot.subplots(2, 2, figsize=(18, 10))
for axis, column in zip(axes.flat, list2):
    sns.countplot(ax=axis, x=column)


In [None]:
# Apply the seaborn style BEFORE creating the figure: style settings are picked up
# when axes are created, so setting them after subplots() would leave this first
# figure unstyled while later figures get the dark grid.
sns.set_style("darkgrid",{"axes.facecolor": ".9"})
fig,(ax1,ax2) = plt.pyplot.subplots(1,2,figsize=(12,6))
# Left: pretest scores split by gender; right: overall pretest histogram with a KDE curve.
sns.boxplot(x=df.pretest,y = df.gender,ax =ax1)
sns.histplot(x = df.pretest, ax =ax2,kde = True)

In [None]:
# Posttest scores: box plot per gender (left) and overall histogram with a KDE curve (right).
fig, (ax1, ax2) = plt.pyplot.subplots(nrows=1, ncols=2, figsize=(12, 6))
sns.boxplot(x=df["posttest"], y=df["gender"], ax=ax1)
sns.histplot(x=df["posttest"], kde=True, ax=ax2)

In [None]:
# Line plot of pretest and posttest against the row index; the two curves
# appear to move together, suggesting a strong correlation between the scores.
score_columns = ["pretest", "posttest"]
df[score_columns].plot.line(figsize=(35, 15))

In [None]:
# Correlation coefficient between pretest and posttest scores; the author notes
# it confirms the impression from the plot above.
df["pretest"].corr(df["posttest"]) # we were right

# Machine learning part

In [None]:
# Quick look at the data again before building the model.
df.head()

The dataset contains some categorical features, so we have to encode them before we can continue.

In [None]:
# Split the data into features (X) and target (y).
# 'classroom' and 'student_id' are dropped from the features along with the target.
X = df.drop(['posttest','classroom','student_id'], axis = 1)
y = df['posttest']

# Convert categorical values to numbers
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

categories = ['school', 'school_setting', 'school_type', 'teaching_method', 'gender', 'lunch']

# One-hot encode the categorical columns; every other column is passed through unchanged.
one_hot = OneHotEncoder()
transformer = ColumnTransformer([('one_hot', one_hot, categories)],
                                remainder = 'passthrough')

X_transformed = transformer.fit_transform(X)

# Split the transformed data to training and test sets.
# A fixed random_state makes the split (and therefore the score) reproducible across runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size = 0.2, random_state = 42)

# Import the Random Forest regressor
from sklearn.ensemble import RandomForestRegressor

# Seed the forest as well so repeated runs build the same trees.
model = RandomForestRegressor(random_state = 42)
model.fit(X_train, y_train)

# Score the model (R^2 on the held-out test set)
model.score(X_test, y_test)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Re-split with an 18% test share; random_state is fixed so the split is reproducible.
X_train,X_test,y_train,y_test = train_test_split(X_transformed,y,test_size = 0.18,random_state = 42)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
# Fit a fresh forest on the current training split; random_state is fixed so the
# fitted model and its score are reproducible across runs.
model = RandomForestRegressor(random_state = 42)
model.fit(X_train,y_train)
# R^2 on the current test split
model.score(X_test,y_test)

In [None]:
# Cross-validate the model on the full encoded dataset (default folds and scoring)
# and report the mean score across folds.
cv_score = cross_val_score(estimator=model, X=X_transformed, y=y)
cv_score.mean()

# Tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Rebuild features/target so this cell does not depend on the earlier modelling cells' X/y.
X = df.drop(['posttest','classroom','student_id'], axis = 1)
y = df['posttest']

# Convert categorical values to numbers
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

categories = ['school', 'school_setting', 'school_type', 'teaching_method', 'gender', 'lunch']

# One-hot encode the categorical columns; every other column is passed through unchanged.
one_hot = OneHotEncoder()
transformer = ColumnTransformer([('one_hot', one_hot, categories)],
                                remainder = 'passthrough')

X_transformed = transformer.fit_transform(X)

# Split the transformed data to training and test sets (seeded for reproducibility)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size = 0.2, random_state = 42)

# Search space for the random forest.
# max_features: 'auto' is no longer accepted by RandomForestRegressor in current
# scikit-learn; for regressors it meant "use all features", which is spelled 1.0.
grid = {"n_estimators" : [1,10,50,100,200,500,800,1200,3000],'max_depth' : [None,5,10,20,30],'max_features' : [1.0, 'sqrt']}

# Randomized search: 15 sampled parameter combinations, 7-fold CV each.
# `model` is the RandomForestRegressor created in an earlier cell.
# random_state fixes which combinations are sampled.
tuned_model = RandomizedSearchCV(estimator = model,param_distributions = grid,n_iter=15,cv=7,verbose=2,random_state=42 )
tuned_model.fit(X_train,y_train)

In [None]:
# Hyperparameter combination selected by the randomized search.
tuned_model.best_params_

In [None]:
# Predict posttest scores for the held-out test split with the tuned search object.
y_pred = tuned_model.predict(X_test)

In [None]:
from sklearn.metrics import explained_variance_score, mean_squared_error, r2_score
# Coefficient of determination of the tuned model's predictions on the test split.
r2_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Mean squared error of the tuned model's predictions on the test split.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mse