In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings
# NOTE(review): with no category or module given, this hides every warning for
# the rest of the session, deprecation notices included — consider narrowing it.
warnings.filterwarnings('ignore')

In [None]:
# Relative path to the raw test-score CSV.
scores_csv_path = '../input/predict-test-scores-of-students/test_scores.csv'
data = pd.read_csv(scores_csv_path)

In [None]:
# Preview the first rows of the raw frame.
data.head()

In [None]:
# Column dtypes and non-null counts of the raw frame.
data.info()

In [None]:
# Summary statistics (describe() defaults to the numeric columns when any exist).
data.describe()

In [None]:
%matplotlib inline
data.hist(figsize=(10,7))
plt.show()

In [None]:
# Number of rows recorded for each school.
data["school"].value_counts()

In [None]:
# Average post-test and pre-test score for each school.
score_columns = ["posttest", "pretest"]
data.groupby("school")[score_columns].mean()

In [None]:
# Remove the student_id and classroom columns from the working frame in one call.
data.drop(columns=["student_id", "classroom"], inplace=True)

In [None]:
# Preview the frame again now that student_id and classroom are gone.
data.head()

### Pretest seems to be the number one predictor of posttest

In [None]:
# Bucket pretest scores into four ordinal bands; the band is only a helper
# column for the stratified train/test split and is dropped afterwards.
pretest_bin_edges = [20., 40., 60., 80., np.inf]
pretest_bin_labels = [1, 2, 3, 4]
# NOTE(review): pd.cut intervals are right-closed by default, so a pretest of
# 20 or below would become NaN — confirm against the data's minimum.
data["pretest_cat"] = pd.cut(data["pretest"], bins=pretest_bin_edges, labels=pretest_bin_labels)

data["pretest_cat"].hist()
plt.show()

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
# Hold out 10% of the rows as a test set, stratified on the pretest band so
# that both splits keep the same pretest distribution.
split = StratifiedShuffleSplit(test_size=0.1, n_splits=1, random_state=40)

# n_splits=1, so the generator yields exactly one (train, test) index pair.
train_idx, test_idx = next(split.split(data, data["pretest_cat"]))

# split() returns positional indices, so select rows with .iloc rather than
# the label-based .loc (which is only equivalent for a default RangeIndex).
# .copy() makes both frames independent of `data`.
train_data = data.iloc[train_idx].copy()
test_data = data.iloc[test_idx].copy()


In [None]:
# Share of training rows that fall into each pretest band.
band_counts = train_data["pretest_cat"].value_counts()
band_counts / len(train_data)

In [None]:
# The stratification helper column is not needed in either split any more.
for split_frame in (test_data, train_data):
    split_frame.drop(columns="pretest_cat", inplace=True)

In [None]:
# Work on a deep copy so exploration never touches train_data itself.
scores = train_data.copy(deep=True)

---

In [None]:
# Column dtypes and non-null counts of the training copy.
scores.info()

In [None]:
# Pairwise correlations between the numeric columns. Selecting numeric dtypes
# first avoids the error that pandas 2.x raises when corr() meets non-numeric
# columns, and works on older pandas versions as well.
scores.select_dtypes(include="number").corr()

In [None]:
# Mean posttest score under each teaching method.
posttest_by_method = scores.groupby("teaching_method")["posttest"]
posttest_by_method.mean()

In [None]:
import seaborn as sns

# Posttest density broken down by each categorical feature, one panel per feature.
hue_columns = ["teaching_method", "school_type", "school_setting"]
fig, axes = plt.subplots(1, len(hue_columns), figsize=(15, 7))

for ax, hue_column in zip(axes, hue_columns):
    # `fill=True` is the supported spelling of the deprecated `shade=True`.
    sns.kdeplot(data=scores, x="posttest", hue=hue_column, fill=True, ax=ax)
    # Each panel gets a title so the figure can be read on its own.
    ax.set_title(f"posttest by {hue_column}")

plt.tight_layout()
plt.show()

### Prepare Data

In [None]:
# Separate the regression target (posttest) from the predictor columns.
labels = scores["posttest"]
scores_data = scores.drop(columns=["posttest"])

In [None]:
# Partition the predictors into three groups: the two numeric columns, the
# `school` column on its own, and every remaining column.
numeric_columns = ["n_student", "pretest"]
school_columns = ["school"]

scores_num = scores_data[numeric_columns]
scores_cat2 = scores_data[school_columns]
scores_cat1 = scores_data.drop(columns=numeric_columns + school_columns)
scores_cat1.info()

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler 
from sklearn.pipeline import Pipeline
# Numeric preprocessing: fill missing values with the column median, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)


In [None]:
from sklearn.compose import ColumnTransformer
import category_encoders as ce
# Column names routed to each branch of the preprocessing transformer.
num_atts = scores_num.columns.tolist()
cat_atts = scores_cat1.columns.tolist()
cat2_atts = scores_cat2.columns.tolist()

# Numeric columns go through num_pipeline, the low-cardinality categoricals
# are one-hot encoded, and `school` is binary encoded.
column_branches = [
    ('num', num_pipeline, num_atts),
    ('cat', OneHotEncoder(), cat_atts),
    ('cat2', ce.BinaryEncoder(), cat2_atts),
]
full_pipeline = ColumnTransformer(transformers=column_branches)

# Fit the preprocessing on the training predictors and encode them in one step.
data_prepared = full_pipeline.fit_transform(scores_data)

In [None]:
# Inspect one encoded row (index 1) as a sanity check on the transformer output.
data_prepared[1]

### Find a model

In [None]:
from sklearn.linear_model import LinearRegression

# Baseline model: ordinary least squares on the prepared training matrix.
model_lr = LinearRegression()
model_lr.fit(X=data_prepared, y=labels)

In [None]:
# Spot-check: predictions for the first five training rows next to the true values.
first_five_rows = data_prepared[:5]
labels_predicts = model_lr.predict(first_five_rows)

print("Predict: ", labels_predicts)
print("Actual: ", list(labels[:5]))

In [None]:
from sklearn.metrics import mean_squared_error

# Root-mean-squared error of the linear model on the rows it was trained on.
train_predictions = model_lr.predict(data_prepared)
mse = mean_squared_error(labels, train_predictions)
print(np.sqrt(mse))

In [None]:
from sklearn.model_selection import cross_val_score

def display_scores(scores):
    """Print an array of per-fold scores with its mean and standard deviation.

    Parameters
    ----------
    scores : array-like exposing ``.mean()`` and ``.std()``
        Per-fold values, e.g. the RMSE of each cross-validation fold.

    Returns
    -------
    None
        The summary is written to stdout.
    """
    summary_rows = (
        ("Scores:", scores),
        ("Mean:", scores.mean()),
        ("Standard deviation:", scores.std()),
    )
    for label, value in summary_rows:
        print(label, value)

# 10-fold cross-validated RMSE for the linear model. scikit-learn reports the
# negated MSE (so that greater is better); flip the sign before the root.
# Dedicated names are used so that the `scores` training DataFrame is not
# overwritten by the fold results and `mse` is not reused for RMSE values.
lr_neg_mse = cross_val_score(model_lr, data_prepared, labels, scoring="neg_mean_squared_error", cv=10)
lr_rmse = np.sqrt(-lr_neg_mse)
display_scores(lr_rmse)

### Another Model: Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Decision tree regressor. A fixed random_state makes the fit reproducible:
# scikit-learn permutes the candidate features at every split, so ties can
# otherwise be broken differently from run to run.
model_tree = DecisionTreeRegressor(random_state=40)
model_tree.fit(data_prepared, labels)

In [None]:
# Root-mean-squared error of the tree on the rows it was trained on.
tree_train_predictions = model_tree.predict(data_prepared)
mse = mean_squared_error(labels, tree_train_predictions)
print(np.sqrt(mse))

In [None]:
# 10-fold cross-validated RMSE for the decision tree. scikit-learn reports the
# negated MSE, so the sign is flipped before taking the root.
# Dedicated names keep the `scores` training DataFrame from being overwritten
# and avoid storing RMSE values under the name `mse`.
tree_neg_mse = cross_val_score(model_tree, data_prepared, labels, scoring="neg_mean_squared_error", cv=10)
tree_rmse = np.sqrt(-tree_neg_mse)
display_scores(tree_rmse)

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest ensemble. This section must build a RandomForestRegressor;
# constructing a DecisionTreeRegressor here would simply repeat the previous
# experiment. random_state pins the bootstrap sampling so results are repeatable.
model_rf = RandomForestRegressor(random_state=40)
model_rf.fit(data_prepared, labels)

# 10-fold cross-validated RMSE (scikit-learn reports negated MSE, hence the
# sign flip). Dedicated names keep the `scores` training DataFrame intact.
rf_neg_mse = cross_val_score(model_rf, data_prepared, labels, scoring="neg_mean_squared_error", cv=10)
rf_rmse = np.sqrt(-rf_neg_mse)
display_scores(rf_rmse)

### Final model and Test

In [None]:
# Final evaluation of the linear model on the held-out test split.
X_test = test_data.drop(columns=["posttest"])
y_test = test_data["posttest"]

# Only transform here. The preprocessing was already fitted on the training
# predictors; re-fitting it on the test set would leak test statistics into
# the scaler and could change the encoded column layout the model was
# trained on, making the predictions meaningless.
X_test_prepared = full_pipeline.transform(X_test)

mse = mean_squared_error(y_test, model_lr.predict(X_test_prepared))
print(np.sqrt(mse))