# Inspiration

Inspired and motivated by: https://www.kaggle.com/oscardavidperilla/regression-for-note-predictions

# Importing the libraries

In [None]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use("dark_background")

import tensorflow as tf

from sklearn.metrics import explained_variance_score, mean_absolute_error
from sklearn.model_selection import train_test_split

from sklearn.linear_model import Lasso, LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

# Loading the data

In [None]:
# Read the student test-score CSV into a DataFrame and preview the first rows.
df = pd.read_csv(
    filepath_or_buffer="../input/predict-test-scores-of-students/test_scores.csv"
)
df.head()

# Exploring the data

Knowledge about the data you are working on is very important for data analysis.

What I have used:

- [pandas.DataFrame.info](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.info.html)

- [pandas.DataFrame.isna](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.isna.html)

- [pandas.DataFrame.describe](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.describe.html)

- [pandas.DataFrame.columns](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.columns.html)

In [None]:
# Print a concise summary of the DataFrame (columns, non-null counts, dtypes).
df.info()

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Show descriptive statistics for the DataFrame's columns.
df.describe()

In [None]:
# List the column labels currently present in the DataFrame.
df.columns

# Dropping columns

By looking at the data we can see that the features `school`, `classroom` and `student_id` don't have any effect on our label `posttest`, so we can safely drop them using [pandas.DataFrame.drop](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop.html).

In [None]:
# Drop the columns that will not be used as model features,
# then display the remaining column labels.
df = df.drop(columns=["school", "classroom", "student_id"])
df.columns

# Exploring Categorical Data

In [None]:
# For every categorical column, print how often each category occurs
# (absolute count and percentage of rows), in order of first appearance.
# A blank line separates the columns.
for x in ["school_setting", "school_type", "teaching_method", "gender"]:
    # Build both frequency tables once per column instead of recomputing
    # them for every single category value inside the inner loop.
    counts = df[x].value_counts()
    shares = df[x].value_counts(normalize=True)
    for val in df[x].unique():
        count = counts[val]
        percent = shares[val] * 100
        print(f"{val} - Count: {count}, Percentage: {percent:.2f}%")
    print()

# Visualizing

In [None]:
# Correlation matrix of the numeric features (the label `posttest` excluded).
# The frame still contains text columns at this point, and pandas >= 2.0
# raises when DataFrame.corr() meets non-numeric data, so keep only the
# numeric columns before computing correlations.
corr = df.drop("posttest", axis=1).select_dtypes(include="number").corr()

# Annotated heatmap with thin black lines separating the cells.
sns.heatmap(corr, annot=True, linewidths=.6, linecolor="black")
plt.show()

In [None]:
# Pairwise relationship plots of the DataFrame, coloured by `gender`.
sns.pairplot(data=df, hue="gender", palette="Set2")
plt.show()

# One Hot Encoding

Most machine learning models don't like text. What do they like? Numbers. **One Hot Encoding** is an important part of **feature engineering**: it converts categorical data into numbers so that it can be fed to our machine learning model.

Example: If we have two colors `red` and `green` and we want to represent `red` we could do something like

```
+---+-----+
|red|green|
+---+-----+
| 1 |  0  |
+---+-----+
```
These are often referred to as "dummy variables".


We can do this by using [pandas.get_dummies](https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html)

Alternate: [sklearn.preprocessing.OneHotEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html)

Extra Resource: https://www.educative.io/blog/one-hot-encoding

In [None]:
# One-hot encode every text column.
# pandas >= 2.0 creates boolean dummy columns by default; a frame mixing
# bool and numeric columns converts to an object array, which can break
# the Keras model fitted later on. Pinning uint8 (the pre-2.0 default)
# keeps the encoded frame purely numeric on every pandas version.
df = pd.get_dummies(df, dtype="uint8")
df.head()

In [None]:
# Replace whitespace in column names with underscores.
# Applied to every column rather than to the last two by position, so the
# result does not depend on the order in which the dummy columns happen to
# be generated; names without spaces are left unchanged.
df.rename(columns=lambda name: "_".join(name.split()), inplace=True)
df.head()

# Building Regression Models

## Creating features (X) and label (y)

Features are often referred to as "independent variables" and Label is often referred to as "dependent variable".

Here `posttest` is our label because it depends on other features.

In [None]:
# Label: the post-test score. Features: every remaining column.
y = df["posttest"]
X = df.drop(columns="posttest")

## Splitting the data in training and testing set

- Training data set is used for fitting our model to learn the patterns.
- Testing data set is used for prediction and unbiased evaluation of our final model

We can do this by using [sklearn.model_selection.train_test_split](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html)

Training data set - 80% of the total data

Testing data set - 20% of the total data

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
def score(y_test, y_pred):
    """Print evaluation metrics for a set of predictions.

    Prints the explained variance (as a percentage) and the mean absolute
    error (rounded to two decimals) of `y_pred` against `y_test`.

    Returns the explained variance in percent.
    """
    explained_variance = 100 * explained_variance_score(y_test, y_pred)
    mae = round(mean_absolute_error(y_test, y_pred), 2)

    report = f"Explained Variance: {explained_variance:.2f}%\nMAE: {mae:.2f}"
    print(report)

    return explained_variance

In [None]:
# One slot per model; each slot is filled in when that model is evaluated.
accuracy_scores = np.zeros(shape=(6,), dtype=np.float64)

## Linear Regression

In [None]:
# Linear regression: fit on the training split, evaluate on the test split,
# and store the returned score in slot 0.
reg = LinearRegression()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
accuracy_scores[0] = score(y_test, y_pred)

## Lasso Regressor

In [None]:
# Lasso with default settings: fit, predict on the test split,
# and store the returned score in slot 1.
reg1 = Lasso()
reg1.fit(X_train, y_train)
y_pred1 = reg1.predict(X_test)
accuracy_scores[1] = score(y_test, y_pred1)

## Decision Tree Regressor

In [None]:
# Decision tree: fit, predict on the test split, and store the returned
# score in slot 2.
# A fixed random_state makes the fitted tree (and therefore its score)
# repeatable between runs, matching the seed used for the train/test split
# and the random forest.
reg2 = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
y_pred2 = reg2.predict(X_test)
accuracy_scores[2] = score(y_test, y_pred2)

## Support Vector Regressor

In [None]:
# Support vector regressor with default settings: fit, predict on the
# test split, and store the returned score in slot 3.
reg3 = SVR()
reg3.fit(X_train, y_train)
y_pred3 = reg3.predict(X_test)
accuracy_scores[3] = score(y_test, y_pred3)

## Random Forest Regressor

In [None]:
# Random forest of 100 trees with a fixed seed: fit, predict on the
# test split, and store the returned score in slot 4.
reg4 = RandomForestRegressor(n_estimators=100, random_state=42)
reg4.fit(X_train, y_train)
y_pred4 = reg4.predict(X_test)
accuracy_scores[4] = score(y_test, y_pred4)

## Neural Network Regression

In [None]:
# Seed TensorFlow's random number generator before building the model.
tf.random.set_seed(42)

# Two hidden ReLU layers of 64 units feeding a single output unit.
reg5 = tf.keras.Sequential()
reg5.add(tf.keras.layers.Dense(64, activation="relu"))
reg5.add(tf.keras.layers.Dense(64, activation="relu"))
reg5.add(tf.keras.layers.Dense(1))

# Optimize mean absolute error with Adam, and track MAE as a metric too.
reg5.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.1),
    loss=tf.keras.losses.mae,
    metrics=["mae"],
)

# Train silently for 100 epochs; keep the training history for inspection.
history = reg5.fit(X_train, y_train, epochs=100, verbose=0)

In [None]:
# Evaluate the trained network on the held-out test split (MAE loss and MAE metric).
reg5.evaluate(X_test, y_test)

In [None]:
# Predict on the test split with the network and store the returned
# score in the last slot (index 5).
y_pred5 = reg5.predict(x=X_test)
accuracy_scores[5] = score(y_test=y_test, y_pred=y_pred5)

# Conclusion

In [None]:
# Display names, in the same order in which the scores were stored
# in `accuracy_scores` (indices 0-5).
models = [
    "Linear Regression",
    "Lasso Regressor",
    "Decision Tree Regressor",
    "Support Vector Regressor",
    "Random Forest Regressor",
    "Neural Network Regression",
]

plt.figure(figsize=(10, 6))
sns.barplot(x=models, y=accuracy_scores)

plt.xlabel("Model Name")
plt.xticks(rotation=-90)
# `accuracy_scores` holds the values returned by score(), i.e. the
# explained variance in percent, so label the axis with the actual metric.
plt.ylabel("Explained Variance (%)")

plt.show()

Explained Variance: 94.76%

MAE: 2.54