In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load dataset and packages

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

# Read the test-scores CSV from the Kaggle input directory into a DataFrame
data = pd.read_csv('/kaggle/input/predict-test-scores-of-students/test_scores.csv')

In [None]:
# List the column names of the loaded dataset
data.columns

# Basic information about the data

In [None]:
# Summary statistics for the dataset's columns
data.describe()

In [None]:
# Preview the first rows of the raw data
data.head()

In [None]:
# (number of rows, number of columns)
data.shape

In [None]:
# Data type of each column
data.dtypes

##### Looks like there are quite a few categorical features in the dataset.

##### Check for any missing values

In [None]:
# Count of missing values in each column
data.isnull().sum()

##### Good news! There are no missing values in the data. Let's plot a histogram of the post-test results.

In [None]:
# Histogram of the post-test scores with a KDE curve on a density scale.
# sns.distplot is deprecated in seaborn and scheduled for removal; histplot with
# kde=True and stat='density' is the documented replacement for this plot.
sns.histplot(data['posttest'], kde=True, stat='density')

## Exploratory Data Analysis

#### Numerical values

In [None]:
# Correlation matrix of the numeric columns only. At this point `data` still holds
# object (string) columns, and DataFrame.corr() raises on those in pandas >= 2.0,
# so restrict the frame to numeric dtypes explicitly.
data_corr = data.select_dtypes(include='number').corr()
fig, ax = plt.subplots(figsize=(8,8))
# Draw on the axes created above rather than relying on the implicit current axes
sns.heatmap(data_corr, annot=True, ax=ax)

In [None]:
# Collect the names of the object-typed (categorical) columns, then print the
# distinct values found in each one
non_num = [name for name, dtype in data.dtypes.items() if dtype == 'object']
for column in non_num:
    print(f"{column}: {data[column].unique()}")

##### Let's look at the relationships between the categorical features and the post-test score

In [None]:
# Show the list of categorical column names collected above
non_num

In [None]:
# Post-test score distribution for each school
fig, ax = plt.subplots(figsize=(15, 15))
sns.boxplot(data=data, x='school', y='posttest', ax=ax)

In [None]:
# Post-test score distribution for each school setting
fig, ax = plt.subplots(figsize=(8, 8))
sns.boxplot(data=data, x='school_setting', y='posttest', ax=ax)

In [None]:
# Post-test score distribution for each school type
fig, ax = plt.subplots(figsize=(8, 8))
sns.boxplot(data=data, x='school_type', y='posttest', ax=ax)

In [None]:
# Post-test score distribution for each classroom (large figure: many categories)
fig, ax = plt.subplots(figsize=(20, 20))
sns.boxplot(data=data, x='classroom', y='posttest', ax=ax)

In [None]:
# Post-test score distribution for each teaching method
fig, ax = plt.subplots(figsize=(8, 8))
sns.boxplot(data=data, x='teaching_method', y='posttest', ax=ax)

In [None]:
# Post-test score distribution by gender
fig, ax = plt.subplots(figsize=(8, 8))
sns.boxplot(data=data, x='gender', y='posttest', ax=ax)

In [None]:
# Post-test score distribution by lunch category
fig, ax = plt.subplots(figsize=(8, 8))
sns.boxplot(data=data, x='lunch', y='posttest', ax=ax)

In [None]:
# Preview the data again before the categorical columns are encoded
data.head()

##### Dummy values approach for categorical features with multiple unique values

In [None]:
# One-hot encode each multi-valued categorical column: build the indicator columns,
# then replace the original column with them (indicators are appended on the right).
dummies_1 = pd.get_dummies(data['school'])
data = pd.concat([data.drop(columns=['school']), dummies_1], axis=1)

dummies_2 = pd.get_dummies(data['school_setting'])
data = pd.concat([data.drop(columns=['school_setting']), dummies_2], axis=1)

dummies_3 = pd.get_dummies(data['classroom'])
data = pd.concat([data.drop(columns=['classroom']), dummies_3], axis=1)

In [None]:
# Label encoder
from sklearn.preprocessing import LabelEncoder
# One encoder instance is refit for each column in turn, so after this cell it
# only remembers the classes of the last column ('lunch').
label_encoder = LabelEncoder()

for column in ('school_type', 'teaching_method', 'gender', 'lunch'):
    data[column] = label_encoder.fit_transform(data[column])

In [None]:
# Preview the data after the categorical columns have been encoded
data.head()

## Model training

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

##### Making a function to print and create a dictionary with errors and r2 score:

In [None]:
def evaluating(y_act,y_pred):
    """Print and return regression metrics for a set of predictions.

    Parameters
    ----------
    y_act : actual target values.
    y_pred : predicted target values.

    Returns
    -------
    dict
        Keys 'MAE', 'MSE' and 'r2', each rounded to three decimal places.
    """
    metric_functions = (
        ('MAE', mean_absolute_error),
        ('MSE', mean_squared_error),
        ('r2', r2_score),
    )
    # Round once and reuse the same values for both the printout and the result
    eval_values = {
        label: round(metric(y_act, y_pred), 3) for label, metric in metric_functions
    }
    print(
        'Results:\n Mean absolute error = ', eval_values['MAE'],
        '\n Mean squared error = ', eval_values['MSE'],
        '\n R2 = ', eval_values['r2'],
    )

    return eval_values

In [None]:
# Split training and testing dataset
# Features: everything except the identifier, the target and 'gender'
X = data.drop(['student_id', 'posttest', 'gender'], axis=1)
y = data['posttest']

from sklearn.model_selection import train_test_split

# Fixed seed so the same rows land in train/test on every run and the reported
# metrics are reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Standard scale the features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Fit the scaler on the training split only and reuse its statistics for the test
# split. Previously both X_train and X_test were overwritten with the *entire* X
# scaled, which leaked test rows into the scaler and left the feature frames with a
# different number of rows than y_train / y_test.
# The original index is kept so the rows stay aligned with y_train / y_test.
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns, index=X_test.index)

In [None]:
# Split training and testing dataset, then standardise the features
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Fixed seed so the reported metrics are reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# This split is taken from the unscaled X, so standardisation has to be applied to
# it here; otherwise the models below are trained on raw features. The scaler is
# fitted on the training rows only and the row index is preserved so the features
# stay aligned with y_train / y_test.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns, index=X_test.index)

#### Linear Regression

In [None]:
# Fit a linear regression on the training split, predict the held-out split and
# score the predictions (fit() returns the estimator itself, so it can be chained)
model_lin = LinearRegression().fit(X_train, y_train)
predictions_lin = model_lin.predict(X_test)
score_lim = evaluating(y_test, predictions_lin)

#### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# NOTE(review): LogisticRegression is a classifier, so the post-test scores are used
# as class labels here and the predicted labels are then scored with regression
# metrics. Presumably intended as a baseline comparison; confirm this is wanted.
model_log = LogisticRegression(solver='liblinear').fit(X_train, y_train)
predictions_log = model_log.predict(X_test)
score_log = evaluating(y_test, predictions_log)

In [None]:
# Print the metric dictionaries of both models one after the other for comparison
for label, scores in (("Linear Regression: ", score_lim), ("Logistic Regression: ", score_log)):
    print(label, scores)