In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
sns.set()
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found,
# so the available datasets are visible at the top of the notebook.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load raw data

In [None]:
# Location of the CSV, relative to the notebook's working directory.
csv_path = '../input/student-grade-prediction/student-mat.csv'

# Keep the untouched table in `raw_data`; later cells derive `data` from it.
raw_data = pd.read_csv(csv_path)

# Preview the first eight rows.
raw_data.head(n=8)

In [None]:
# (number of rows, number of columns) of the table exactly as loaded.
raw_data.shape

# Explore data

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
# NOTE(review): with default arguments describe() reports numeric columns only
# when the frame mixes numeric and non-numeric columns.
raw_data.describe()

## Searching for missing values

In [None]:
# Number of missing entries in each column (all zeros means nothing to impute).
raw_data.isna().sum()

## Variable of interest

I am going to drop daily and weekend alcohol consumption (`Dalc`, `Walc`), as well as going out (`goout`), because I think they might be related to free time.

In [None]:
# Columns excluded from the model (see the note above this cell).
# Pass the labels directly via `columns=`: the previous form handed a whole
# DataFrame slice to `drop`, which only worked because iterating a DataFrame
# happens to yield its column names.
data = raw_data.drop(columns=['Dalc', 'Walc', 'goout'])
data.head()

In [None]:
# Distribution of the target column G3 before any rows are filtered out.
# `sns.distplot` is deprecated; `histplot` with a KDE overlay on a density
# scale is the documented replacement for the same histogram + curve view.
ax = sns.histplot(data['G3'], kde=True, stat='density')
ax.set_title('Distribution of G3');

In [None]:
# Remove the rows whose G3 is below 1.
# Rebinding `data` (instead of `inplace=True`) keeps the cell free of hidden
# in-place mutation; re-running it on the filtered frame is a no-op.
low_grade_rows = data.index[data['G3'] < 1]
data = data.drop(index=low_grade_rows)
data.describe()

In [None]:
# Distribution of the target column G3 as it stands at this point in the notebook.
# `sns.distplot` is deprecated; `histplot` with a KDE overlay on a density
# scale is the documented replacement for the same histogram + curve view.
ax = sns.histplot(data['G3'], kde=True, stat='density')
ax.set_title('Distribution of G3');

## Make dummy variables

In [None]:
# One-hot encode the categorical columns so the regression receives only
# numeric inputs. `drop_first=True` omits the first level of each category,
# leaving k-1 indicator columns per k-level feature.
data = pd.get_dummies(data=data, drop_first=True)
data.head(n=5)

# Linear regression

In [None]:
# Every column except G3 is a predictor; G3 itself is what the model predicts.
inputs = data.drop(columns='G3')
targets = data['G3']

In [None]:
# Hold out 20% of the rows for testing. The fixed `random_state` makes the
# shuffle, and therefore every downstream number, reproducible across runs.
# (`train_test_split` is imported with the other imports at the top.)
x_train, x_test, y_train, y_test = train_test_split(
    inputs,
    targets,
    test_size=0.2,
    random_state=80,
)

In [None]:
# Ordinary least-squares fit on the training split only.
# `fit` returns the estimator itself, so construction and fitting can be chained.
reg = LinearRegression().fit(x_train, y_train)
reg

In [None]:
# In-sample predictions: the fitted model applied to the same rows it was
# trained on. Used below to inspect fit quality and residuals.
y_hat = reg.predict(x_train)

In [None]:
# Predicted vs. actual on the training split. Points on the diagonal are
# perfect predictions; both axes share the same range so that line is y = x.
plt.scatter(y_train, y_hat)
plt.xlabel('Targets (y_train)', size=18)
# The y-axis shows model predictions, not inputs (label corrected).
plt.ylabel('Predictions (y_hat)', size=18)
plt.xlim(2, 22)
plt.ylim(2, 22)
plt.show()

In [None]:
# Distribution of training residuals (actual minus predicted).
# `sns.distplot` is deprecated; `histplot` with a KDE overlay on a density
# scale is the documented replacement and keeps the "PDF" reading of the y-axis.
ax = sns.histplot(y_train - y_hat, kde=True, stat='density')
ax.set_title('Residuals PDF', size=18);

In [None]:
# R^2 (coefficient of determination) on the training split. This is an
# in-sample figure, so it is an optimistic estimate of real performance.
reg.score(x_train, y_train)

In [None]:
# Pair each input column with its fitted coefficient. The order of `reg.coef_`
# follows the column order of the frame the model was fitted on.
reg_summary = pd.DataFrame({
    'Features': inputs.columns.values,
    'Weight': reg.coef_,
})
reg_summary

# Testing the model

In [None]:
# Out-of-sample predictions: the fitted model applied to the held-out rows.
y_hat_test = reg.predict(x_test)

In [None]:
# Predicted vs. actual on the held-out split.
plt.scatter(y_test, y_hat_test)
plt.xlabel('Targets (y_test)', size=18)
plt.ylabel('Predictions (y_hat_test)', size=18)
# Same range on both axes (matching the training plot) so that the diagonal
# is the y = x line of perfect predictions.
plt.xlim(2, 22)
plt.ylim(2, 22)
plt.show()

`m_pf` stands for "model performance": a table comparing the test-set predictions with the actual targets.

In [None]:
# Start the performance table with the test-set predictions as its only column.
m_pf = pd.DataFrame({'Prediction': y_hat_test})
m_pf.head(n=5)

In [None]:
# Column assignment in pandas aligns on index labels. y_test still carries the
# row labels it had before the train/test split, whereas m_pf has a fresh
# default index, so y_test is renumbered first; otherwise values would be
# matched by label rather than by position.
y_test = y_test.reset_index(drop=True)
m_pf['Targets'] = y_test
m_pf

In [None]:
# Residual = actual - predicted; positive means the model under-predicted.
m_pf['Residuals'] = m_pf['Targets'].sub(m_pf['Prediction'])
m_pf

In [None]:
# Absolute error expressed as a percentage of the actual target value.
m_pf['Differences'] = (m_pf['Residuals'] / m_pf['Targets'] * 100).abs()
m_pf

In [None]:
# Summary statistics of predictions, targets, residuals and percentage
# differences on the test split.
m_pf.describe()

In [None]:
def _two_decimals(value):
    """Render a float with exactly two digits after the decimal point."""
    return '%.2f' % value


# Show every row of the table (up to 999) and round floats to two decimals.
# These are global display options and stay in effect for the rest of the session.
pd.set_option('display.max_rows', 999)
pd.set_option('display.float_format', _two_decimals)

# Best predictions (smallest percentage difference) first.
m_pf.sort_values(by='Differences')