In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input directory,
# then print them one per line in walk order.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%notebook inline

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split

# Reading file and checking head

In [None]:
# Location of the students-performance CSV inside the Kaggle input directory.
csv_path = "/kaggle/input/students-performance-in-exams/StudentsPerformance.csv"
df = pd.read_csv(csv_path)

# Show (rows, columns) first, then preview the first rows as the cell output.
print(df.shape)
df.head()

# The dataset contains multiple score columns, so let's first encode the categorical values using the label encoder and then find the correlations

In [None]:
# Frequency table for every object-dtype column, each followed by blank lines.
object_columns = df.select_dtypes("object").columns
for col in object_columns:
    print(df[col].value_counts(), "\n", sep="\n")

In [None]:
# Work on a copy so the raw frame `df` stays untouched.
df1 = df.copy()

enc = LabelEncoder()
dict_ls = []  # one {code: original label} mapping per encoded column, in column order

for col in df1.select_dtypes("object"):
    # The encoder is refit for each column, so its classes_ must be captured
    # right after the fit, before the next column overwrites them.
    df1[col] = enc.fit_transform(df1[col])
    dict_ls.append(dict(enumerate(enc.classes_)))

df1.head()

In [None]:
# Show each column's code -> label mapping, separated by blank lines.
for mapping in dict_ls:
    print(mapping, "\n", sep="\n")

In [None]:
import seaborn as sns

# Pairwise correlations between all (now numeric) columns of the encoded frame,
# drawn as an annotated heatmap on an explicitly created 10x10 inch axes.
corr = df1.corr()
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(corr, annot=True, ax=ax)

# From the above heatmap we can see the scores are positively correlated to each other.

# So, we can use 2 score cols to find the value of another. We can also use race and lunch cols

In [None]:
# Keep the race/ethnicity and lunch columns together with the three score columns.
df2 = df1.drop(columns=["gender", "parental level of education", "test preparation course"])

# Alternative (scores only): comment out the line above and enable the two lines below.
#df2 = df1.copy()
#df2.drop(df2.iloc[:,:5], axis=1, inplace=True)

df2.head()

# Now we have to train our models 3 times as there are 3 cols whose value we wish to predict

In [None]:
# Registry of the regressors to compare. The keys are display labels, left-padded
# with spaces so the printed names line up; insertion order is the print order.
models = dict(
    [
        ("                     Linear Regression", LinearRegression()),
        (" Linear Regression (L2 Regularization)", Ridge()),
        (" Linear Regression (L1 Regularization)", Lasso()),
    ]
)

In [None]:
def splitData(df, col):
    """Separate a target column from the remaining feature columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the features and the target.
    col : str
        Name of the target column.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``df`` without ``col`` and ``y`` is ``df[col]``.
        ``df`` itself is not modified.
    """
    target = df[col]
    features = df.drop(columns=col)
    return features, target

# One (features, target) pair per score column of the label-encoded frame.
(X1, y1), (X2, y2), (X3, y3) = (
    splitData(df2, target)
    for target in ("math score", "reading score", "writing score")
)

In [None]:
def tr_ts_split(X,y):
    """Split features/target into train and test parts and standardize the features.

    The data is split first (80% train / 20% test, ``random_state=101``) and the
    ``StandardScaler`` is then fitted on the training rows only, so no statistics
    from the test rows leak into the training features. The scaled arrays are
    wrapped back into DataFrames that keep the column names and row index of the
    frame that was passed in (not those of any notebook-level variable).

    Parameters
    ----------
    X : pandas.DataFrame
        Feature columns.
    y : pandas.Series
        Target values, row-aligned with ``X``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``; the two feature frames are scaled,
        the targets are returned unchanged.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 101)

    sc = StandardScaler()

    # Fit on the training rows only; the test rows are transformed with the
    # training mean/variance.
    X_train = pd.DataFrame(sc.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
    X_test = pd.DataFrame(sc.transform(X_test), columns=X_test.columns, index=X_test.index)

    return X_train, X_test, y_train, y_test

In [None]:
def trainModels(X_train, y_train):
    """Fit every estimator in the notebook-level ``models`` dict on the training data.

    Each estimator is fitted in place (so ``models`` holds the fitted objects
    afterwards) and a confirmation line is printed per model. Returns nothing.
    """
    for label in models:
        estimator = models[label]
        estimator.fit(X_train, y_train)
        status = label + " trained."
        print(status)

In [None]:
def testModels(X_test, y_test):
    for name, model in models.items():
        print(name + " Accuracy: {:.5f}".format(model.score(X_test, y_test)))
                     

# models for math score

In [None]:
# Math score as target (label-encoded features): split, fit all models, report test scores.
X_train, X_test, y_train, y_test = tr_ts_split(X=X1, y=y1)
trainModels(X_train=X_train, y_train=y_train)
print(end="\n\n")
testModels(X_test=X_test, y_test=y_test)

# models for reading score

In [None]:
# Reading score as target (label-encoded features): split, fit all models, report test scores.
X_train, X_test, y_train, y_test = tr_ts_split(X=X2, y=y2)
trainModels(X_train=X_train, y_train=y_train)
print(end="\n\n")
testModels(X_test=X_test, y_test=y_test)

# models for writing score

In [None]:
# Writing score as target (label-encoded features): split, fit all models, report test scores.
X_train, X_test, y_train, y_test = tr_ts_split(X=X3, y=y3)
trainModels(X_train=X_train, y_train=y_train)
print(end="\n\n")
testModels(X_test=X_test, y_test=y_test)

# We can also one-hot encode the race and lunch cols to see if that gives better results

In [None]:
# Start again from the raw (un-encoded) frame, keeping race/ethnicity, lunch and the scores.
df3 = df.drop(columns=["gender", "parental level of education", "test preparation course"])
df3.head()

In [None]:
def oneHotEncode(df, col):
    """Replace column ``col`` with one indicator column per distinct value.

    The indicator columns are named after the values themselves (no prefix) and
    are appended after the existing columns; ``col`` is then removed. The input
    frame is left unmodified and a new frame is returned.
    """
    indicators = pd.get_dummies(df[col])
    widened = pd.concat([df, indicators], axis=1)
    return widened.drop(columns=col)

In [None]:
# One-hot encode the two remaining categorical columns, one after the other.
for categorical_col in ("race/ethnicity", "lunch"):
    df3 = oneHotEncode(df3, categorical_col)

In [None]:
# Preview the one-hot encoded frame.
df3.head(n=5)

In [None]:
# Rebuild the (features, target) pairs, this time from the one-hot encoded frame.
(X1, y1), (X2, y2), (X3, y3) = (
    splitData(df3, target)
    for target in ("math score", "reading score", "writing score")
)

# math model

In [None]:
# Math score as target (one-hot encoded features): split, fit all models, report test scores.
X_train, X_test, y_train, y_test = tr_ts_split(X=X1, y=y1)
trainModels(X_train=X_train, y_train=y_train)
print(end="\n\n")
testModels(X_test=X_test, y_test=y_test)

# reading model

In [None]:
# Reading score as target (one-hot encoded features): split, fit all models, report test scores.
X_train, X_test, y_train, y_test = tr_ts_split(X=X2, y=y2)
trainModels(X_train=X_train, y_train=y_train)
print(end="\n\n")
testModels(X_test=X_test, y_test=y_test)

# writing model

In [None]:
# Writing score as target (one-hot encoded features): split, fit all models, report test scores.
X_train, X_test, y_train, y_test = tr_ts_split(X=X3, y=y3)
trainModels(X_train=X_train, y_train=y_train)
print(end="\n\n")
testModels(X_test=X_test, y_test=y_test)

# Reading and writing scores can predict each other very well but are not so good for predicting math score