In [None]:
#import necessary modules
#Python >= 3.5
import sys
assert sys.version_info >= (3, 5)

#Scikit-Learn >= 0.20
import sklearn as sk
assert sk.__version__ >= "0.20"

#numpy for calculations
import numpy as np
import os

#matplotlib for figures
import matplotlib as mpl
import matplotlib.pyplot as plt
#a magic function that allows inline plotting so figures are rendered in this notebook
%matplotlib inline

#pandas for loading and manipulating the tabular data
import pandas as pd
from pandas.plotting import scatter_matrix

#add other needed modules here
#Tensorflow?

## Setup and Feature Statistics

In [None]:
# Load the data from the csv file with pandas; header=1 takes the column names
# from the second line of the file.
scores = pd.read_csv("Complete MCAT Data.csv", header=1)

# The first "Days Before Exam" column carries no numeric suffix while the following
# ones are suffixed .1/.2/.3. Shift every suffix up by one so they run .1 to .4,
# matching the .1-.4 suffixes used for the score columns later in the notebook.
days_column_renames = {
    'Days Before Exam': 'Days Before Exam.1',
    'Days Before Exam.1': 'Days Before Exam.2',
    'Days Before Exam.2': 'Days Before Exam.3',
    'Days Before Exam.3': 'Days Before Exam.4',
}
scores = scores.rename(columns=days_column_renames)

# Show the first five rows.
scores.head()

In [None]:
# drop the columns that are not used in the rest of the notebook
scores.drop(columns=['Timestamp', 'Exam Date', 'C/P Score', 'CARS Score', 'B/B Score', 'P/S Score', 'Total Score', 'Total Score.1', 'Total Score.2', 'Total Score.3'], inplace = True)

# sanity check: take a look at the data
print('Scores data shape:', scores.shape)

# look at distribution of scores
# Only the LAST expression of a cell is rendered automatically, so this intermediate
# result has to be passed to display() explicitly or it is silently discarded.
display(scores['Real Score'].value_counts())

# gives count, mean, standard deviation, min, max, and percentiles (including median)
scores.describe(percentiles = [0.25, 0.5, 0.75], include = 'all')

In [None]:
# plot some data statistics for the report
# for report: (state the number of datapoints, briefly describe the dataset)

## Visualize and Handle Missing Data

In [None]:
import missingno as msno


def missing_values_table(df):
    """Summarise the missing data of a DataFrame, one row per affected column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with the columns "Missing Values" (count of
        nulls) and "% of Total Values" (share of rows that are null, rounded
        to one decimal). Only columns with at least one missing value are
        listed, sorted from most to least affected.
    """
    n_missing = df.isnull().sum()
    n_rows = len(df)
    # guard against an empty frame: avoid a 0/0 division and report 0 % instead
    pct_missing = 100 * n_missing / n_rows if n_rows else n_missing.astype(float)
    table = pd.DataFrame({"Missing Values": n_missing,
                          "% of Total Values": pct_missing})
    table = table[table["Missing Values"] > 0]
    return table.sort_values("% of Total Values", ascending=False).round(1)


# look at how many values are missing in each dataframe column
# (display() is required because this is not the last expression of the cell)
display(missing_values_table(scores))

# visualize missing data with Missingno
msno.bar(scores)
msno.matrix(scores)

# see if there is a reason for missing data
msno.heatmap(scores)
msno.dendrogram(scores)

In [None]:
# For linear regression, keep only the rows in which every column is filled in,
# then cast all remaining columns to integers in a single call.
scores = scores.dropna(axis=0).astype(int)

In [None]:
# Summary of the frame: number of columns, column labels and dtypes, non-null
# count per column, range index and memory usage.
scores.info()

# First five rows of the data.
scores.head(5)

In [None]:
#for isnegative in (scores > 0).all(1)[i]:
#        if isnegative...

## Clean Data and Add New Features

In [None]:
# Drop the rows that contain impossible values.
# Filters implemented below:
#   - 'Real Score' outside [MIN_TOTAL_SCORE, MAX_TOTAL_SCORE]
#   - any 'Days Before Exam.N' above MAX_DAYS_BEFORE_EXAM
#   - any section score 'X Score.N' above MAX_SECTION_SCORE
# TODO: the original notes also list negative values and exact duplicate rows;
#       neither is filtered here yet.
# NOTE(review): the original notes mention "days before exam > 180" while the code
#       has always used 200 - confirm which cutoff is intended.
MIN_TOTAL_SCORE = 472
MAX_TOTAL_SCORE = 528
MAX_SECTION_SCORE = 132
MAX_DAYS_BEFORE_EXAM = 200
SECTIONS = ('C/P', 'CARS', 'B/B', 'P/S')

# boolean mask, True for every row that violates at least one rule
invalid = (scores['Real Score'] > MAX_TOTAL_SCORE) | (scores['Real Score'] < MIN_TOTAL_SCORE)
for i in range(1, 5):
    invalid |= scores['Days Before Exam.' + str(i)] > MAX_DAYS_BEFORE_EXAM
    for section in SECTIONS:
        invalid |= scores[section + ' Score.' + str(i)] > MAX_SECTION_SCORE

# index labels of the offending rows (a mask yields each row once, so no de-duplication is needed)
remove_indices = scores.index[invalid].tolist()

# delete all rows with impossible values in a single call instead of one drop per row
scores.drop(index=remove_indices, inplace=True)

scores.info()
scores.head()

# add custom attribute: variance between test scores (if multiple test scores)
# NOTE: make sure variance is calculated between different tests, not between subsections of the same test

## Create a Test Set

In [None]:
# define features and labels from the observations
# Every column after the first one is a feature. The previous upper bound was
# len(scores) (the number of ROWS), which only worked because slicing clips at
# the end; it would silently truncate the features if there were fewer rows
# than columns.
# NOTE(review): assumes 'Real Score' is the first column of `scores` - confirm.
features = scores.columns[1:]
# selecting a list of columns already gives a 2-D (n_samples, n_features) array
X = scores[features].to_numpy()
y = scores["Real Score"].to_numpy()
print(X.shape,y.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20 % of the observations for testing; the fixed random_state makes
# the shuffled split identical on every run.
split_options = dict(test_size=0.20, shuffle=True, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# sanity check: number of data points in the training set, then in the test set
for split in (X_train, X_test):
    print(split.shape[0])

In [None]:
#check the distributions of the training and testing sets
# training targets; title and axis labels added so the figure can be told apart
# from the test-set histogram in the next cell
fig, ax = plt.subplots()
ax.hist(y_train)
ax.set(title="Real Score distribution (training set)", xlabel="Real Score", ylabel="Count")
plt.show()

In [None]:
# test targets; title and axis labels added so the figure can be told apart
# from the training-set histogram
fig, ax = plt.subplots()
ax.hist(y_test)
ax.set(title="Real Score distribution (test set)", xlabel="Real Score", ylabel="Count")
plt.show()

In [None]:
#sanity check: preview test and training data
# The split above produces X_test (there is no X_val), and only the first rows
# are printed instead of dumping the whole arrays.
print(X_train[:5])
print(X_test[:5])

In [None]:
#from sklearn.model_selection import StratifiedShuffleSplit
#perform stratified shufflesplit cross-validator
#https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedShuffleSplit.html
#compares random and stratified error

## Visualize Correlations

In [None]:
from pandas.plotting import scatter_matrix

# create a correlation matrix with pandas
corr_matrix = scores.corr()
# correlation of every column with the target, strongest positive first;
# display() is needed because this is not the last expression of the cell
display(corr_matrix['Real Score'].sort_values(ascending=False))

# pairwise scatter plots of the real score against the four CARS scores
attributes = ['Real Score', 'CARS Score.1', 'CARS Score.2', 'CARS Score.3', 'CARS Score.4']
scatter_matrix(scores[attributes], figsize=(12, 8))

# Linear Regression

## Train the model

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score

# Linear regression on the full (non-reduced) training features;
# fit() returns the estimator itself, so creation and training can be chained.
lin_reg0 = LinearRegression().fit(X_train, y_train)
lin_reg0

## Validate the Model

In [None]:
# k-fold cross validation

## Loss Function and Accuracy

In [None]:
# mean squared error
def lin_error(y_val, y_pred):
    """Print the mean squared error and root mean squared error of predictions.

    The MSE is computed directly with numpy rather than through
    ``sklearn.metrics.mean_squared_error(..., squared=True)``: the ``squared``
    keyword is deprecated in scikit-learn 1.4 and removed in 1.6, so the
    previous call fails on current releases.

    Parameters
    ----------
    y_val : array-like
        True target values.
    y_pred : array-like
        Predicted values; must have the same length as ``y_val``.

    Returns
    -------
    None
        The two metrics are printed, not returned.

    Raises
    ------
    AssertionError
        If ``y_pred`` and ``y_val`` differ in length.
    """
    import math
    assert len(y_pred) == len(y_val), "the length of y_pred is incorrect"
    # cast to float so the squared residuals of integer inputs cannot overflow
    residuals = np.asarray(y_val, dtype=float) - np.asarray(y_pred, dtype=float)
    lin_mse = float(np.mean(residuals ** 2))
    lin_rmse = math.sqrt(lin_mse)
    print("mean squared error: ", lin_mse)
    print("root mean squared error: ", lin_rmse)

In [None]:
# Evaluate the full-feature model on the held-out split. The split cell produces
# X_test / y_test; X_val / y_val were never defined and raised a NameError.
y_pred = lin_reg0.predict(X_test)

# score on the held-out set (not the training set)
# NOTE: LinearRegression.score returns the coefficient of determination R^2,
# which is a goodness-of-fit measure rather than a classification accuracy.
accuracy = lin_reg0.score(X_test, y_test)

print("accuracy of LinReg : ", accuracy)

assert len(y_pred) == len(y_test), "the length of y_pred is incorrect"

lin_error(y_test, y_pred)

# Linear Regression with Dimensionality Reduction

## PCA to Create Linear Combinations of Features

In [None]:
from sklearn.decomposition import PCA

# This code is from Assignment 2

# fit the PCA on the training data only
N = 8
pca = PCA(n_components=N)
X_train_reduced = pca.fit_transform(X_train)
# Project the held-out data with the components learned on the training set.
# Calling fit_transform here would refit the PCA on the held-out data, giving
# components that differ from the ones the regression is trained on (and
# overwriting the explained-variance ratios plotted below).
# The held-out split is X_test; X_val was never defined.
X_val_reduced = pca.transform(X_test)

# plot the explained variances (of the PCA fitted on the training set)
fig, ax1 = plt.subplots(figsize=(12, 5))
color = 'tab:blue'
ax1.bar(1+np.arange(N), pca.explained_variance_ratio_, color=color)
ax1.set_xticks(1+np.arange(N, step=2))
ax1.tick_params(axis='y', labelcolor=color)
ax1.set_ylabel("Explained variance ratio", color=color)
ax1.set_xlabel("Generated feature")

# second y-axis sharing the same x-axis for the cumulative curve
ax2 = ax1.twinx()
color = 'tab:red'
ax2.tick_params(axis='y', labelcolor=color)
ax2.plot(1+np.arange(N), np.cumsum(pca.explained_variance_ratio_), color=color)
ax2.set_ylabel("Cumulative explained variance ratio", color=color)
fig.tight_layout()
plt.show()

## Training and Evaluating on the Training Set

In [None]:
# Linear regression trained on the PCA-reduced training features;
# fit() returns the estimator itself, so creation and training can be chained.
lin_reg1 = LinearRegression().fit(X_train_reduced, y_train) #solver = ?
lin_reg1

## Validate the Model

## Loss Function and Accuracy

In [None]:
# Evaluate the model trained on the PCA-reduced features. It is named lin_reg1;
# `lin_reg` is only created in a later cell, so using it here fails on a fresh
# kernel. The held-out labels are y_test (y_val was never defined).
# NOTE: score() returns R^2, reported below as a percentage.
lin_accuracy = lin_reg1.score(X_val_reduced, y_test)
print(f"Prediction accuracy: {100*lin_accuracy:.2f}%")

y_pred = lin_reg1.predict(X_val_reduced)

# loss function
lin_error(y_test, y_pred)

# Compare Linear Regression with and without Dimensionality Reduction

In [None]:
# initialize a linear regression object and train it on the PCA-reduced features
lin_reg = LinearRegression() #solver = ?
lin_reg.fit(X_train_reduced, y_train)

# R^2 on the held-out split; its labels are y_test (y_val was never defined)
lin_accuracy = lin_reg.score(X_val_reduced, y_test)

# sanity check: test out predictions and compare to labels
#print(lin_reg.predict(X_train_reduced))
#print(list(y_train))

print(f"Prediction accuracy: {100*lin_accuracy:.2f}%")

In [None]:
# loss function
from sklearn.metrics import mean_squared_error

y_pred = lin_reg.predict(X_val_reduced)

# the held-out labels are y_test (y_val was never defined)
assert len(y_pred) == len(y_test), "the length of y_pred is incorrect"

# RMSE as the square root of the MSE: the `squared=False` keyword is deprecated
# in scikit-learn 1.4 and removed in 1.6, whereas this form works on every version.
lin_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
lin_rmse

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the reduced-feature model on the training data.
# The data arguments were left as `###` placeholders, which comment out the rest
# of the line and make the call fail for lack of X and y.
# NOTE(review): the PCA was fitted on the whole training set, so the folds share
# that preprocessing; wrap PCA + regression in a Pipeline for a stricter estimate.
lin_scores = cross_val_score(lin_reg, X_train_reduced, y_train,
                             scoring="neg_mean_squared_error", cv=10)
# scores are negated MSEs, so flip the sign before taking the square root
lin_rmse_scores = np.sqrt(-lin_scores)

def display_scores(scores):
    """Print a set of scores together with their mean and standard deviation.

    Parameters
    ----------
    scores : numpy.ndarray
        Scores to summarise; must provide ``mean()`` and ``std()``.
    """
    # print the argument itself, not the global `lin_scores`
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())
    
# print summary statistics for the cross-validation RMSE scores
display_scores(lin_rmse_scores)

## Data visualisation

In [None]:
#Display what pca 1 is
print(X.shape)

# The principal axes are the eigenvectors of the scatter matrix of the CENTRED
# data; without centring, the leading eigenvector mostly points at the mean.
X_centered = X - X.mean(axis=0)
# eigh is the solver for symmetric matrices: it returns real eigenvalues and
# orthonormal eigenvectors
D, E = np.linalg.eigh(X_centered.T @ X_centered)
z = list(scores.columns[1:])

# Eigenvectors are the COLUMNS of E. Pick the one with the largest eigenvalue
# instead of relying on a hard-coded position.
top = int(np.argmax(D))

fig, ax = plt.subplots(figsize= (8,4))
ax.bar(z, E[:, top])

ax.set_ylabel('PCA 1 dimension')
ax.set_title('PCA 1 in fonction of the features')
# rotate the feature names so they stay readable
for label in ax.get_xticklabels(which='major'):
    label.set(rotation=30, horizontalalignment='right',fontsize=8)

plt.show()

In [None]:
# Training set projected on the first two principal components,
# coloured by the real exam score.
fig, ax = plt.subplots(figsize=(10, 8))
# integer labels; each value is mapped to a colour by the default colormap
colors = [int(x) for x in y_train]
sc = ax.scatter(X_train_reduced[:, 0], X_train_reduced[:, 1], c=colors)#,s=1 cmap='tab10')
ax.set_xlabel("PCA1")
ax.set_ylabel("PCA2")
# the colours encode the real exam score ("digit" was left over from another assignment)
ax.legend(*sc.legend_elements(), title='Real Score')
plt.show()


In [None]:
#3D plotting
# Training set projected on the first three principal components,
# coloured by the real exam score.
fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(projection='3d')
# Build the colours and keep this plot's own scatter handle so the cell does not
# depend on `colors` / `sc` left behind by the previous cell.
colors = [int(x) for x in y_train]
sc3d = ax.scatter(X_train_reduced[:, 0], X_train_reduced[:, 1], X_train_reduced[:,2], c=colors)
ax.set_xlabel('PCA 1')
ax.set_ylabel('PCA 2')
ax.set_zlabel('PCA 3')
# the colours encode the real exam score ("digit" was left over from another assignment)
ax.legend(*sc3d.legend_elements(), title='Real Score',loc="upper center")
plt.show()


In [None]:
#3D plotting
# Real exam score (z-axis) against the first two principal components of the training set.
fig, ax = plt.subplots(figsize=(7, 7), subplot_kw={'projection': '3d'})
pc1, pc2 = X_train_reduced[:, 0], X_train_reduced[:, 1]
ax.scatter(pc1, pc2, y_train)#, c=colors)
ax.set(xlabel='PCA 1', ylabel='PCA 2', zlabel="Final score")
#ax.legend(*sc.legend_elements(), title='digit',loc="upper center")
ax.set_title("Final scores at exam of the training set in function of PCA1 and PCA2 parameters")
plt.show()


In [None]:
# Real exam score against the first principal component of the training set.
fig, ax = plt.subplots(figsize=(6, 6))
# integer version of the labels (kept for the optional colouring that is commented out below)
colors = list(map(int, y_train))
sc = ax.scatter(X_train_reduced[:, 0], y_train)#, c=colors)#,s=1 cmap='tab10')
ax.set_xlabel("PCA1")
ax.set_ylabel("Final score")
#ax.legend(*sc.legend_elements(), title='digit')
ax.set_title("Final scores at exam of the training set in function of PCA1 parameter")
plt.show()
