In [104]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score

import os

import warnings
warnings.filterwarnings('ignore')

# 1. Getting the data

The used dataset can be found here: https://www.kaggle.com/spscientist/students-performance-in-exams .

In [105]:
# Load in DataFrame from Kaggle
df = pd.read_csv("StudentsPerformance.csv")

In [106]:
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [107]:
df.shape

(1000, 8)

## Datacleaning

The replacement of the string values by numerical values was taken from https://www.kaggle.com/suyashpratapsingh/eda-prediction-of-student-performance-in-exams .

In [108]:
# Add column with average grade
grades = df.loc[: , "math score":"writing score"]
df['avg'] = grades.mean(axis=1)

dfc = df

# Replace string values with numeric values
dfc['gender'].replace({'male':'0','female':'1'},inplace=True)
dfc['race/ethnicity'].replace({'group A':'1','group B':'2', 'group C':'3',
                               'group D':'4','group E':'5'},inplace=True)
dfc['lunch'].replace({'free/reduced':'0','standard':'1'},inplace=True)
dfc['test preparation course'].replace({'none':'0','completed':'1'},inplace=True)
dfc['parental level of education'].replace({'some high school':'1','high school':'1',"associate's degree":'2',
                                        'some college':'3',"bachelor's degree":'4',"master's degree":'5'},inplace=True)

dfc.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,avg
0,1,2,4,1,0,72,72,74,72.666667
1,1,3,3,1,1,69,90,88,82.333333
2,1,2,5,1,0,90,95,93,92.666667
3,0,1,2,0,0,47,57,44,49.333333
4,0,3,3,1,0,76,78,75,76.333333


# 2 Linear regression model

In [109]:
# Training data and labels
X = dfc.drop(['writing score', 'avg'], axis=1) #Independent variable 
y = dfc['writing score'] #dependent variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=23)

In [119]:
# Linear regression model
lin_reg = LinearRegression()
lin_reg.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [111]:
# Predict values
y_pred = lin_reg.predict(X_test)

In [112]:
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print("R_square score: ", r2_score(y_test,y_pred))
print('Coefficients: \n', lin_reg.coef_)

MAE: 2.7920136785114353
MSE: 12.457574123776919
RMSE: 3.5295288812781966
R_square score:  0.9494693054074756
Coefficients: 
 [5.87869614 0.14808674 0.85929315 0.49358045 3.56035003 0.27341947
 0.66476891]


# 3. Dataset description

In [113]:
df_train = X_train
df_train['grade'] = y_train

df_test = X_test
df_test['grade'] = y_test

## Average grades of different groups in dataset

Below, the average writing grades of each group in both the train and the test set are calculated.

In [114]:
A_train = np.mean(df_train.loc[df_train['race/ethnicity'] == '1']['grade'])
B_train = np.mean(df_train.loc[df_train['race/ethnicity'] == '2']['grade'])
C_train = np.mean(df_train.loc[df_train['race/ethnicity'] == '3']['grade'])
D_train = np.mean(df_train.loc[df_train['race/ethnicity'] == '4']['grade'])
E_train = np.mean(df_train.loc[df_train['race/ethnicity'] == '5']['grade'])

F_train = np.mean(df_train.loc[df_train['gender'] == '1']['grade'])
M_train = np.mean(df_train.loc[df_train['gender'] == '0']['grade'])

FL_train = np.mean(df_train.loc[df_train['lunch'] == '0']['grade'])
SL_train = np.mean(df_train.loc[df_train['lunch'] == '1']['grade'])

A_test = np.mean(df_test.loc[df_test['race/ethnicity'] == '1']['grade'])
B_test = np.mean(df_test.loc[df_test['race/ethnicity'] == '2']['grade'])
C_test = np.mean(df_test.loc[df_test['race/ethnicity'] == '3']['grade'])
D_test = np.mean(df_test.loc[df_test['race/ethnicity'] == '4']['grade'])
E_test = np.mean(df_test.loc[df_test['race/ethnicity'] == '5']['grade'])

F_test = np.mean(df_test.loc[df_test['gender'] == '1']['grade'])
M_test = np.mean(df_test.loc[df_test['gender'] == '0']['grade'])

FL_test = np.mean(df_test.loc[df_test['lunch'] == '0']['grade'])
SL_test = np.mean(df_test.loc[df_test['lunch'] == '1']['grade'])

print("Group A train: {}, test: {}".format(round(A_train,2), round(A_test,2)))
print("Group B train: {}, test: {}".format(round(B_train,2), round(B_test,2)))
print("Group C train: {}, test: {}".format(round(C_train,2), round(C_test,2)))
print("Group D train: {}, test: {}".format(round(D_train,2), round(D_test,2)))
print("Group E train: {}, test: {}".format(round(E_train,2), round(E_test,2)))

print("\nFemale train: {}, test: {}".format(round(F_train,2), round(F_test,2)))
print("Male train: {}, test: {}".format(round(M_train,2), round(M_test,2)))

print("\nFemale train: {}, test: {}".format(round(F_train,2), round(F_test,2)))
print("Male train: {}, test: {}".format(round(M_train,2), round(M_test,2)))
print("\nFemale train: {}, test: {}".format(round(F_train,2), round(F_test,2)))
print("Male train: {}, test: {}".format(round(M_train,2), round(M_test,2)))

print("\nFree/reduced lunch: {}, test: {}".format(round(FL_train,2), round(FL_test,2)))
print("Standard lunch train: {}, test: {}".format(round(SL_train,2), round(SL_test,2)))

Group A train: 61.93, test: 64.13
Group B train: 65.84, test: 65.04
Group C train: 68.64, test: 65.85
Group D train: 71.25, test: 67.64
Group E train: 71.76, test: 70.53

Female train: 72.58, test: 72.17
Male train: 64.2, test: 61.43

Female train: 72.58, test: 72.17
Male train: 64.2, test: 61.43

Female train: 72.58, test: 72.17
Male train: 64.2, test: 61.43

Free/reduced lunch: 63.6, test: 61.5
Standard lunch train: 71.6, test: 69.11


Below, the average writing grades of men and women in the preperation and no preperation set are calculated.

In [115]:
FP_train = np.mean(df_train.loc[(df_train['gender'] == '1') & (df_train['test preparation course'] == '0')]['grade'])
MP_train = np.mean(df_train.loc[(df_train['gender'] == '0') & (df_train['test preparation course'] == '0')]['grade'])
FP_test = np.mean(df_test.loc[(df_test['gender'] == '1') & (df_test['test preparation course'] == '0')]['grade'])
MP_test = np.mean(df_test.loc[(df_test['gender'] == '0') & (df_test['test preparation course'] == '0')]['grade'])


FNP_train = np.mean(df_train.loc[(df_train['gender'] == '1') & (df_train['test preparation course'] == '1')]['grade'])
MNP_train = np.mean(df_train.loc[(df_train['gender'] == '0') & (df_train['test preparation course'] == '1')]['grade'])
FNP_test = np.mean(df_test.loc[(df_test['gender'] == '1') & (df_test['test preparation course'] == '1')]['grade'])
MNP_test = np.mean(df_test.loc[(df_test['gender'] == '0') & (df_test['test preparation course'] == '1')]['grade'])

print("Preperation women train: {} test: {}".format(round(FP_train,2), round(FP_test,2)))
print("Preperation men train: {} test: {}".format(round(MP_train,2), round(MP_test,2)))
print("No preperation women train: {} test: {}".format(round(FNP_train,2), round(FNP_test,2)))
print("No preperation men train: {} test: {}".format(round(MNP_train,2), round(MNP_test,2)))

Preperation women train: 69.0 test: 68.92
Preperation men train: 60.69 test: 57.45
No preperation women train: 79.11 test: 77.98
No preperation men train: 70.42 test: 68.46


## Number of students per group

Below, the number of students witihin each group is calculated.

In [116]:
# Seperate dataframes for each ethnic group
groupA_df = df_res.loc[df_res['race/ethnicity'] == '1']
groupB_df = df_res.loc[df_res['race/ethnicity'] == '2']
groupC_df = df_res.loc[df_res['race/ethnicity'] == '3']
groupD_df = df_res.loc[df_res['race/ethnicity'] == '4']
groupE_df = df_res.loc[df_res['race/ethnicity'] == '5']

# Seperate dataframes for each gender
W_df = df_res.loc[df_res['gender'] == '1']
M_df = df_res.loc[df_res['gender'] == '0']

# Seperate dataframes for each lunch category
FL_df = df_res.loc[df_res['lunch'] == '0']
SL_df = df_res.loc[df_res['lunch'] == '1']

print('Number of people train set:')
print('Total:', len(X_train))

print('Group A:', len(X_train.loc[X_train['race/ethnicity'] == '1']))
print('Group B:', len(X_train.loc[X_train['race/ethnicity'] == '2']))
print('Group C:', len(X_train.loc[X_train['race/ethnicity'] == '3']))
print('Group D:', len(X_train.loc[X_train['race/ethnicity'] == '4']))
print('Group E:', len(X_train.loc[X_train['race/ethnicity'] == '5']))

print('Women:', len(X_train.loc[X_train['gender'] == '1']))
print('Men:', len(X_train.loc[X_train['gender'] == '0']))

print('Free/reduced lunch:', len(X_train.loc[X_train['lunch'] == '0']))
print('Standard lunch:', len(X_train.loc[X_train['lunch'] == '1']))


print('\nNumber of people test set:')
print('Total:', len(df_res))
print('Group A:', len(groupA_df))
print('Group B:', len(groupB_df))
print('Group C:', len(groupC_df))
print('Group D:', len(groupD_df))
print('Group E:', len(groupE_df))

print('Women:', len(W_df))
print('Men:', len(M_df))

print('Free/reduced lunch:', len(FL_df))
print('Standard lunch:', len(SL_df))

Number of people train set:
Total: 700
Group A: 59
Group B: 133
Group C: 226
Group D: 182
Group E: 100
Women: 373
Men: 327
Free/reduced lunch: 257
Standard lunch: 443

Number of people test set:
Total: 300
Group A: 30
Group B: 57
Group C: 93
Group D: 80
Group E: 40
Women: 145
Men: 155
Free/reduced lunch: 98
Standard lunch: 202


# 4. Analysis of prediction results
Below, a dataframe containing more comparison metrics is constructed.

In [117]:
# Dataframe containing results
df_res = X_test
df_res['prediction'] = y_pred
indeces = list(df_res.index)
df_res['writing score'] = dfc.loc[indeces]['writing score']
df_res['tot diff'] = df_res['prediction'] - df_res['writing score']
df_res['abs diff'] = abs(df_res['tot diff'])

df_res

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,grade,prediction,writing score,tot diff,abs diff
519,1,2,1,0,1,67,78,79,77.818667,79,-1.181333,1.181333
837,1,1,1,1,1,75,82,79,83.010592,79,4.010592,4.010592
208,1,2,3,0,0,74,81,76,79.885146,76,3.885146,3.885146
525,0,5,3,1,0,68,60,59,59.343627,59,0.343627,0.343627
978,0,4,1,1,1,55,41,48,44.852242,48,-3.147758,3.147758
...,...,...,...,...,...,...,...,...,...,...,...,...
91,0,3,1,0,0,27,34,36,28.341097,36,-7.658903,7.658903
114,1,5,4,1,1,99,100,100,104.708726,100,4.708726,4.708726
948,0,2,1,0,1,49,50,52,48.404891,52,-3.595109,3.595109
229,1,3,3,1,1,88,95,94,97.221801,94,3.221801,3.221801


Calculating how many students received a higher grade.

In [96]:
pos_tot = len(df_res.loc[df_res['tot diff'] >= 0])/len(df_res)
print('Total percentage that received higher score:', pos_tot)

Total percentage that received higher score: 0.5366666666666666
Total percentage that received higher letter score: 0.11333333333333333


## Seperate dataframes for preperation and no preperation

Seperate dataframes for the students who performed course preperation and who performed no course preperation.

In [101]:
# Dataframes where test preparation was done for the course
P_groupA_df = groupA_df.loc[groupA_df['test preparation course'] == '1']
P_groupB_df = groupB_df.loc[groupB_df['test preparation course'] == '1']
P_groupC_df = groupC_df.loc[groupC_df['test preparation course'] == '1']
P_groupD_df = groupD_df.loc[groupD_df['test preparation course'] == '1']
P_groupE_df = groupE_df.loc[groupE_df['test preparation course'] == '1']

P_W_df = W_df.loc[W_df['test preparation course'] == '1']
P_M_df = M_df.loc[M_df['test preparation course'] == '1']

P_FL_df = FL_df.loc[FL_df['test preparation course'] == '1']
P_SL_df = SL_df.loc[SL_df['test preparation course'] == '1']

# Dataframes where NO test preparation was done for the course
NP_groupA_df = groupA_df.loc[groupA_df['test preparation course'] == '0']
NP_groupB_df = groupB_df.loc[groupB_df['test preparation course'] == '0']
NP_groupC_df = groupC_df.loc[groupC_df['test preparation course'] == '0']
NP_groupD_df = groupD_df.loc[groupD_df['test preparation course'] == '0']
NP_groupE_df = groupE_df.loc[groupE_df['test preparation course'] == '0']

NP_W_df = W_df.loc[W_df['test preparation course'] == '0']
NP_M_df = M_df.loc[M_df['test preparation course'] == '0']

NP_FL_df = FL_df.loc[FL_df['test preparation course'] == '0']
NP_SL_df = SL_df.loc[SL_df['test preparation course'] == '0']

## Evaluation 1: total results

Evaulation of the total results (preperation and no preperation). <br>
Calculates per group the mean absolute difference and the mean total difference between predicted and actual grade, and the fraction of cases where the predicted grade was higher than the actual grade.

In [102]:
# Evaluation of the results
print("Group A average error total: {}, absolute: {}".format(round(np.mean(groupA_df['tot diff']),2), round(np.mean(groupA_df['abs diff']),2)))
print("Group B average error total: {}, absolute: {}".format(round(np.mean(groupB_df['tot diff']),2), round(np.mean(groupB_df['abs diff']),2)))
print("Group C average error total: {}, absolute: {}".format(round(np.mean(groupC_df['tot diff']),2), round(np.mean(groupC_df['abs diff']),2)))
print("Group D average error total: {}, absolute: {}".format(round(np.mean(groupD_df['tot diff']),2), round(np.mean(groupD_df['abs diff']),2)))
print("Group E average error total: {}, absolute: {}".format(round(np.mean(groupE_df['tot diff']),2), round(np.mean(groupE_df['abs diff']),2)))

print("\nWomen average error total: {}, absolute: {}".format(round(np.mean(W_df['tot diff']),2), round(np.mean(W_df['abs diff']),2)))
print("Men average error total: {}, absolute: {}".format(round(np.mean(M_df['tot diff']),2), round(np.mean(M_df['abs diff']),2)))

print("\nFree lunch average error total: {}, absolute: {}".format(round(np.mean(FL_df['tot diff']),2), round(np.mean(FL_df['abs diff']),2)))
print("Standard lunch average error total: {}, absolute: {}".format(round(np.mean(SL_df['tot diff']),2), round(np.mean(SL_df['abs diff']),2)))

# Calculate how many of the predictions were higher than the actual grade
pos_A = len(groupA_df.loc[groupA_df['tot diff'] >= 0])/len(groupA_df)
pos_B = len(groupB_df.loc[groupB_df['tot diff'] >= 0])/len(groupB_df)
pos_C = len(groupC_df.loc[groupC_df['tot diff'] >= 0])/len(groupC_df)
pos_D = len(groupD_df.loc[groupD_df['tot diff'] >= 0])/len(groupD_df)
pos_E = len(groupE_df.loc[groupE_df['tot diff'] >= 0])/len(groupE_df)

pos_W = len(W_df.loc[W_df['tot diff'] >= 0])/len(W_df)
pos_M = len(M_df.loc[M_df['tot diff'] >= 0])/len(M_df)

pos_FL = len(FL_df.loc[FL_df['tot diff'] >= 0])/len(FL_df)
pos_SL = len(SL_df.loc[SL_df['tot diff'] >= 0])/len(SL_df)

print("\nFractions higher prediction:")
print("Group A:", round(pos_A,2))
print("Group B:", round(pos_B,2))
print("Group C:", round(pos_C,2))
print("Group D:", round(pos_D,2))
print("Group E:", round(pos_E,2))

print("\nWomen:", round(pos_W,2))
print("Men:", round(pos_M,2))

print("\nFree lunch:", round(pos_FL,2))
print("Standard lunch:", round(pos_SL,2))

Group A average error total: -0.65, absolute: 2.57
Group B average error total: 0.62, absolute: 2.78
Group C average error total: 0.39, absolute: 3.06
Group D average error total: -0.43, absolute: 2.48
Group E average error total: 2.15, absolute: 2.98

Women average error total: 0.62, absolute: 2.95
Men average error total: 0.09, absolute: 2.64

Free lunch average error total: 0.01, absolute: 2.82
Standard lunch average error total: 0.51, absolute: 2.78

Percentages higher prediction:
Group A: 0.47
Group B: 0.54
Group C: 0.49
Group D: 0.49
Group E: 0.78

Women: 0.55
Men: 0.52

Free lunch: 0.5
Standard lunch: 0.55


## Evaluation 2: students who performed preperation
Evaluation of the students who performed course preperation. <br>
Calculates per group the mean absolute difference and the mean total difference between predicted and actual grade, and the fraction of cases where the predicted grade was higher than the actual grade.

In [124]:
# Evaluation of the results where test preparation was done for the course
print("Preperation")
print("Group A average error total: {}, absolute: {}".format(round(np.mean(P_groupA_df['tot diff']),2), round(np.mean(P_groupA_df['abs diff']),2)))
print("Group B average error total: {}, absolute: {}".format(round(np.mean(P_groupB_df['tot diff']),2), round(np.mean(P_groupB_df['abs diff']),2)))
print("Group C average error total: {}, absolute: {}".format(round(np.mean(P_groupC_df['tot diff']),2), round(np.mean(P_groupC_df['abs diff']),2)))
print("Group D average error total: {}, absolute: {}".format(round(np.mean(P_groupD_df['tot diff']),2), round(np.mean(P_groupD_df['abs diff']),2)))
print("Group E average error total: {}, absolute: {}".format(round(np.mean(P_groupE_df['tot diff']),2), round(np.mean(P_groupE_df['abs diff']),2)))

print("\nWomen average error total: {}, absolute: {}".format(round(np.mean(P_W_df['tot diff']),2), round(np.mean(P_W_df['abs diff']),2)))
print("Men average error total: {}, absolute: {}".format(round(np.mean(P_M_df['tot diff']),2), round(np.mean(P_M_df['abs diff']),2)))

print("\nFree lunch average error total: {}, absolute: {}".format(round(np.mean(P_FL_df['tot diff']),2), round(np.mean(P_FL_df['abs diff']),2)))
print("Standard lunch average error total: {}, absolute: {}".format(round(np.mean(P_SL_df['tot diff']),2), round(np.mean(P_SL_df['abs diff']),2)))

# Calculate how many of the predictions were higher than the actual grade
P_pos_A = len(P_groupA_df.loc[P_groupA_df['tot diff'] >= 0])/len(P_groupA_df)
P_pos_B = len(P_groupB_df.loc[P_groupB_df['tot diff'] >= 0])/len(P_groupB_df)
P_pos_C = len(P_groupC_df.loc[P_groupC_df['tot diff'] >= 0])/len(P_groupC_df)
P_pos_D = len(P_groupD_df.loc[P_groupD_df['tot diff'] >= 0])/len(P_groupD_df)
P_pos_E = len(P_groupE_df.loc[P_groupE_df['tot diff'] >= 0])/len(P_groupE_df)

P_pos_W = len(P_W_df.loc[P_W_df['tot diff'] >= 0])/len(P_W_df)
P_pos_M = len(P_M_df.loc[P_M_df['tot diff'] >= 0])/len(P_M_df)

P_pos_FL = len(P_FL_df.loc[P_FL_df['tot diff'] >= 0])/len(P_FL_df)
P_pos_SL = len(P_SL_df.loc[P_SL_df['tot diff'] >= 0])/len(P_SL_df)

print("\nFractions higher prediction:")
print("Group A:", round(P_pos_A,2)*100)
print("Group B:", round(P_pos_B,2)*100)
print("Group C:", round(P_pos_C,2)*100)
print("Group D:", round(P_pos_D,2)*100)
print("Group E:", round(P_pos_E,2)*100)

print("\nWomen:", round(P_pos_W,2))
print("Men:", round(P_pos_M,2))

print("\nFree lunch:", round(P_pos_FL,2))
print("Standard lunch:", round(P_pos_SL,2))

Preperation
Group A average error total: 0.29, absolute: 2.63
Group B average error total: 1.18, absolute: 3.37
Group C average error total: -0.12, absolute: 2.42
Group D average error total: -1.18, absolute: 2.76
Group E average error total: 3.15, absolute: 3.36

Women average error total: 1.09, absolute: 2.91
Men average error total: -0.18, absolute: 2.83

Free lunch average error total: 0.13, absolute: 2.93
Standard lunch average error total: 0.61, absolute: 2.83

Percentages higher prediction:
Group A: 43
Group B: 57
Group C: 41
Group D: 37
Group E: 88

Women: 0.6
Men: 0.43

Free lunch: 0.46
Standard lunch: 0.54


## Evaluation 3: students who performed NO preperation
Evaluation of the students who performed no course preperation. <br>
Calculates per group the mean absolute difference and the mean total difference between predicted and actual grade, and the fraction of cases where the predicted grade was higher than the actual grade.

In [103]:
# Evaluation of the results where NO test preparation was done for the course
print("No preperation")
print("Group A average error total: {}, absolute: {}".format(round(np.mean(NP_groupA_df['tot diff']),2), round(np.mean(NP_groupA_df['abs diff']),2)))
print("Group B average error total: {}, absolute: {}".format(round(np.mean(NP_groupB_df['tot diff']),2), round(np.mean(NP_groupB_df['abs diff']),2)))
print("Group C average error total: {}, absolute: {}".format(round(np.mean(NP_groupC_df['tot diff']),2), round(np.mean(NP_groupC_df['abs diff']),2)))
print("Group D average error total: {}, absolute: {}".format(round(np.mean(NP_groupD_df['tot diff']),2), round(np.mean(NP_groupD_df['abs diff']),2)))
print("Group E average error total: {}, absolute: {}".format(round(np.mean(NP_groupE_df['tot diff']),2), round(np.mean(NP_groupE_df['abs diff']),2)))

print("\nWomen average error total: {}, absolute: {}".format(round(np.mean(NP_W_df['tot diff']),2), round(np.mean(NP_W_df['abs diff']),2)))
print("Men average error total: {}, absolute: {}".format(round(np.mean(NP_M_df['tot diff']),2), round(np.mean(NP_M_df['abs diff']),2)))

print("\nFree lunch average error total: {}, absolute: {}".format(round(np.mean(NP_FL_df['tot diff']),2), round(np.mean(NP_FL_df['abs diff']),2)))
print("Standard lunch average error total: {}, absolute: {}".format(round(np.mean(NP_SL_df['tot diff']),2), round(np.mean(NP_SL_df['abs diff']),2)))

NP_pos_A = len(NP_groupA_df.loc[NP_groupA_df['tot diff'] >= 0])/len(NP_groupA_df)
NP_pos_B = len(NP_groupB_df.loc[NP_groupB_df['tot diff'] >= 0])/len(NP_groupB_df)
NP_pos_C = len(NP_groupC_df.loc[NP_groupC_df['tot diff'] >= 0])/len(NP_groupC_df)
NP_pos_D = len(NP_groupD_df.loc[NP_groupD_df['tot diff'] >= 0])/len(NP_groupD_df)
NP_pos_E = len(NP_groupE_df.loc[NP_groupE_df['tot diff'] >= 0])/len(NP_groupE_df)

NP_pos_W = len(NP_W_df.loc[NP_W_df['tot diff'] >= 0])/len(NP_W_df)
NP_pos_M = len(NP_M_df.loc[NP_M_df['tot diff'] >= 0])/len(NP_M_df)

NP_pos_FL = len(NP_FL_df.loc[NP_FL_df['tot diff'] >= 0])/len(NP_FL_df)
NP_pos_SL = len(NP_SL_df.loc[NP_SL_df['tot diff'] >= 0])/len(NP_SL_df)

print("\nFractions higher prediction:")
print("Group A:", round(NP_pos_A,2))
print("Group B:", round(NP_pos_B,2))
print("Group C:", round(NP_pos_C,2))
print("Group D:", round(NP_pos_D,2))
print("Group E:", round(NP_pos_E,2))

print("\nWomen:", round(NP_pos_W,2))
print("Men:", round(NP_pos_M,2))

print("\nFree lunch:", round(NP_pos_FL,2))
print("Standard lunch:", round(NP_pos_SL,2))

No preperation
Group A average error total: -0.93, absolute: 2.55
Group B average error total: 0.25, absolute: 2.37
Group C average error total: 0.68, absolute: 3.43
Group D average error total: -0.04, absolute: 2.34
Group E average error total: 1.41, absolute: 2.69

Women average error total: 0.36, absolute: 2.97
Men average error total: 0.24, absolute: 2.54

Free lunch average error total: -0.08, absolute: 2.75
Standard lunch average error total: 0.46, absolute: 2.75

Percentages higher prediction:
Group A: 0.48
Group B: 0.53
Group C: 0.54
Group D: 0.55
Group E: 0.7

Women: 0.53
Men: 0.58

Free lunch: 0.53
Standard lunch: 0.56
