In [67]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.tree import DecisionTreeRegressor

In [68]:
# Read the first dataset (math grades)
math_df = pd.read_csv('student-mat.csv', sep=';')
# Read the second dataset (portuguese grades)
portuguese_df = pd.read_csv('student-por.csv', sep=';')
# Identify the common columns for merging
common_columns = ["school", "sex", "age", "address", "famsize", "Pstatus", "Medu", "Fedu", "Mjob", "Fjob", "reason", "guardian", "nursery", "internet"]
# Merge the two DataFrames using an inner join on the common columns
merged_df = pd.merge(math_df, portuguese_df, on=common_columns, how='inner')
# Now 'merged_df' contains the combined 
merged_df.to_csv('merged_student.csv', index=False, sep=',')

math_df

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,MS,M,20,U,LE3,A,2,2,services,services,...,5,5,4,4,5,4,11,9,9,9
391,MS,M,17,U,LE3,T,3,1,services,services,...,2,4,5,3,4,2,3,14,16,16
392,MS,M,21,R,GT3,T,1,1,other,other,...,5,5,3,3,3,3,3,10,8,7
393,MS,M,18,R,LE3,T,3,2,services,other,...,4,4,1,3,4,5,0,11,12,10


In [69]:
dataset = pd.read_csv('merged_student.csv')
dataset

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel_y,freetime_y,goout_y,Dalc_y,Walc_y,health_y,absences_y,G1_y,G2_y,G3_y
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,4,0,11,11
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,2,9,11,11
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,6,12,13,12
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,0,14,14,14
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,0,11,13,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371,MS,F,18,U,LE3,T,3,1,teacher,services,...,4,3,4,1,1,1,4,15,15,16
372,MS,F,18,U,GT3,T,1,1,other,other,...,3,4,4,2,2,5,3,7,8,7
373,MS,F,18,U,GT3,T,1,1,other,other,...,1,1,1,1,1,5,6,11,12,9
374,MS,M,17,U,LE3,T,3,1,services,services,...,2,4,5,3,4,2,6,10,10,10


In [70]:
columns = [
    'studytime_x',
    'traveltime_x',
    'activities_x',
    'freetime_x',
    'goout_x',
    'absences_x',
    'G1_x',
    'G2_x',
    'G3_x',
           ]
model_df = dataset[columns].copy()
label = LabelEncoder()
label.fit(model_df.activities_x)
model_df.activities_x = label.transform(model_df.activities_x)
model_df


Unnamed: 0,studytime_x,traveltime_x,activities_x,freetime_x,goout_x,absences_x,G1_x,G2_x,G3_x
0,2,2,0,3,4,6,5,6,6
1,2,1,0,3,3,4,5,5,6
2,2,1,0,3,2,10,7,8,10
3,3,1,1,2,2,2,15,14,15
4,2,1,0,3,2,4,6,10,10
...,...,...,...,...,...,...,...,...,...
371,2,1,0,3,4,0,7,9,8
372,2,2,1,1,1,0,6,5,0
373,2,2,1,1,1,0,6,5,0
374,1,2,0,4,5,3,14,16,16


In [71]:
x = model_df.drop(['G3_x'], axis = 1)
y = model_df[['G3_x']]
x_train, x_test, y_train, y_test, = train_test_split(x.values, y.values, test_size=0.3, random_state=42)

In [72]:
linear_model = LinearRegression()
linear_model.fit(x_train, y_train)

In [73]:
linear_coef = linear_model.coef_
linear_coef

array([[-0.19143195,  0.14739375, -0.33936958,  0.1508943 ,  0.04396979,
         0.03686728,  0.14818166,  1.00786487]])

In [74]:
linear_intercept = linear_model.intercept_

In [75]:
linear_intercept[0]

np.float64(-2.511793908243847)

In [76]:
x_train

array([[ 2,  1,  1, ...,  2, 15, 15],
       [ 2,  1,  1, ..., 11, 16, 15],
       [ 1,  1,  1, ..., 10, 10,  8],
       ...,
       [ 2,  2,  0, ...,  2, 11, 11],
       [ 2,  1,  1, ...,  0, 16, 15],
       [ 3,  1,  1, ...,  0, 16, 17]], shape=(263, 8))

In [77]:
prediction = linear_model.predict(x_test)
prediction

array([[14.79173016],
       [ 4.52367506],
       [ 7.85570196],
       [12.59411937],
       [-0.99149765],
       [ 5.43179323],
       [13.25009161],
       [14.64308   ],
       [10.4248329 ],
       [13.87133637],
       [ 7.95704764],
       [13.66679655],
       [ 8.71809085],
       [ 5.25647591],
       [13.30562673],
       [12.92913886],
       [ 6.89354903],
       [14.88534603],
       [ 6.18997604],
       [ 8.52337538],
       [14.55574485],
       [16.28955907],
       [11.70892262],
       [ 5.03799299],
       [11.59171375],
       [15.13351951],
       [ 8.30991186],
       [12.47484617],
       [14.90432398],
       [ 9.7339066 ],
       [ 3.61613583],
       [ 7.53898099],
       [ 7.37274018],
       [18.52871245],
       [ 8.31206221],
       [13.07317585],
       [12.09205193],
       [17.89273551],
       [ 6.68758647],
       [10.10337432],
       [12.00131735],
       [15.12624021],
       [ 6.11399036],
       [13.0446647 ],
       [ 9.45804259],
       [13

In [78]:
x_test

array([[ 3,  1,  0,  2,  2,  0, 15, 15],
       [ 1,  1,  1,  5,  5, 14,  6,  5],
       [ 3,  1,  1,  2,  3,  4, 10,  9],
       [ 1,  1,  0,  2,  2,  8, 16, 12],
       [ 1,  1,  0,  4,  5,  0,  5,  0],
       [ 2,  1,  0,  1,  3, 18,  8,  6],
       [ 1,  1,  0,  4,  4, 12, 10, 13],
       [ 2,  1,  1,  2,  2,  4, 14, 15],
       [ 4,  2,  1,  4,  3,  8, 11, 11],
       [ 2,  1,  1,  2,  4,  4, 15, 14],
       [ 2,  4,  1,  3,  2, 14, 10,  8],
       [ 1,  2,  0,  5,  5, 10, 11, 13],
       [ 2,  1,  1,  3,  2,  0,  8, 10],
       [ 1,  2,  1,  4,  3,  5,  7,  6],
       [ 2,  1,  1,  4,  3,  9, 15, 13],
       [ 2,  1,  1,  4,  2,  4, 14, 13],
       [ 2,  1,  0,  3,  2,  0,  7,  8],
       [ 3,  1,  1,  4,  5,  4, 14, 15],
       [ 2,  1,  1,  4,  4, 15,  6,  7],
       [ 3,  2,  1,  3,  3, 10, 11,  9],
       [ 1,  1,  0,  4,  4,  4, 14, 14],
       [ 1,  2,  1,  3,  2,  0, 16, 16],
       [ 1,  1,  1,  4,  4, 18, 14, 11],
       [ 2,  2,  0,  3,  4,  6,  5,  6],
       [ 1,  1, 

In [79]:
prediction

array([[14.79173016],
       [ 4.52367506],
       [ 7.85570196],
       [12.59411937],
       [-0.99149765],
       [ 5.43179323],
       [13.25009161],
       [14.64308   ],
       [10.4248329 ],
       [13.87133637],
       [ 7.95704764],
       [13.66679655],
       [ 8.71809085],
       [ 5.25647591],
       [13.30562673],
       [12.92913886],
       [ 6.89354903],
       [14.88534603],
       [ 6.18997604],
       [ 8.52337538],
       [14.55574485],
       [16.28955907],
       [11.70892262],
       [ 5.03799299],
       [11.59171375],
       [15.13351951],
       [ 8.30991186],
       [12.47484617],
       [14.90432398],
       [ 9.7339066 ],
       [ 3.61613583],
       [ 7.53898099],
       [ 7.37274018],
       [18.52871245],
       [ 8.31206221],
       [13.07317585],
       [12.09205193],
       [17.89273551],
       [ 6.68758647],
       [10.10337432],
       [12.00131735],
       [15.12624021],
       [ 6.11399036],
       [13.0446647 ],
       [ 9.45804259],
       [13

In [80]:
y_test

array([[15],
       [ 5],
       [ 9],
       [13],
       [ 0],
       [ 7],
       [12],
       [15],
       [10],
       [14],
       [ 9],
       [13],
       [12],
       [ 7],
       [15],
       [13],
       [ 0],
       [16],
       [ 8],
       [10],
       [14],
       [15],
       [13],
       [ 6],
       [14],
       [15],
       [ 8],
       [11],
       [15],
       [10],
       [ 5],
       [10],
       [10],
       [18],
       [ 9],
       [13],
       [11],
       [18],
       [ 5],
       [11],
       [13],
       [16],
       [ 7],
       [12],
       [11],
       [13],
       [12],
       [10],
       [15],
       [13],
       [11],
       [10],
       [12],
       [15],
       [ 9],
       [ 6],
       [12],
       [ 9],
       [19],
       [ 8],
       [ 7],
       [16],
       [ 0],
       [15],
       [ 5],
       [12],
       [11],
       [ 9],
       [15],
       [11],
       [19],
       [ 5],
       [10],
       [ 8],
       [12],
       [10],
       [13],

In [81]:
linear_model.score(x,y)



0.8323832304872977

In [82]:
mean_squared_error(y_test,prediction)

2.4539869620160077

In [83]:
lasso_model = Lasso()
lasso_model.fit(x_train, y_train)

In [84]:
y_lasso = lasso_model.predict(x_test)
lasso_model.score(x, y)



0.8249477546642472

In [85]:
mean_squared_error(y_test, y_lasso)

2.486853521041876

In [86]:
x

Unnamed: 0,studytime_x,traveltime_x,activities_x,freetime_x,goout_x,absences_x,G1_x,G2_x
0,2,2,0,3,4,6,5,6
1,2,1,0,3,3,4,5,5
2,2,1,0,3,2,10,7,8
3,3,1,1,2,2,2,15,14
4,2,1,0,3,2,4,6,10
...,...,...,...,...,...,...,...,...
371,2,1,0,3,4,0,7,9
372,2,2,1,1,1,0,6,5
373,2,2,1,1,1,0,6,5
374,1,2,0,4,5,3,14,16


In [87]:
ridge_model = Ridge()
ridge_model.fit(x_train, y_train)

In [88]:
y_ridge = ridge_model.predict(x_test)
ridge_model.score(x,y)



0.832405026364195

In [89]:
mean_squared_error(y_test, y_ridge)

2.4524060524268476