In [60]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# Fetch dataset
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
student_performance = fetch_ucirepo(id=320)

# data (as pandas dataframes) 
x = student_performance.data.features
y = student_performance.data.targets

print(x.columns)


Index(['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
       'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
       'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
       'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
       'Walc', 'health', 'absences'],
      dtype='object')


In [61]:
type(student_performance)

ucimlrepo.dotdict.dotdict

In [62]:
print(student_performance.keys())

dict_keys(['data', 'metadata', 'variables'])


In [63]:
data = student_performance['data']
print(type(data))
print(data)  # Print the entire object or a slice if it's too large

<class 'ucimlrepo.dotdict.dotdict'>
{'ids': None, 'features':     school sex  age address famsize Pstatus  Medu  Fedu      Mjob      Fjob  \
0       GP   F   18       U     GT3       A     4     4   at_home   teacher   
1       GP   F   17       U     GT3       T     1     1   at_home     other   
2       GP   F   15       U     LE3       T     1     1   at_home     other   
3       GP   F   15       U     GT3       T     4     2    health  services   
4       GP   F   16       U     GT3       T     3     3     other     other   
..     ...  ..  ...     ...     ...     ...   ...   ...       ...       ...   
644     MS   F   19       R     GT3       T     2     3  services     other   
645     MS   F   18       U     LE3       T     3     1   teacher  services   
646     MS   F   18       U     GT3       T     1     1     other     other   
647     MS   M   17       U     LE3       T     3     1  services  services   
648     MS   M   18       R     LE3       T     3     2  services    

In [64]:
df = pd.DataFrame(student_performance['data']['features'])
df.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,yes,no,no,4,3,4,1,1,3,4
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,yes,yes,no,5,3,3,1,1,3,2
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,yes,yes,no,4,3,2,2,3,3,6
3,GP,F,15,U,GT3,T,4,2,health,services,...,yes,yes,yes,3,2,2,1,1,5,0
4,GP,F,16,U,GT3,T,3,3,other,other,...,yes,no,no,4,3,2,1,2,5,0


In [72]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Identify categorical columns
categorical_cols = df.select_dtypes(include=['object', 'category']).columns

# Initialize LabelEncoder
le = LabelEncoder()

# Apply Label Encoding to each categorical column
for col in categorical_cols:
    df[col] = le.fit_transform(df[col])

# le = LabelEncoder()
# df['famsize'] = le.fit_transform(df['famsize'])
df.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences
0,0,0,18,1,0,0,4,4,0,4,...,1,0,0,4,3,4,1,1,3,4
1,0,0,17,1,0,1,1,1,0,2,...,1,1,0,5,3,3,1,1,3,2
2,0,0,15,1,1,1,1,1,0,2,...,1,1,0,4,3,2,2,3,3,6
3,0,0,15,1,0,1,4,2,1,3,...,1,1,1,3,2,2,1,1,5,0
4,0,0,16,1,0,1,3,3,2,2,...,1,0,0,4,3,2,1,2,5,0


In [75]:
df.corr()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences
school,1.0,-0.08305,0.08717,-0.35452,0.022252,0.02812,-0.254787,-0.209806,-0.206829,-0.081872,...,-0.136112,-0.240486,0.072241,-0.031597,0.034666,0.044632,0.047169,0.014169,-0.058599,-0.163933
sex,-0.08305,1.0,-0.043662,0.025503,0.098205,0.0647,0.119127,0.083913,0.149635,0.080466,...,-0.058134,0.065911,-0.110144,0.083473,0.146305,0.058178,0.282696,0.320785,0.139547,0.021336
age,0.08717,-0.043662,1.0,-0.025848,-0.00247,-0.005631,-0.107832,-0.12105,-0.07177,-0.050846,...,-0.265497,0.013115,0.17881,-0.020559,-0.00491,0.112805,0.134768,0.086357,-0.00875,0.149998
address,-0.35452,0.025503,-0.025848,1.0,0.046113,-0.094635,0.19032,0.141493,0.159761,-0.006535,...,0.076706,0.175794,-0.030939,-0.033897,-0.036647,0.015475,-0.047304,-0.012416,0.003787,0.073653
famsize,0.022252,0.098205,-0.00247,0.046113,1.0,-0.239608,-0.014325,-0.039538,0.0196,-0.059443,...,0.004523,0.013357,-0.032936,0.004641,-0.021257,-0.004312,0.060482,0.081958,0.002448,0.004645
Pstatus,0.02812,0.0647,-0.005631,-0.094635,-0.239608,1.0,-0.057174,-0.031856,-0.028874,0.054306,...,0.022726,0.059754,-0.053828,0.051303,0.037585,0.031086,0.041513,0.070976,0.012638,-0.117492
Medu,-0.254787,0.119127,-0.107832,0.19032,-0.014325,-0.057174,1.0,0.647477,0.459337,0.152582,...,0.213896,0.266052,-0.030992,0.024421,-0.019686,0.009536,-0.007018,-0.019766,0.004614,-0.008577
Fedu,-0.209806,0.083913,-0.12105,0.141493,-0.039538,-0.031856,0.647477,1.0,0.290703,0.211604,...,0.191735,0.183483,-0.067675,0.020256,0.006841,0.02769,6.1e-05,0.038445,0.04491,0.029859
Mjob,-0.206829,0.149635,-0.07177,0.159761,0.0196,-0.028874,0.459337,0.290703,1.0,0.202651,...,0.148116,0.260658,-0.074286,0.025049,0.053927,0.003182,0.049576,0.025657,0.081525,0.028519
Fjob,-0.081872,0.080466,-0.050846,-0.006535,-0.059443,0.054306,0.152582,0.211604,0.202651,1.0,...,0.089929,0.088625,-0.002835,0.0395,-0.037952,-0.031913,0.055389,0.044607,-0.025069,-0.047477


In [76]:
# Identify numeric columns (assuming you've already encoded categorical variables)
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Initialize StandardScaler
scaler = StandardScaler()

# Standardize the numeric columns
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
df.corr()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences
school,1.0,-0.08305,0.08717,-0.35452,0.022252,0.02812,-0.254787,-0.209806,-0.206829,-0.081872,...,-0.136112,-0.240486,0.072241,-0.031597,0.034666,0.044632,0.047169,0.014169,-0.058599,-0.163933
sex,-0.08305,1.0,-0.043662,0.025503,0.098205,0.0647,0.119127,0.083913,0.149635,0.080466,...,-0.058134,0.065911,-0.110144,0.083473,0.146305,0.058178,0.282696,0.320785,0.139547,0.021336
age,0.08717,-0.043662,1.0,-0.025848,-0.00247,-0.005631,-0.107832,-0.12105,-0.07177,-0.050846,...,-0.265497,0.013115,0.17881,-0.020559,-0.00491,0.112805,0.134768,0.086357,-0.00875,0.149998
address,-0.35452,0.025503,-0.025848,1.0,0.046113,-0.094635,0.19032,0.141493,0.159761,-0.006535,...,0.076706,0.175794,-0.030939,-0.033897,-0.036647,0.015475,-0.047304,-0.012416,0.003787,0.073653
famsize,0.022252,0.098205,-0.00247,0.046113,1.0,-0.239608,-0.014325,-0.039538,0.0196,-0.059443,...,0.004523,0.013357,-0.032936,0.004641,-0.021257,-0.004312,0.060482,0.081958,0.002448,0.004645
Pstatus,0.02812,0.0647,-0.005631,-0.094635,-0.239608,1.0,-0.057174,-0.031856,-0.028874,0.054306,...,0.022726,0.059754,-0.053828,0.051303,0.037585,0.031086,0.041513,0.070976,0.012638,-0.117492
Medu,-0.254787,0.119127,-0.107832,0.19032,-0.014325,-0.057174,1.0,0.647477,0.459337,0.152582,...,0.213896,0.266052,-0.030992,0.024421,-0.019686,0.009536,-0.007018,-0.019766,0.004614,-0.008577
Fedu,-0.209806,0.083913,-0.12105,0.141493,-0.039538,-0.031856,0.647477,1.0,0.290703,0.211604,...,0.191735,0.183483,-0.067675,0.020256,0.006841,0.02769,6.1e-05,0.038445,0.04491,0.029859
Mjob,-0.206829,0.149635,-0.07177,0.159761,0.0196,-0.028874,0.459337,0.290703,1.0,0.202651,...,0.148116,0.260658,-0.074286,0.025049,0.053927,0.003182,0.049576,0.025657,0.081525,0.028519
Fjob,-0.081872,0.080466,-0.050846,-0.006535,-0.059443,0.054306,0.152582,0.211604,0.202651,1.0,...,0.089929,0.088625,-0.002835,0.0395,-0.037952,-0.031913,0.055389,0.044607,-0.025069,-0.047477
