# Ejemplo de Machine Learning de la materia Sistemas Embebidos
## Predicción calificación alumnos

Este proyecto trata de predecir la calificación de los alumnos con algoritmos de Machine Learning (Random Forest y SVM)

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [2]:
datos = pd.read_csv('estudiantesDS/student_performance_updated_1000.csv')
datos.columns

Index(['StudentID', 'Name', 'Gender', 'AttendanceRate', 'StudyHoursPerWeek',
       'PreviousGrade', 'ExtracurricularActivities', 'ParentalSupport',
       'FinalGrade', 'Study Hours', 'Attendance (%)', 'Online Classes Taken'],
      dtype='object')

In [3]:
datos.head()

Unnamed: 0,StudentID,Name,Gender,AttendanceRate,StudyHoursPerWeek,PreviousGrade,ExtracurricularActivities,ParentalSupport,FinalGrade,Study Hours,Attendance (%),Online Classes Taken
0,1.0,John,Male,85.0,15.0,78.0,1.0,High,80.0,4.8,59.0,False
1,2.0,Sarah,Female,90.0,20.0,85.0,2.0,Medium,87.0,2.2,70.0,True
2,3.0,Alex,Male,78.0,10.0,65.0,0.0,Low,68.0,4.6,92.0,False
3,4.0,Michael,Male,92.0,25.0,90.0,3.0,High,92.0,2.9,96.0,False
4,5.0,Emma,Female,,18.0,82.0,2.0,Medium,85.0,4.1,97.0,True


In [75]:
from sklearn.utils import resample

df = pd.read_csv('estudiantesDS/student_performance_updated_1000.csv')
clase_mayoritaria = df[df['FinalGrade'] >= 70.0]
clase_minoritaria = df[df['FinalGrade'] < 70.0]
    
# Submuestrear la clase mayoritaria
clase_mayoritaria_balanced = resample(clase_mayoritaria,
                                        replace=False,    # Sin reemplazo
                                        n_samples=len(clase_minoritaria),  # Igualar tamaños
                                        random_state=42)
    
# Combinar
df_balanced = pd.concat([clase_mayoritaria_balanced, clase_minoritaria])
datos_balanceados = df_balanced.sample(frac=1, random_state=42)  # Mezclar
datos_balanceados
datos = datos_balanceados

In [3]:
datos.head()

Unnamed: 0,StudentID,Name,Gender,AttendanceRate,StudyHoursPerWeek,PreviousGrade,ExtracurricularActivities,ParentalSupport,FinalGrade,Study Hours,Attendance (%),Online Classes Taken
0,1.0,John,Male,85.0,15.0,78.0,1.0,High,80.0,4.8,59.0,False
1,2.0,Sarah,Female,90.0,20.0,85.0,2.0,Medium,87.0,2.2,70.0,True
2,3.0,Alex,Male,78.0,10.0,65.0,0.0,Low,68.0,4.6,92.0,False
3,4.0,Michael,Male,92.0,25.0,90.0,3.0,High,92.0,2.9,96.0,False
4,5.0,Emma,Female,,18.0,82.0,2.0,Medium,85.0,4.1,97.0,True


In [4]:
datos = datos.dropna()
datos

Unnamed: 0,StudentID,Name,Gender,AttendanceRate,StudyHoursPerWeek,PreviousGrade,ExtracurricularActivities,ParentalSupport,FinalGrade,Study Hours,Attendance (%),Online Classes Taken
0,1.0,John,Male,85.0,15.0,78.0,1.0,High,80.0,4.8,59.0,False
1,2.0,Sarah,Female,90.0,20.0,85.0,2.0,Medium,87.0,2.2,70.0,True
2,3.0,Alex,Male,78.0,10.0,65.0,0.0,Low,68.0,4.6,92.0,False
3,4.0,Michael,Male,92.0,25.0,90.0,3.0,High,92.0,2.9,96.0,False
6,7.0,Daniel,Male,70.0,8.0,60.0,0.0,Low,62.0,4.5,96.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...
989,2116.0,Kimberly Pena,Female,91.0,30.0,88.0,2.0,High,68.0,3.6,79.0,False
991,7701.0,Anna Martinez,Male,85.0,30.0,70.0,3.0,Medium,90.0,0.4,76.0,False
993,3592.0,Monica Johnson,Female,90.0,25.0,60.0,1.0,Low,87.0,1.7,79.0,False
994,2787.0,Shannon Porter,Male,78.0,20.0,60.0,0.0,High,62.0,1.6,70.0,False


In [6]:
datos_utiles = datos[['AttendanceRate', 'StudyHoursPerWeek', 'PreviousGrade', 'ExtracurricularActivities','Attendance (%)']].copy()

In [7]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [8]:
datos_utiles['Gender_p'] = le.fit_transform(datos['Gender'].values)
datos_utiles['ParentalSupport_p'] = le.fit_transform(datos['ParentalSupport'].values)
datos_utiles['Online Classes Taken_p'] = le.fit_transform(datos['Online Classes Taken'].values)

In [9]:
datos_utiles.head()

Unnamed: 0,AttendanceRate,StudyHoursPerWeek,PreviousGrade,ExtracurricularActivities,Attendance (%),Gender_p,ParentalSupport_p,Online Classes Taken_p
0,85.0,15.0,78.0,1.0,59.0,1,0,0
1,90.0,20.0,85.0,2.0,70.0,0,2,1
2,78.0,10.0,65.0,0.0,92.0,1,1,0
3,92.0,25.0,90.0,3.0,96.0,1,0,0
6,70.0,8.0,60.0,0.0,96.0,1,1,0


In [11]:
calificacion = pd.cut(datos['FinalGrade'], bins=[0,70,101],labels=['NoAprobado','Aprobado'],right=False)
calificacion = le.fit_transform(calificacion.values)
calificacion

array([0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,

In [13]:
from sklearn.metrics import classification_report, accuracy_score

X_train, X_test, y_train, y_test = train_test_split(datos_utiles, calificacion, test_size=0.33)
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [14]:
y_pred = rf.predict(X_test)
cr = classification_report(y_test, y_pred)
accuracy_score(y_test, y_pred)

0.7934272300469484

In [84]:
print(cr)

              precision    recall  f1-score   support

           0       0.55      0.57      0.56        40
           1       0.57      0.55      0.56        42

    accuracy                           0.56        82
   macro avg       0.56      0.56      0.56        82
weighted avg       0.56      0.56      0.56        82



In [85]:
(y_pred, y_test)

(array([1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0,
        1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
        0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1]),
 array([1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0,
        1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1,
        1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
        0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1]))

In [86]:
print(cr)

              precision    recall  f1-score   support

           0       0.55      0.57      0.56        40
           1       0.57      0.55      0.56        42

    accuracy                           0.56        82
   macro avg       0.56      0.56      0.56        82
weighted avg       0.56      0.56      0.56        82



Comprobamos que hay mas aprobados que reprobados

In [88]:
import numpy as np
np.sum(calificacion==0)

np.int64(122)

In [89]:
from sklearn.svm import SVC
svm = SVC()
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)

accuracy_score(y_test, y_pred)

0.5121951219512195