# Ejemplo de Machine Learning de la materia Sistemas Embebidos
## Predicción calificación alumnos

Este proyecto trata de predecir la calificación de los alumnos con algoritmos de Machine Learning (Random Forest y SVM)

In [70]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [71]:
datos = pd.read_csv('estudiantesDS/student_performance_updated_1000.csv')
datos.columns

Index(['StudentID', 'Name', 'Gender', 'AttendanceRate', 'StudyHoursPerWeek',
       'PreviousGrade', 'ExtracurricularActivities', 'ParentalSupport',
       'FinalGrade', 'Study Hours', 'Attendance (%)', 'Online Classes Taken'],
      dtype='object')

In [72]:
datos.head()

Unnamed: 0,StudentID,Name,Gender,AttendanceRate,StudyHoursPerWeek,PreviousGrade,ExtracurricularActivities,ParentalSupport,FinalGrade,Study Hours,Attendance (%),Online Classes Taken
0,1.0,John,Male,85.0,15.0,78.0,1.0,High,80.0,4.8,59.0,False
1,2.0,Sarah,Female,90.0,20.0,85.0,2.0,Medium,87.0,2.2,70.0,True
2,3.0,Alex,Male,78.0,10.0,65.0,0.0,Low,68.0,4.6,92.0,False
3,4.0,Michael,Male,92.0,25.0,90.0,3.0,High,92.0,2.9,96.0,False
4,5.0,Emma,Female,,18.0,82.0,2.0,Medium,85.0,4.1,97.0,True


In [74]:
from sklearn.utils import resample

df = pd.read_csv('estudiantesDS/student_performance_updated_1000.csv')
clase_mayoritaria = df[df['FinalGrade'] >= 70.0]
clase_minoritaria = df[df['FinalGrade'] < 70.0]
    
# Submuestrear la clase mayoritaria
clase_mayoritaria_balanced = resample(clase_mayoritaria,
                                        replace=False,    # Sin reemplazo
                                        n_samples=len(clase_minoritaria),  # Igualar tamaños
                                        random_state=42)
    
# Combinar
df_balanced = pd.concat([clase_mayoritaria_balanced, clase_minoritaria])
datos_balanceados = df_balanced.sample(frac=1, random_state=42)  # Mezclar
datos_balanceados
datos = datos_balanceados

In [75]:
datos.head()

Unnamed: 0,StudentID,Name,Gender,AttendanceRate,StudyHoursPerWeek,PreviousGrade,ExtracurricularActivities,ParentalSupport,FinalGrade,Study Hours,Attendance (%),Online Classes Taken
52,5930.0,Maria Cross,Female,82.0,,82.0,,High,62.0,3.9,88.0,False
342,2493.0,Mrs. Heather Roy,Female,78.0,30.0,60.0,1.0,High,78.0,0.2,78.0,False
888,,Jordan Thornton,Female,78.0,17.0,85.0,1.0,Low,72.0,1.2,63.0,True
683,5965.0,Kenneth Jones,,85.0,8.0,65.0,2.0,Low,62.0,2.4,57.0,False
650,8306.0,David Jimenez,Female,90.0,30.0,78.0,3.0,Low,92.0,2.9,64.0,False


In [76]:
datos = datos.dropna()
datos

Unnamed: 0,StudentID,Name,Gender,AttendanceRate,StudyHoursPerWeek,PreviousGrade,ExtracurricularActivities,ParentalSupport,FinalGrade,Study Hours,Attendance (%),Online Classes Taken
342,2493.0,Mrs. Heather Roy,Female,78.0,30.0,60.0,1.0,High,78.0,0.2,78.0,False
650,8306.0,David Jimenez,Female,90.0,30.0,78.0,3.0,Low,92.0,2.9,64.0,False
2,3.0,Alex,Male,78.0,10.0,65.0,0.0,Low,68.0,4.6,92.0,False
122,1258.0,Courtney Haas,Male,88.0,15.0,65.0,0.0,Low,87.0,3.7,80.0,False
818,3248.0,James Macdonald,Female,90.0,30.0,90.0,2.0,Medium,92.0,1.8,73.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...
29,4510.0,Daniel Erickson,Male,91.0,22.0,77.0,3.0,Medium,62.0,1.0,56.0,False
723,1871.0,Deborah Reyes,Male,78.0,15.0,90.0,0.0,High,78.0,1.3,95.0,False
617,5648.0,Craig Mccann,Female,85.0,10.0,90.0,0.0,Low,85.0,2.6,76.0,True
874,7297.0,Melissa Donaldson,Female,91.0,20.0,70.0,3.0,Medium,68.0,4.1,66.0,False


In [82]:
#datos_utiles = datos[['AttendanceRate', 'StudyHoursPerWeek', 'PreviousGrade', 'ExtracurricularActivities','Attendance (%)']].copy()
datos_utiles = datos[['StudyHoursPerWeek', 'PreviousGrade']].copy()

In [83]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# Si no eligen las siguientes columnas no es necesario correr el siguiente bloque

In [79]:
datos_utiles['Gender_p'] = le.fit_transform(datos['Gender'].values)
datos_utiles['ParentalSupport_p'] = le.fit_transform(datos['ParentalSupport'].values)
datos_utiles['Online Classes Taken_p'] = le.fit_transform(datos['Online Classes Taken'].values)

In [84]:
datos_utiles.head()

Unnamed: 0,StudyHoursPerWeek,PreviousGrade
342,30.0,60.0
650,30.0,78.0
2,10.0,65.0
122,15.0,65.0
818,30.0,90.0


In [85]:
calificacion = pd.cut(datos['FinalGrade'], bins=[0,70,101],labels=['NoAprobado','Aprobado'],right=False)
calificacion[calificacion=='Aprobado'].count()

np.int64(122)

In [86]:
calificacion = le.fit_transform(calificacion.values)
calificacion


array([0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0,
       0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 1, 0])

In [87]:
from sklearn.metrics import classification_report, accuracy_score

X_train, X_test, y_train, y_test = train_test_split(datos_utiles, calificacion, test_size=0.33, shuffle=True)


In [88]:
print(len(y_test))
y_test

82


array([0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0,
       1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0])

In [89]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [90]:
y_pred = rf.predict(X_test)
cr = classification_report(y_test, y_pred)
accuracy_score(y_test, y_pred)

0.5365853658536586

In [91]:
print(cr)

              precision    recall  f1-score   support

           0       0.57      0.52      0.55        44
           1       0.50      0.55      0.53        38

    accuracy                           0.54        82
   macro avg       0.54      0.54      0.54        82
weighted avg       0.54      0.54      0.54        82



In [67]:
(y_pred, y_test)

(array([0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0,
        1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1,
        0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1,
        1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0]),
 array([1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1,
        0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0,
        1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0,
        0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1]))

In [86]:
print(cr)

              precision    recall  f1-score   support

           0       0.55      0.57      0.56        40
           1       0.57      0.55      0.56        42

    accuracy                           0.56        82
   macro avg       0.56      0.56      0.56        82
weighted avg       0.56      0.56      0.56        82



Comprobamos que hay mas aprobados que reprobados

In [68]:
import numpy as np
np.sum(calificacion==0)

np.int64(122)

In [69]:
from sklearn.svm import SVC
svm = SVC()
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)

accuracy_score(y_test, y_pred)

0.5