In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score





In [2]:
# ------------- LOAD DATA -------------
url = './2020_bn_nb_data.txt'

data = pd.read_csv(url,sep='\t')

# ------------- PREPROCESS DATA -------------
'''Converting grades to numbers'''
ordinal_enc = OrdinalEncoder(categories=[['AA','AB','BB','BC','CC','CD','DD','F']]*(data.shape[1]-1))
data.iloc[:,:-1] = ordinal_enc.fit_transform(data.iloc[:,:-1])



# CPT Calculation

In [3]:
def calculate_cpt(data,course, given):
    rows = []
    for grade in data[given].drop_duplicates().values:
        condition = (data[given]==grade).all(axis=1)
        subset = data[condition][course]
        probability = subset.value_counts(normalize = True)
        
        for g,p in probability.items():
            row = {
                given[i]:grade[i]
                for i in range(len(given))
            }

            row['Grade'] = g
            row['Probability'] = p
            rows.append(row)


    return pd.DataFrame(rows)

In [4]:
ph100_cpt = calculate_cpt(data,'PH100', ['EC100', 'IT101',"MA101"])
ph100_cpt

Unnamed: 0,EC100,IT101,MA101,Grade,Probability
0,3.0,2.0,4.0,3.0,0.500000
1,3.0,2.0,4.0,4.0,0.500000
2,4.0,2.0,4.0,3.0,0.500000
3,4.0,2.0,4.0,5.0,0.500000
4,1.0,1.0,2.0,4.0,0.333333
...,...,...,...,...,...
170,0.0,1.0,0.0,1.0,1.000000
171,6.0,7.0,5.0,6.0,1.000000
172,2.0,0.0,1.0,1.0,1.000000
173,4.0,1.0,2.0,5.0,1.000000


Split Data

In [5]:

X = data.iloc[:,:-1]
Y = data.iloc[:,-1]

# Prediction : Assuming independent

TRAIN MODEL

In [6]:
for _ in range(20):
    x_train, x_test, y_train,y_test = train_test_split(X,Y, test_size = 0.3)
    
    print(f"--------------------- CYCLE {_} ---------------------")
    GNBclassifier = GaussianNB()
    GNBclassifier.fit(x_train,y_train)
    
    y_test_pred = GNBclassifier.predict(x_test)
    print(f"Accuracy of the model: {accuracy_score(y_test,y_test_pred)*100}%")

    probabilities = GNBclassifier.predict_proba(x_test)
    for i in range(min(5,len(probabilities))):
        print(f"Grades: {x_test.iloc[i].values} | Prediction: {y_test_pred[i]} | Pr: {probabilities[i][0]*100}%")
    print("\n")
    


--------------------- CYCLE 0 ---------------------
Accuracy of the model: 97.14285714285714%
Grades: [0.0 0.0 0.0 0.0 1.0 0.0 2.0 0.0] | Prediction: y | Pr: 3.5798570396312304e-29%
Grades: [4.0 4.0 3.0 3.0 4.0 3.0 3.0 2.0] | Prediction: y | Pr: 0.0013760193562383344%
Grades: [7.0 6.0 7.0 6.0 7.0 7.0 5.0 6.0] | Prediction: n | Pr: 99.99999999999343%
Grades: [6.0 5.0 6.0 6.0 6.0 7.0 3.0 5.0] | Prediction: n | Pr: 99.99999988084394%
Grades: [5.0 4.0 4.0 3.0 5.0 6.0 5.0 4.0] | Prediction: n | Pr: 99.74617517416688%


--------------------- CYCLE 1 ---------------------
Accuracy of the model: 98.57142857142858%
Grades: [7.0 5.0 5.0 6.0 5.0 6.0 4.0 5.0] | Prediction: n | Pr: 99.99999869716501%
Grades: [5.0 3.0 1.0 0.0 5.0 5.0 3.0 0.0] | Prediction: y | Pr: 5.851487409937941e-08%
Grades: [1.0 2.0 3.0 2.0 1.0 0.0 2.0 1.0] | Prediction: y | Pr: 1.5738673072502585e-17%
Grades: [2.0 4.0 0.0 3.0 3.0 1.0 3.0 1.0] | Prediction: y | Pr: 1.318340862908544e-13%
Grades: [7.0 6.0 6.0 6.0 7.0 6.0 6.0 6.0]

# Prediction : Assuming dependent

In [7]:
from pgmpy.models import BayesianNetwork
from pgmpy.estimators import ParameterEstimator, MaximumLikelihoodEstimator
from pgmpy.inference import VariableElimination


In [13]:
for _ in range(20):
    train_data, test_data = train_test_split(data, test_size = 0.3)
    
    print(f"--------------------- CYCLE {_} ---------------------")
    model = BayesianNetwork([
        ('EC100','EC160'),
        ('PH100','PH160'),
        ('IT101','IT161'),
        ('PH100','MA101'),
        ('PH100','HS101'),
        ('IT101','QP')
        
    ])
    model.fit(train_data,estimator=MaximumLikelihoodEstimator)
    inference = VariableElimination(model)

    for i,row in test_data.head(5).iterrows():
        try:
            evi = {
            col: row[col] for col in train_data.columns if col!= 'QP'
        }
            query_result = inference.query(variables=['QP'],
                                    evidence=evi)
            print(f"Evidence: {evi} | Result: {query_result}")
        except KeyError as e:
            print(e)
    


--------------------- CYCLE 0 ---------------------
Evidence: {'EC100': 5.0, 'EC160': 6.0, 'IT101': 3.0, 'IT161': 3.0, 'MA101': 5.0, 'PH100': 5.0, 'PH160': 3.0, 'HS101': 2.0} | Result: +-------+-----------+
| QP    |   phi(QP) |
| QP(n) |    0.0333 |
+-------+-----------+
| QP(y) |    0.9667 |
+-------+-----------+
Evidence: {'EC100': 3.0, 'EC160': 1.0, 'IT101': 5.0, 'IT161': 3.0, 'MA101': 3.0, 'PH100': 3.0, 'PH160': 3.0, 'HS101': 5.0} | Result: +-------+-----------+
| QP    |   phi(QP) |
| QP(n) |    0.5909 |
+-------+-----------+
| QP(y) |    0.4091 |
+-------+-----------+
Evidence: {'EC100': 1.0, 'EC160': 0.0, 'IT101': 1.0, 'IT161': 1.0, 'MA101': 2.0, 'PH100': 1.0, 'PH160': 0.0, 'HS101': 1.0} | Result: +-------+-----------+
| QP    |   phi(QP) |
| QP(n) |    0.0000 |
+-------+-----------+
| QP(y) |    1.0000 |
+-------+-----------+
Evidence: {'EC100': 7.0, 'EC160': 6.0, 'IT101': 7.0, 'IT161': 6.0, 'MA101': 7.0, 'PH100': 7.0, 'PH160': 6.0, 'HS101': 7.0} | Result: +-------+-----------