In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score





In [3]:
# ------------- LOAD DATA -------------
url = './2020_bn_nb_data.txt'

data = pd.read_csv(url,sep='\t')

# ------------- PREPROCESS DATA -------------
'''Converting grades to numbers'''
ordinal_enc = OrdinalEncoder(categories=[['AA','AB','BB','BC','CC','CD','DD','F']]*(data.shape[1]-1))
data.iloc[:,:-1] = ordinal_enc.fit_transform(data.iloc[:,:-1])



# CPT Calculation

In [4]:
def calculate_cpt(data,course, given):
    rows = []
    for grade in data[given].drop_duplicates().values:
        condition = (data[given]==grade).all(axis=1)
        subset = data[condition][course]
        probability = subset.value_counts(normalize = True)
        
        for g,p in probability.items():
            row = {
                given[i]:grade[i]
                for i in range(len(given))
            }

            row['Grade'] = g
            row['Probability'] = p
            rows.append(row)


    return pd.DataFrame(rows)

In [5]:
ph100_cpt = calculate_cpt(data,'PH100', ['EC100', 'IT101',"MA101"])
ph100_cpt

Unnamed: 0,EC100,IT101,MA101,Grade,Probability
0,3.0,2.0,4.0,3.0,0.500000
1,3.0,2.0,4.0,4.0,0.500000
2,4.0,2.0,4.0,3.0,0.500000
3,4.0,2.0,4.0,5.0,0.500000
4,1.0,1.0,2.0,4.0,0.333333
...,...,...,...,...,...
170,0.0,1.0,0.0,1.0,1.000000
171,6.0,7.0,5.0,6.0,1.000000
172,2.0,0.0,1.0,1.0,1.000000
173,4.0,1.0,2.0,5.0,1.000000


Split Data

In [6]:

X = data.iloc[:,:-1]
Y = data.iloc[:,-1]

# Prediction : Assuming independent

TRAIN MODEL

In [7]:
for _ in range(20):
    x_train, x_test, y_train,y_test = train_test_split(X,Y, test_size = 0.3)
    
    print(f"--------------------- CYCLE {_} ---------------------")
    GNBclassifier = GaussianNB()
    GNBclassifier.fit(x_train,y_train)
    
    y_test_pred = GNBclassifier.predict(x_test)
    print(f"Accuracy of the model: {accuracy_score(y_test,y_test_pred)*100}%")

    probabilities = GNBclassifier.predict_proba(x_test)
    for i in range(min(5,len(probabilities))):
        print(f"Grades: {x_test.iloc[i].values} | Prediction: {y_test_pred[i]} | Pr: {probabilities[i][0]*100}%")
    print("\n")
    


--------------------- CYCLE 0 ---------------------
Accuracy of the model: 95.71428571428572%
Grades: [7.0 7.0 7.0 6.0 7.0 7.0 5.0 6.0] | Prediction: n | Pr: 99.99999999999964%
Grades: [6.0 5.0 5.0 3.0 3.0 5.0 0.0 3.0] | Prediction: n | Pr: 82.74654940013818%
Grades: [3.0 3.0 1.0 2.0 2.0 3.0 3.0 2.0] | Prediction: y | Pr: 9.243059732060494e-11%
Grades: [6.0 4.0 4.0 4.0 5.0 4.0 4.0 3.0] | Prediction: n | Pr: 98.78283913967445%
Grades: [4.0 3.0 1.0 3.0 1.0 3.0 2.0 1.0] | Prediction: y | Pr: 6.082769547626058e-10%


--------------------- CYCLE 1 ---------------------
Accuracy of the model: 95.71428571428572%
Grades: [6.0 5.0 6.0 5.0 5.0 5.0 3.0 6.0] | Prediction: n | Pr: 99.99999312525696%
Grades: [4.0 3.0 3.0 3.0 3.0 1.0 2.0 1.0] | Prediction: y | Pr: 8.280667637646849e-10%
Grades: [7.0 6.0 6.0 6.0 6.0 5.0 0.0 6.0] | Prediction: n | Pr: 99.99999995687219%
Grades: [3.0 4.0 1.0 1.0 2.0 0.0 1.0 0.0] | Prediction: y | Pr: 8.648585360608024e-18%
Grades: [5.0 4.0 3.0 4.0 3.0 3.0 3.0 1.0] | Pre

# Prediction : Assuming dependent

In [10]:
from pgmpy.models import BayesianNetwork
from pgmpy.estimators import ParameterEstimator, MaximumLikelihoodEstimator
from pgmpy.inference import VariableElimination


In [11]:
for _ in range(20):
    train_data, test_data = train_test_split(data, test_size = 0.3)
    
    print(f"--------------------- CYCLE {_} ---------------------")
    model = BayesianNetwork([
        ('EC100','EC160'),
        ('PH100','Ph160'),
        ('1T101','IT161'),
        ('PH100','MA101')
        
    ])
    model.fit(train_data,estimator=MaximumLikelihoodEstimator)
    inference = VariableElimination(model)

    for i,row in test_data.iterrows():
        evi = {
            col: row[col] for col in train_data.columns if col!= 'Internship'
        }
        query_result = inference.query(variables=['Internship'],
                                   evidence=evi)
        print(f"Evidence: {evi} | Result: {query_result}")
    


--------------------- CYCLE 0 ---------------------


KeyError: 'Ph160'