In [20]:
import csv
import random
import math
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix

In [21]:
def loadCsv(filename):
    lines=csv.reader(open(filename, 'r'))
    dataset = list(lines)
    for i in range(len(dataset)):
        dataset[i] = [x for x in dataset[i]]
    return dataset

In [22]:
def mean(numbers):
    return sum(numbers)/float(len(numbers))

In [23]:
def std(numbers):
    if len(numbers) == 1:
        return 0
    else:
        avg = mean(numbers)
        variance = sum([pow(x-avg,2) for x in numbers]) / float(len(numbers)-1)
        return math.sqrt(variance)

In [24]:
def splitData(dataset, sRatio):
    trainSize = int(len(dataset) * sRatio)
    trainSet = []
    copy = list(dataset)
    while len(trainSet) < trainSize:
        index = random.randrange(len(copy))
        trainSet.append(copy.pop(index))
    return [trainSet, copy]

In [25]:
def process(dataset):
    foreveryclass=[]
    for attribute in zip(*dataset):
        x = mean(attribute)
        y = std(attribute)
        foreveryclass.append([x,y])
    del foreveryclass[-1]
    return foreveryclass

In [26]:
def ClassData(dataset):
    classdivision = {}
    for i in range(len(dataset)):
        vector = dataset[i]
        if(vector[-1] not in classdivision):
            classdivision[vector[-1]] = []
        classdivision[vector[-1]].append(vector)
    return classdivision

In [27]:
def summary(dataset):
    divided = ClassData(dataset)
    PValues = {}
    for classValue, instances in divided.items():
        PValues[classValue] = process(instances)
    return PValues

In [28]:
def Prob(x, mean, stdev):
    exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))
    return (1 / (math.sqrt(2*math.pi) * stdev)) * exponent

In [29]:
def ClassProb(ProcessValues, inputVector):
    probabilities = {}
    for classValue, classSummaries in ProcessValues.items():
        probabilities[classValue] = 1
        for i in range(len(classSummaries)):
            mean, stdev = classSummaries[i]
            x = inputVector[i]
            probabilities[classValue] *=Prob(x,mean,stdev)
    return probabilities

In [30]:
def predict(ProcessValues , inputVector):
    probabilities = ClassProb(ProcessValues, inputVector)
    bestLabel , bestProb = None,-1
    for classValue , probability in probabilities.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue
    return bestLabel

In [31]:
def getPredictions(ProcessValue , testSet):
    predictions = []
    y_true = []
    for i in range(len(testSet)):
        result = predict(ProcessValue , testSet[i])
        predictions.append(result)
    for i in range(len(testSet)):
        vector = testSet[i]
        y_true.append(vector[-1])
    return [y_true , predictions]

In [32]:
def getAccuracy(testSet , predictions):
    correct = 0
    for i in range(len(testSet)):
        if testSet[i][-1] == predictions[i]:
            correct +=1
    return (correct/float(len(testSet))) * 100.0

In [33]:
def cnvrt2flot(dataset):
    d=[]
    for i in dataset:
        d.append(list(map(float,i)))
    return d

In [72]:


def main(dataset):
    sRatio = 0.80
    #dataset = loadCsv(file)
    #dataset = dataset[1:]
    #dataset = cnvrt2flot(dataset)

    df = pd.read_csv(file)
    training, test = splitData(dataset, sRatio)
    redshift = [i[-2] for i in test]
    redshift_ranges = [1 if i <= 0.033 else 2 if i >=
                       0.004 else 3 for i in redshift]
    [i.pop(-2) for i in training]
    [i.pop(-2) for i in test]
    PV = summary(training)
    y_true, predictions = getPredictions(PV, test)
    rs = pd.Series(redshift_ranges)
    p = pd.Series(predictions)
    corrl = rs.corr(p)
    cm = confusion_matrix(y_true, predictions)
    # print('\n\n Confusion Matrix \n')
    # print('\n'.join([''.join(['{:4}'.format(item) for item in row]) for row in cm]))
    FP = cm.sum(axis=0) - np.diag(cm)
    FN = cm.sum(axis=1) - np.diag(cm)
    TP = np.diag(cm)
    TN = cm.sum() - (FP + FN + TP)
    # print('False Positives\n{}'.format(FP))
    # print('False Negatives\n{}'.format(FN))
    # print('True Positives\n{}'.format(TP))
    # print('True Negatives\n{}'.format(TN))
    # TPR = TP/(TP+FN)
    # print('Sensitivity \n {}'.format(TPR))
    # TNR = TN/(TN+FP)
    # print('Specificity \n {}'.format(TNR))
    #Precision = TP/(TP+FP)
    # print('Precision \n{}'.format(Precision))
    #Recall = TP/(TP+FN)
    # print('Recall \n{}'.format(Recall))
    Acc = (TP+TN)/(TP+TN+FP+FN)
    # print('Accuracy \n{}'.format(Acc))
    # Fscore = 2*(Precision*Recall)/(Precision+Recall)
    # print('Fscore \n{}'.format(Fscore))
    return Acc,corrl


In [38]:
from sklearn.naive_bayes import GaussianNB
from sklearn import model_selection
clf = GaussianNB()
dataset=df1.values.tolist()
X = np.array(df1.drop('class',axis=1).values)
y = np.array(df1['class'].values)
scores = model_selection.cross_val_score(clf, X, y, cv=4, scoring='accuracy')
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))

Accuracy: 0.92 (+/- 0.00)


In [79]:
import pandas as pd
if __name__ == "__main__":
    file = 'cat1.csv'
    df = pd.read_csv(file)
    df.drop(labels=['galex_objid', 'sdss_objid'], axis=1, inplace=True)
    class_column = df['class']
    srs_column = df['spectrometric_redshift']
    df = df.drop(labels=['spectrometric_redshift', 'pred', 'class'], axis=1)
    df['spectrometric_redshift'] = srs_column
    df['class'] = class_column
    
    df4 = df.drop(labels=['extinction_u','extinction_g','extinction_r','extinction_i','extinction_z'],axis=1)
    df2 = df.drop(labels=['nuv-u','nuv-g','nuv-r','nuv-i','nuv-z','u-g','u-r','u-i','u-z','g-r','g-i','g-z','r-i','r-z','i-z'],axis=1)
    df3 = df.drop(labels=['fuv-nuv','fuv-u','fuv-g','fuv-r','fuv-i','fuv-z'],axis=1)
    df1 = df.drop(labels=['u','g','i','z','extinction_u','extinction_g','extinction_r','extinction_i','extinction_z','nuv-u','nuv-g','nuv-i','nuv-z','u-g','u-r','u-i','u-z','g-r','g-i','g-z','r-i','r-z','i-z','fuv-nuv','fuv-u','fuv-g','fuv-r','fuv-i','fuv-z'],axis=1)
    dataset = df.values.tolist()
    random.seed(41)
    
    df_results = []
    Acc,corrl = main(dataset)
    Acc1,corrl1 = main(df1.values.tolist())
    Acc2,corrl2 = main(df2.values.tolist())
    Acc3,corrl3 = main(df3.values.tolist())
    Acc4,corrl4 = main(df4.values.tolist())
    data = {'DataSet':['df','df1','df2','df3','df4'], 'Accuracy':[Acc[0],Acc1[0],Acc2[0],Acc3[0],Acc4[0]], 'Correlation':[corrl,corrl1,corrl2,corrl3,corrl4]}
    df_results = pd.DataFrame(data)
    print(df_results)


  DataSet  Accuracy  Correlation
0      df  0.930769     0.679803
1     df1  0.900000     0.379566
2     df2  0.892308     0.516984
3     df3  0.900000     0.384900
4     df4  0.953846     0.661524


In [94]:

from sklearn.naive_bayes import GaussianNB
X = np.array(df.drop('class',axis=1).values)
y = np.array(df['class'].values)
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3,random_state=41,stratify=y)
model = GaussianNB()
model.fit(X,y)

avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        model, X_train, y_train, X_test, y_test, 
        loss='0-1_loss',
        random_seed=41)
avg_loss=avg_expected_loss
avg_bias=avg_bias
avg_var=avg_var
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)

Average expected loss: 0.014
Average bias: 0.015
Average variance: 0.005


In [95]:
from sklearn.naive_bayes import GaussianNB
X = np.array(df1.drop('class',axis=1).values)
y = np.array(df1['class'].values)
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3,random_state=41,stratify=y)
model = GaussianNB()
model.fit(X,y)

avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        model, X_train, y_train, X_test, y_test, 
        loss='0-1_loss',
        random_seed=41)
avg_loss1=avg_expected_loss
avg_bias1=avg_bias
avg_var1=avg_var
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)

Average expected loss: 0.007
Average bias: 0.005
Average variance: 0.003


In [96]:
from sklearn.naive_bayes import GaussianNB
X = np.array(df2.drop('class',axis=1).values)
y = np.array(df2['class'].values)
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3,random_state=41,stratify=y)
model = GaussianNB()
model.fit(X,y)

avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        model, X_train, y_train, X_test, y_test, 
        loss='0-1_loss',
        random_seed=41)
avg_loss2=avg_expected_loss
avg_bias2=avg_bias
avg_var2=avg_var
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)

Average expected loss: 0.024
Average bias: 0.021
Average variance: 0.005


In [97]:
from sklearn.naive_bayes import GaussianNB
X = np.array(df3.drop('class',axis=1).values)
y = np.array(df3['class'].values)
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3,random_state=41,stratify=y)
model = GaussianNB()
model.fit(X,y)

avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        model, X_train, y_train, X_test, y_test, 
        loss='0-1_loss',
        random_seed=41)
avg_loss3=avg_expected_loss
avg_bias3=avg_bias
avg_var3=avg_var
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)

Average expected loss: 0.018
Average bias: 0.021
Average variance: 0.006


In [98]:
from sklearn.naive_bayes import GaussianNB
X = np.array(df4.drop('class',axis=1).values)
y = np.array(df4['class'].values)
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3,random_state=41,stratify=y)
model = GaussianNB()
model.fit(X,y)

avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        model, X_train, y_train, X_test, y_test, 
        loss='0-1_loss',
        random_seed=41)
avg_loss4=avg_expected_loss
avg_bias4=avg_bias
avg_var4=avg_var
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)

Average expected loss: 0.012
Average bias: 0.010
Average variance: 0.005


In [99]:
    bv_results = []
    data = {'DataSet':['df','df1','df2','df3','df4'], 'Avg_exp_loss':[avg_loss,avg_loss1,avg_loss2,avg_loss3,avg_loss4], 'Avg_Bias':[avg_bias,avg_bias1,avg_bias2,avg_bias3,avg_bias4], 'Avg_Variance':[avg_var,avg_var1,avg_var2,avg_var3,avg_var4]}
    bv_results = pd.DataFrame(data)
    print(bv_results)

  DataSet  Avg_exp_loss  Avg_Bias  Avg_Variance
0      df      0.013744  0.010256      0.004846
1     df1      0.006744  0.005128      0.003462
2     df2      0.023513  0.020513      0.004590
3     df3      0.017513  0.020513      0.006128
4     df4      0.011513  0.010256      0.004846
