In [49]:
import csv
import random
import math
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix

In [71]:
def loadCsv(filename):
    lines=csv.reader(open(filename, 'r'))
    dataset = list(lines)
    for i in range(len(dataset)):
        dataset[i] = [x for x in dataset[i]]
    return dataset

In [72]:
def mean(numbers):
    return sum(numbers)/float(len(numbers))

In [73]:
def std(numbers):
    if len(numbers) == 1:
        return 0
    else:
        avg = mean(numbers)
        variance = sum([pow(x-avg,2) for x in numbers]) / float(len(numbers)-1)
        return math.sqrt(variance)

In [74]:
def splitData(dataset, sRatio):
    trainSize = int(len(dataset) * sRatio)
    trainSet = []
    copy = list(dataset)
    while len(trainSet) < trainSize:
        index = random.randrange(len(copy))
        trainSet.append(copy.pop(index))
    return [trainSet, copy]

In [54]:
def process(dataset):
    foreveryclass=[]
    for attribute in zip(*dataset):
        x = mean(attribute)
        y = std(attribute)
        foreveryclass.append([x,y])
    del foreveryclass[-1]
    return foreveryclass

In [55]:
def ClassData(dataset):
    classdivision = {}
    for i in range(len(dataset)):
        vector = dataset[i]
        if(vector[-1] not in classdivision):
            classdivision[vector[-1]] = []
        classdivision[vector[-1]].append(vector)
    return classdivision

In [56]:
def summary(dataset):
    divided = ClassData(dataset)
    PValues = {}
    for classValue, instances in divided.items():
        PValues[classValue] = process(instances)
    return PValues

In [57]:
def Prob(x, mean, stdev):
    exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))
    return (1 / (math.sqrt(2*math.pi) * stdev)) * exponent

In [58]:
def ClassProb(ProcessValues, inputVector):
    probabilities = {}
    for classValue, classSummaries in ProcessValues.items():
        probabilities[classValue] = 1
        for i in range(len(classSummaries)):
            mean, stdev = classSummaries[i]
            x = inputVector[i]
            probabilities[classValue] *=Prob(x,mean,stdev)
    return probabilities

In [59]:
def predict(ProcessValues , inputVector):
    probabilities = ClassProb(ProcessValues, inputVector)
    bestLabel , bestProb = None,-1
    for classValue , probability in probabilities.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue
    return bestLabel

In [60]:
def getPredictions(ProcessValue , testSet):
    predictions = []
    y_true = []
    for i in range(len(testSet)):
        result = predict(ProcessValue , testSet[i])
        predictions.append(result)
    for i in range(len(testSet)):
        vector = testSet[i]
        y_true.append(vector[-1])
    return [y_true , predictions]

In [61]:
def getAccuracy(testSet , predictions):
    correct = 0
    for i in range(len(testSet)):
        if testSet[i][-1] == predictions[i]:
            correct +=1
    return (correct/float(len(testSet))) * 100.0

In [62]:
def cnvrt2flot(dataset):
    d=[]
    for i in dataset:
        d.append(list(map(float,i)))
    return d

In [112]:


def main(dataset):
    sRatio = 0.80
    #dataset = loadCsv(file)
    #dataset = dataset[1:]
    #dataset = cnvrt2flot(dataset)

    df = pd.read_csv(file)
    training, test = splitData(dataset, sRatio)
    redshift = [i[-2] for i in test]
    redshift_ranges = [1 if i <= 0.033 else 2 if i >=
                       0.004 else 3 for i in redshift]
    [i.pop(-2) for i in training]
    [i.pop(-2) for i in test]
    PV = summary(training)
    y_true, predictions = getPredictions(PV, test)
    rs = pd.Series(redshift_ranges)
    p = pd.Series(predictions)
    print("corr:{}".format(rs.corr(p)))
    cm = confusion_matrix(y_true, predictions)
    # print('\n\n Confusion Matrix \n')
    # print('\n'.join([''.join(['{:4}'.format(item) for item in row]) for row in cm]))
    FP = cm.sum(axis=0) - np.diag(cm)
    FN = cm.sum(axis=1) - np.diag(cm)
    TP = np.diag(cm)
    TN = cm.sum() - (FP + FN + TP)
    # print('False Positives\n{}'.format(FP))
    # print('False Negatives\n{}'.format(FN))
    # print('True Positives\n{}'.format(TP))
    # print('True Negatives\n{}'.format(TN))
    # TPR = TP/(TP+FN)
    # print('Sensitivity \n {}'.format(TPR))
    # TNR = TN/(TN+FP)
    # print('Specificity \n {}'.format(TNR))
    # Precision = TP/(TP+FP)
    # print('Precision \n{}'.format(Precision))
    # Recall = TP/(TP+FN)
    # print('Recall \n{}'.format(Recall))
    Acc = (TP+TN)/(TP+TN+FP+FN)
    # print('Accuracy \n{}'.format(Acc))
    # Fscore = 2*(Precision*Recall)/(Precision+Recall)
    # print('Fscore \n{}'.format(Fscore))
    return Acc


In [117]:
if __name__ == "__main__":
    file = 'cat1.csv'
    df = pd.read_csv(file)
    df.drop(labels=['galex_objid', 'sdss_objid'], axis=1)
    class_column = df['class']
    srs_column = df['spectrometric_redshift']
    df = df.drop(labels=['spectrometric_redshift', 'pred', 'class'], axis=1)
    df['spectrometric_redshift'] = srs_column
    df['class'] = class_column
    
    df4 = df.drop(labels=['extinction_u','extinction_g','extinction_r','extinction_i','extinction_z'],axis=1)
    df2 = df.drop(labels=['nuv-u','nuv-g','nuv-r','nuv-i','nuv-z','u-g','u-r','u-i','u-z','g-r','g-i','g-z','r-i','r-z','i-z'],axis=1)
    df3 = df.drop(labels=['fuv-nuv','fuv-u','fuv-g','fuv-r','fuv-i','fuv-z'],axis=1)
    df1 = df.drop(labels=['u','g','i','z','extinction_u','extinction_g','extinction_r','extinction_i','extinction_z','nuv-u','nuv-g','nuv-i','nuv-z','u-g','u-r','u-i','u-z','g-r','g-i','g-z','r-i','r-z','i-z','fuv-u','fuv-g','fuv-r','fuv-i','fuv-z'],axis=1)
    
        #dataset = df1.values.tolist()
    print("========================")
    dataset = df.values.tolist()
    random.seed(41)
    Acc = main(dataset)
    print("All Columns")
    print('Accuracy \n{}'.format(Acc[0]))
        #print("seed:"+str(i))
    print("-------------------------")
        
    Acc = main(df1.values.tolist())
    print("First set of Columns")
    print('Accuracy \n{}'.format(Acc[0]))
        #print("seed:"+str(i))
    print("-------------------------")
    Acc = main(df2.values.tolist())
    print("Second set of Columns")
    print('Accuracy \n{}'.format(Acc[0]))
        #print("seed:"+str(i))
    print("-------------------------")
    Acc = main(df3.values.tolist())
    print("Third set of Columns")
    print('Accuracy \n{}'.format(Acc[0]))
        #print("seed:"+str(i))
    print("-------------------------")
    Acc = main(df4.values.tolist())
    print("Fourth set of Columns")
    print('Accuracy \n{}'.format(Acc[0]))
        #print("seed:"+str(i))
    print("-------------------------")




corr:0.711351389345835
All Columns
Accuracy 
0.9384615384615385
-------------------------
corr:0.46004420222127024
First set of Columns
Accuracy 
0.9307692307692308
-------------------------
corr:0.5343059528700057
Second set of Columns
Accuracy 
0.9
-------------------------
corr:0.36533101260154843
Second set of Columns
Accuracy 
0.8923076923076924
-------------------------
corr:0.581834629888877
Second set of Columns
Accuracy 
0.9461538461538461
-------------------------


In [115]:

from mlxtend.evaluate import bias_variance_decomp
from sklearn.naive_bayes import GaussianNB



dataset=df1.values.tolist()
X = np.array(df1.drop('class',axis=1).values)
y = np.array(df1['class'].values)
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3,random_state=41,stratify=y)

model=GaussianNB()
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(model,X_train, X_test, y_train, y_test,loss='0-1_loss',
                                                            random_seed=41)

print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)

IndexError: index 448 is out of bounds for axis 0 with size 195

In [106]:
from sklearn.naive_bayes import GaussianNB
from sklearn import model_selection
clf = GaussianNB()
dataset=df1.values.tolist()
X = np.array(df1.drop('class',axis=1).values)
y = np.array(df1['class'].values)
scores = model_selection.cross_val_score(clf, X, y, cv=4, scoring='accuracy')
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))

Accuracy: 0.92 (+/- 0.00)


In [111]:
if __name__ == "__main__":
    file = 'cat1.csv'
    df = pd.read_csv(file)
    class_column = df['class']
    df = df.drop(labels=['spectrometric_redshift','pred','class'],axis=1)
    df['class'] = class_column
    df4 = df.drop(labels=['extinction_u','extinction_g','extinction_r','extinction_i','extinction_z'],axis=1)
    df2 = df.drop(labels=['nuv-u','nuv-g','nuv-r','nuv-i','nuv-z','u-g','u-r','u-i','u-z','g-r','g-i','g-z','r-i','r-z','i-z'],axis=1)
    df3 = df.drop(labels=['fuv-nuv','fuv-u','fuv-g','fuv-r','fuv-i','fuv-z'],axis=1)
    df1 = df.drop(labels=['u','g','i','z','extinction_u','extinction_g','extinction_r','extinction_i','extinction_z','nuv-u','nuv-g','nuv-i','nuv-z','u-g','u-r','u-i','u-z','g-r','g-i','g-z','r-i','r-z','i-z','fuv-u','fuv-g','fuv-r','fuv-i','fuv-z'],axis=1)
    
    print("========================")
    dataset = df.values.tolist()
    Acc = main(dataset)
    print("All Columns")
    print('Accuracy \n{}'.format(Acc[0]))
    #print("seed:"+str(i))
    print("-------------------------")
    Acc = main(df1.values.tolist(random))
    print("First set of Columns")
    print('Accuracy \n{}'.format(Acc[0]))
    #print("seed:"+str(i))
    print("-------------------------")
    Acc = main(df2.values.tolist())
    print("Second set of Columns")
    print('Accuracy \n{}'.format(Acc[0]))
    #print("seed:"+str(i))
    print("-------------------------")
    Acc = main(df3.values.tolist())
    print("Second set of Columns")
    print('Accuracy \n{}'.format(Acc[0]))
    #print("seed:"+str(i))
    print("-------------------------")
    Acc = main(df4.values.tolist())
    print("Second set of Columns")
    print('Accuracy \n{}'.format(Acc[0]))
    #print("seed:"+str(i))
    print("-------------------------")






TypeError: 'list' object is not callable