In [167]:
import pandas as pd
import numpy as np
import math
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
from collections import Counter

In [168]:
fulldata=pd.read_csv('wine-quality/data.csv', delimiter = ';')

data,testdata=np.split(fulldata,[int(.8*len(fulldata))])

outputdata=data[data.columns[-1]]
inputdata = data[data.columns[0:-1]]

In [169]:
mean=np.mean(inputdata)
stdev=np.std(inputdata)
for i in range(len(inputdata.columns)):
    inputdata[inputdata.columns[i]]= [(1.0 * (c-mean[i]))/stdev[i] for c in inputdata[inputdata.columns[i]]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [170]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [171]:
def weightedinput(theta, x):
    return np.dot(x, theta)

In [172]:
def hx(theta, x):
    return sigmoid(weightedinput(theta, x))

In [173]:
def cost_function(theta, x, y): 
    m = x.shape[0]
    total_cost = -(1 / m) * np.sum( y * np.log(hx(theta, x)) + (1 - y) * np.log(1 - hx(theta, x)))
    return total_cost


In [174]:
def gradient(theta, x, y, alpha,iterations):
    # Computes the gradient of the cost function at the point theta
    cost_history = [0] * iterations
    m = x.shape[0]
    for iteration in range(iterations):
        h= hx(theta,x)
        loss = h-y
        grd = np.dot(X.T,loss) / m
        theta = theta - alpha * grd

        cost = cost_function(theta,x, y)
        cost_history[iteration] = cost
    
    return theta,cost_history

# One vs All

In [175]:
X = np.c_[np.ones((inputdata.shape[0], 1)), inputdata]
allthetas = {}
uniquelabels = outputdata.unique()
i=0;
for label in uniquelabels:
    theta = np.zeros(X.shape[1])
    Y=np.array(outputdata)
    print(label)
    for j in range(len(Y)):
        if Y[j]==label:
            Y[j]=1
        else:
            Y[j]=0
    cost=cost_function(theta, X, Y)

    theta,cost_history = gradient(theta,X,Y,0.009, 10000)
    allthetas[label] = theta
#     print(theta)    

7
6
5
4
8
3
9


In [176]:
allthetas

{7: array([-1.9041058 ,  0.15921054, -0.39709352, -0.13887166,  0.41199906,
        -0.42518872,  0.12000668, -0.04215457, -0.35771207,  0.20854392,
         0.16455905,  0.74074963]),
 6: array([-0.21961053, -0.10943575, -0.34941233,  0.04094271,  0.08553345,
         0.03959342,  0.0372674 , -0.05923384,  0.05538184, -0.04241453,
         0.02608542,  0.18728942]),
 5: array([-1.11342389, -0.00871503,  0.37837607,  0.01420935, -0.41561866,
        -0.0189557 , -0.15278823,  0.20511822,  0.25252564, -0.09372385,
        -0.17184598, -1.01722261]),
 4: array([-3.65724219,  0.2772723 ,  0.54512344, -0.09647156, -0.38376353,
         0.03002259, -0.66084429, -0.13799687,  0.23252718,  0.10604855,
        -0.0098103 , -0.3593969 ]),
 8: array([-3.46373321, -0.03607751, -0.22107701,  0.00908872,  0.32213828,
        -0.11873607,  0.25297098, -0.1045496 , -0.22650015,  0.08011724,
        -0.03425756,  0.678617  ]),
 3: array([-4.29904862e+00,  1.90205949e-01,  7.27868305e-02, -4.68105108e-

In [177]:
def predict(row, B):
    row = np.insert(row, 0, 1, axis=0)  
    return sigmoid(np.dot(row,B))

In [186]:
def validate():
    global correct
    global wrong
    global tp
    global fp
    global fn
    global tn
    global validatevector
    global predicted
    correct=0
    wrong=0
    tp=0
    tn=0
    fp=0
    fn=0
    validatedata = []
    for i in range(len(testdata.columns)-1):
        col = testdata[testdata.columns[i]]
        normcol = [1.0 * (c-mean[i])/stdev[i] for c in col]
        validatedata.append(normcol)
    
    validatevector=np.array(validatedata).T
    actual = np.array(testdata[testdata.keys()[-1]])
    
    predicted=[]
    y={}
    for i in range(len(validatevector)):
        row=validatevector[i]
        maxv=0
        for label in allthetas:
            y[label] = predict(row,allthetas[label])
            if(y[label]>maxv):
                maxv=y[label]
                maxlabel=label
        predicted.append(maxlabel)
    print(predicted)

    for i in range(len(actual)):
        if actual[i]==predicted[i]:
            if(predicted[i]==1):
                tp+=1;
            else:
                tn+=1;
            correct+=1 
        else:
            if(predicted[i]==0):
                fn+=1;
            else:
                fp+=1;
            wrong+=1
    accuracy= (correct/(correct+wrong))
    return accuracy

In [187]:
accuracy= validate()

[6, 7, 6, 6, 6, 6, 5, 6, 6, 7, 7, 6, 6, 6, 6, 6, 6, 7, 5, 5, 5, 6, 6, 6, 6, 6, 5, 6, 6, 6, 7, 5, 6, 5, 5, 6, 7, 7, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 6, 6, 5, 6, 6, 6, 7, 5, 6, 6, 6, 5, 6, 5, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 7, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 5, 7, 5, 6, 5, 7, 6, 6, 6, 6, 6, 5, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6, 5, 5, 6, 5, 5, 7, 6, 6, 6, 6, 5, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 6, 7, 5, 5, 6, 6, 6, 6, 6, 5, 7, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6, 6, 5, 5, 7, 6, 6, 6, 7, 6, 6, 6, 6, 5, 6, 6, 5, 6, 7, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 7, 6, 6, 6, 5, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6, 7, 6, 6, 6, 6, 5, 6, 6, 6, 6, 6, 6, 6, 7, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 6, 6, 6, 5, 6, 5, 6, 6, 7, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 

In [188]:
print(accuracy)

0.5147392290249433


# One vs one

In [189]:
onevsonetheta = {}
length = len(uniquelabels)
for i in range(length):
    for j in range(length):
        if i<j:
            first = uniquelabels[i]
            second = uniquelabels[j]
            newdata= data[data['quality']==first]
            newdata2 =data[data['quality']==second]
            newdata.append(newdata2)
            op=newdata[newdata.columns[-1]]
            ip=newdata[newdata.columns[0:-1]]
            mean=np.mean(ip)
            stdev=np.std(ip)
            for k in range(len(ip.columns)):
                ip[ip.columns[k]]= [(1.0 * (c-mean[k]))/stdev[k] for c in ip[ip.columns[k]]]
            X = np.c_[np.ones((ip.shape[0], 1)), ip]
            Y = Y=np.array(op)
            for l in range(len(Y)):
                if Y[l]==first:
                    Y[l]=0
                else:
                    Y[l]=1
            theta = np.zeros(X.shape[1])
            theta,cost_history = gradient(theta,X,Y,0.009, 10000)
            onevsonetheta[(first, second)]=theta      

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [198]:
def validateonevsone(inputdata, testdata):
    global correct
    global wrong
    correct=0
    wrong=0
    validatedata = []
    for i in range(len(testdata.columns)-1):
        col = testdata[testdata.columns[i]]
        normcol = [1.0 * (c-mean[i])/stdev[i] for c in col]
        validatedata.append(normcol)
    
    validatetrans=np.array(validatedata).T
    actual = np.array(testdata[testdata.keys()[-1]])
    
    predicts=[]
    for i in range(len(validatetrans)):
        row=validatetrans[i]
        y=[]
        for key in onevsonetheta:
            if predict(row,onevsonetheta[key])<0.5:
                y.append(key[0])
            else:
                y.append(key[1])

        c= Counter(y)
        values, count =c.most_common()[0]
#         print(values)
        predicts.append(values)


    for i in range(len(actual)):
        if actual[i]==predicted[i]:
            correct+=1 
        else:
            wrong+=1
            
    accuracy= (correct/(correct+wrong))
    return accuracy

In [199]:
 validateonevsone(inputdata, testdata)

0.5201793721973094