In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
def prepare_X_Y():
    filename = '../ressources/datasets/dataset_train.csv'

    data = pd.read_csv(filename, sep=",", index_col="Index")
    data = data.dropna()

    # similaire result (see Histogram)
    del data['Care of Magical Creatures']
    del data['Arithmancy']
    # data identic with defense ag. (see scatter_plot)
    del data['Astronomy']

    X = np.array((data.iloc[:,5:]))
    y = np.array(data.loc[:, "Hogwarts House"])
    return X, y

In [9]:
class LogisticRegression():
    def __init__(Logreg, alpha=0.01, n_iteration=100):
        Logreg.alpha = alpha
        Logreg.n_iter = n_iteration

    def _scaling(Logreg, X):
        '''
        avoid an overflow
        '''
        for i in range(len(X)):
            X[i] = (X[i] - X.mean()) / X.std()
        return X
        
    def _sigmoid_function(Logreg, x):
        value = 1 / (1 + np.exp(-x))
        return value
    
    def _hypothesis(Logreg, theta, X):
        return 1 / (1 + np.exp(-(np.dot(theta, X.T)))) - 0.0000001
    
    def _cost_function(Logreg, h, theta, y):
        m = len(y)
        i = (1/m)
        i = (np.sum(-y.T.dot(np.log(h))))
        i = np.log(1 - h)
        i = ((1 - y).T.dot(np.log(1 - h)))
        cost = (1 / m) * (np.sum(-y.T.dot(np.log(h)) - (1 - y).T.dot(np.log(1 - h))))
        return cost
    
    def _gradient_descent(Logreg, X, h, theta, y, m):
        '''
        theta = theta - alpha * sigma(h^i - y^i)(X^ij)
        '''
        gradient_value = np.dot(X.T, (h - y)) / m
        theta -= Logreg.alpha * gradient_value
        return theta
    
    def fit(Logreg, X, y):
        print('Fitting the given dataset...')
        Logreg.theta = []
#         Logreg.cost = []
        np.apply_along_axis(Logreg._scaling, 0, X)
        X = np.insert(X, 0, 1, axis=1)
        m = len(y)
        for i in np.unique(y):
            y_onevsall = np.where(y == i, 1, 0)
            theta = np.ones(X.shape[1])
#             cost = []
            for _ in range(Logreg.n_iter):
                z = X.dot(theta)
                h = Logreg._sigmoid_function(z)
                theta = Logreg._gradient_descent(X, h, theta, y_onevsall, m)
#                 cost.append(Logreg._cost_function(h, theta, y_onevsall))
            Logreg.theta.append((theta, i))
#             Logreg.cost.append((cost, i))
        return Logreg
    
    def predict(Logreg, X):
        X = np.insert(X, 0, 1, axis=1)
        X_predicted = [max((Logreg._sigmoid_function(i.dot(theta)), c) for theta, c in Logreg.theta)[1] for i in X]
        return X_predicted
    
    def score(Logreg, X, y):
        score = sum(Logreg.predict(X) == y) / len(y)
        return score
    
    def _plot_cost(Logreg, costh):
        for cost, c in costh:
            plt.plot(range(len(cost)), cost, 'r')
            plt.title('Convergence Graph of Cost Function of type-' + str(c) + ' vs All')
            plt.xlabel('Number of Iterations')
            plt.ylabel('Cost')
            plt.show()

In [10]:
X, y = prepare_X_Y()

In [11]:
logi = LogisticRegression(alpha=0.01,n_iteration=30000).fit(X, y)

Fitting the given dataset...


In [12]:
logi.score(X, y)

0.9840127897681854

In [None]:
print(y)