In [4]:
import numpy as np
import pandas as pd
import sys

In [5]:

class LogisticRegression(object):
    def __init__(self, alpha=0.01, n_iteration=100):
        self.alpha = alpha
        self.n_iter = n_iteration

    def _scaling(self, X):
        '''
        avoid an overflow
        '''
        for i in range(len(X)):
            X[i] = (X[i] - X.mean())  / X.std()
        return X

    def prepare_X(self, filename, drop):
        data = pd.read_csv(filename, index_col='Index')

        # similaire result (see Histogram)
        del data['Care of Magical Creatures']
        del data['Arithmancy']
        # data identic with defense ag. (see scatter_plot)
        del data['Astronomy']

        X = data.iloc[:,5:]
        X = X.dropna()
        X = np.array(X)
        np.apply_along_axis(self._scaling, 0, X)
        return X

    def prepare_X_Y(self, filename):
        data = pd.read_csv(filename, sep=",", index_col="Index")
        data = data.dropna()

        # similaire result (see Histogram)
        del data['Care of Magical Creatures']
        del data['Arithmancy']
        # data identic with defense ag. (see scatter_plot)
        del data['Astronomy']

        X = np.array((data.iloc[:,5:]))
        y = np.array(data.loc[:, "Hogwarts House"])
        return X, y


    def set_theta(self, theta):
        self.theta = theta

    def _sigmoid_function(self, x):
        value = 1 / (1 + np.exp(-x))
        return value

#     def _gradient_descent(self, X, h, theta, y, m):
#         gradient_value = np.dot(X.T, (h - y)) / m
#         theta -= self.alpha * gradient_value
#         return theta

    def fit(self, X, y):
        np.apply_along_axis(self._scaling, 0, X)
        self.theta = []
        X = np.insert(X, 0, 1, axis=1)
        m = len(y)

        for i in np.unique(y):
            y_onevsall = np.where(y == i, 1, 0)
            theta = np.ones(X.shape[1])
            for _ in range(self.n_iter):
                z = X.dot(theta)
                h = y_onevsall - self._sigmoid_function(z)
                gradient = np.dot(X.T, h)
                theta += 5e-5 * gradient
            self.theta.append((theta, i))
        self.theta = np.array(self.theta, dtype=object)
        return self

    def score(self, X, y):
        score = sum(self.predict(X, self.theta) == y) / len(y)
        return score

    def predict(self, X, theta):
        return [ max((i.dot(t), c) for t, c in theta)[1] for i in np.insert(X, 0, 1, axis=1)]


In [6]:
def ft_train(filename):
    X, y = LogisticRegression().prepare_X_Y(filename)
    logi = LogisticRegression(n_iteration=30000).fit(X, y)

    print(logi.theta)
    np.save('theta', logi.theta)
    print(logi.score(X, y))

In [8]:
ft_train('../ressources/datasets/dataset_train.csv')

[[array([-3.65325352, -1.70747527, -0.37608521,  1.57448096, -0.01269977,
        1.65028905, -0.73605355, -1.2920553 , -0.3309326 ,  0.16327666,
        0.54037501])
  'Gryffindor']
 [array([-1.43432812,  1.42196187, -2.11260338, -0.24315756, -1.11003066,
       -1.49480757,  1.37341873,  1.07126569, -0.96413687,  1.21375993,
       -0.25260317])
  'Hufflepuff']
 [array([-1.19892584,  0.63849562,  1.70646061,  0.52720508,  0.92685034,
        1.47824487,  0.80656341, -0.31618547, -0.31059736,  2.75987342,
        0.4879294 ])
  'Ravenclaw']
 [array([-3.86756213, -0.2277265 ,  0.99404926, -2.145626  , -0.83370217,
       -0.76878509,  0.11511574,  1.14663433,  1.08878189,  1.06588092,
        0.03160621])
  'Slytherin']]
0.9840127897681854
