#### Cargo las librerías necesarias 

In [20]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression

#### Clase data para cargas el .csv 

In [30]:
class Data(object):

    def __init__(self, path):
        self.dataset = self.build_dataset(path)

    @staticmethod
    def build_dataset(path):
        
        structure = [('x1', float),('x2', float),('x3', float),('x4', float),
                     ('x5', float),('x6', float),('y', float)]
        
        with open(path, encoding="utf8") as data_csv:
            data_gen = ( ( float(line.split(';')[0]), float(line.split(';')[1]), float(line.split(';')[2]), 
                           float(line.split(';')[3]), float(line.split(';')[4]), float(line.split(';')[5]), 
                           float(line.split(';')[6]) ) for i, line in enumerate(data_csv) )
            data = np.fromiter(data_gen, structure)

        return data

    def split(self, porc):
        X = np.column_stack((self.dataset['x1'],self.dataset['x2'],self.dataset['x3'],
                             self.dataset['x4'],self.dataset['x5'],self.dataset['x6']))
        y = self.dataset['y']
        
        idx = np.random.permutation(X.shape[0])
        
        X_train = X[0:int(porc * X.shape[0])]
        y_train = y[0:int(porc * X.shape[0])]
        X_test  = X[int(porc * X.shape[0]):X.shape[0]]
        y_test  = y[int(porc * X.shape[0]):X.shape[0]]

        return X_train, X_test, y_train, y_test

class Imp_NaN(object):

    def __init__(self, X):
        self.X = X
        
    def nan_mice(self):
        imp = IterativeImputer(estimator = LinearRegression(), missing_values = np.nan, max_iter = 10, random_state = 0)
        return imp.fit_transform(self.X)
    
    def nan_mean(self):
        imp = SimpleImputer(missing_values = np.nan, strategy = 'mean')
        return imp.fit_transform(self.X)

class pca_a_mano(object):
    
    def __init__(self, X, n_comp = 3):
        self.X = X
        self.components = n_comp
    
    def pca(self):
        #d = self.X.T.shape[0]
        X_centered = self.X - np.mean(self.X, axis = 0)
        S = np.cov(X_centered.T)
        eigvalue, eigvector = np.linalg.eig(S)
        idx = eigvalue.argsort()[::-1]
        eigvalue = eigvalue[idx]
        eigvector = eigvector[:,idx]
        z = X_centered.dot(eigvector[:, :self.components])
        #return eigvector[:, :self.components]
        return z

In [142]:
class BaseModel(object):

    def __init__(self):
        self.model = None

    def fit(self, X, Y):
        return NotImplemented

    def predict(self, X):
        return NotImplemented

class RegLineal(BaseModel):

    def fit(self, X, y):
        #X_expanded = np.vstack((X, np.ones(len(X)))).T
        X_expanded = np.hstack((X, np.ones((len(X),1))))
        W = np.linalg.inv(X_expanded.T.dot(X_expanded)).dot(X_expanded.T).dot(y)
        self.model = W

    def predict(self, X):
        #X_expanded = np.vstack((X, np.ones(len(X)))).T
        X_expanded = np.hstack((X, np.ones((len(X),1))))
        return X_expanded.dot(self.model)

In [59]:
class Metric(object):
    def __call__(self, target, prediction):
        return NotImplemented


class MSE(Metric):
    def __init__(self):
        Metric.__init__(self)

    def __call__(self, target, prediction):
        n = target.size
        return np.sum((target - prediction) ** 2) / n

#### Cargo el archivo "clase3v2.csv"

In [4]:
path = 'clase3v2.csv'
dataset = Data(path)

#### PCA

Separo el dataset en train y test (80% y 20%, respectivamente). Imputo los NaN con dos técnicas, MICE y mean. Aplico PCA. 

In [6]:
X_train, X_test, y_train, y_test = dataset.split(0.8)

In [7]:
X_train.shape
X_train_mice=Imp_NaN(X_train).nan_mice()
X_train_mice.shape

(80, 6)

In [8]:
X_train_mean=Imp_NaN(X_train).nan_mean()
X_train_mean.shape

(80, 6)

In [38]:
pca_mice = pca_a_mano(X_train_mice).pca()
pca_mean = pca_a_mano(X_train_mean).pca()

print ("PCA mice {}".format(pca_mice.shape))
print ("PCA mean {}".format(pca_mean.shape))

PCA mice (80, 3)
PCA mean (80, 3)


In [55]:
#Chequeo contra Scikit learn (por ejemplo, para el X_train imputado por media)

X_train_mean_centered = X_train_mean - np.mean(X_train_mean, axis = 0)

pca = PCA(n_components = 3)
pca.fit(X_train_mean_centered)
pca_mean_skl=pca.transform(X_train_mean_centered)


np.testing.assert_allclose(pca_mean, pca_mean_skl)

#### Entreno con regresión lineal

In [160]:
reg_lineal = RegLineal()

#Predicción para mice
reg_lineal.fit(X_train_mice, y_train)
mice_predict = reg_lineal.predict(X_test)
#Predicción para mice
reg_lineal.fit(X_train_mean, y_train)
mean_predict = reg_lineal.predict(X_test)

In [161]:
#metrica = Metric(y_test, mice_predict)
#metrica = MSE()

In [162]:
mse_mice = np.sum((y_test - mice_predict) ** 2) / y_test.size
mse_mice

116.73927033714362

In [163]:
mse_mean = np.sum((y_test - mean_predict) ** 2) / y_test.size
mse_mean

112.90701331922223