## 1. Train a classifier for each stock independently

In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing, metrics
from sklearn import cross_validation
from sklearn import linear_model
%matplotlib inline

In [10]:
def create_data_matrix(matrix,k):
    """Lay k consecutive rows of ``matrix`` side by side.

    Row j of the result is the concatenation of rows j, j+1, ..., j+k-1 of
    ``matrix``. Only the first n - k rows start a window, so the result has
    n - k rows.

    Parameters:
    -----------
    matrix: (n, p) np.array
        Input rows.
    k: integer
        Number of consecutive rows put in each output row.

    Return:
    -------
    (n - k, p * k) np.array of floats
    """
    n_rows = matrix.shape[0]
    n_cols = matrix.shape[1]
    n_windows = n_rows - k
    windows = np.zeros((n_windows, n_cols * k))
    for lag in range(k):
        # Column group `lag` holds the input rows shifted down by `lag`.
        first_col = lag * n_cols
        windows[:, first_col:first_col + n_cols] = matrix[lag:lag + n_windows, :]
    return windows



In [11]:
def create_ratio_matrix(matrix):
    """Replace raw values by the ratio between each row and the next one.

    Every value is divided by the value found in the same column of the
    following row. The original author's note says that row i is day i and
    row i+1 is day i-1, so a ratio reads "day over previous day" under that
    ordering. The ratio computed for the very first row is then discarded.

    Parameters:
    -----------
    matrix: (n, p) np.array
        Raw values, one row per observation.

    Return:
    -------
    (n - 2, p) np.array of floats
        Row i holds matrix[i + 1, :] / matrix[i + 2, :].
    """
    n_rows = matrix.shape[0]
    n_cols = matrix.shape[1]
    ratios = np.zeros((n_rows - 1, n_cols))
    # Row i over row i+1, for all rows at once.
    ratios[:, :] = matrix[:-1, :] / matrix[1:, :]
    # Drop the ratio belonging to the first row.
    return ratios[1:n_rows - 1, :]

In [12]:
def create_zero_one(y):
    """Build 0/1 labels by comparing each value with the one that follows it.

    Label i is 1 when y[i] >= y[i + 1] and 0 otherwise. Only the first
    n - 2 comparisons are kept, which is the number of rows that
    create_ratio_matrix returns for an n-row input.

    Parameters:
    -----------
    y: sequence of n numbers
        Values to compare pairwise.

    Return:
    -------
    (n - 2, ) np.array of floats containing only 0. and 1.
    """
    values = np.array(y)
    n_labels = values.shape[0] - 2
    labels = np.zeros(n_labels)
    not_lower = values[:n_labels] >= values[1:n_labels + 1]
    labels[not_lower] = 1
    return labels

In [13]:
# Cross-validation procedure, with standardization
from sklearn import preprocessing
def cross_validate_regr_with_scaling(design_matrix, labels, regressor, cv_folds):
    """ Run a cross-validation and return the out-of-fold predictions.

    For every fold a new StandardScaler is fitted on the training rows only,
    then applied to both the training and the test rows, so the test rows
    never contribute to the scaling statistics.

    Parameters:
    -----------
    design_matrix: (n_samples, n_features) np.array
        Design matrix for the experiment.
    labels: (n_samples, ) np.array
        Vector of labels.
    regressor: estimator instance; must have the following methods:
        - fit(X, y) to train the estimator on the data X, y
        - predict(X) to apply the trained estimator to the data X and
          return predicted values
        The same instance is re-fitted on every fold.
    cv_folds: iterable of (train_indices, test_indices) pairs
        Cross-validation iterator.

    Return:
    -------
    pred: (n_samples, ) np.array
        Vector of predictions (same order as labels). Positions that are
        never part of a test fold keep the value 0.
    """
    pred = np.zeros(labels.shape)
    for train_idx, test_idx in cv_folds:
        train_rows = design_matrix[train_idx, :]
        test_rows = design_matrix[test_idx, :]

        # Scaling statistics come from the training rows of this fold only.
        scaler = preprocessing.StandardScaler().fit(train_rows)

        regressor.fit(scaler.transform(train_rows), labels[train_idx])
        pred[test_idx] = regressor.predict(scaler.transform(test_rows))
    return pred

In [14]:
# Ticker symbols of the stocks studied in this notebook.
stock_names = [
    "AC", "ACA", "AI", "AIR", "ATO", "BN", "BNP", "CA", "CAP", "DG",
    "EI", "EN", "ENGI", "FP", "FR", "GLE", "KER", "LHN", "LR", "MC",
    "ML", "OR", "ORA", "SAN", "SGO", "SU", "SW", "UG", "VIE", "VIV",
]

In [15]:
# Sanity check: display how many tickers are in the list.
len(stock_names)

30

In [19]:
# Per-stock containers: the two lists keep features and labels in the order
# of stock_names, the dictionary gives access to everything by ticker.
X_clf_list=[]
y_clf_list=[]
stock_dict={}

price_dir = "/Users/serrano/Documents/cours_centrale/projet_inno/git/projets8/scripts_Python/data/historic_price/"

for stock in stock_names:
    historic_price = pd.read_csv(price_dir + stock + ".PA.csv", sep=",")

    # The labels are derived from the closing price; the features are every
    # remaining column once the date and both close columns are removed.
    y_clf = np.array(historic_price['Close'].values)
    X_clf = np.array(historic_price.drop(['Date','Close','Adj close'], axis=1).values)

    # Transform the raw values into ratio features and 0/1 labels.
    X_clf_1 = create_ratio_matrix(X_clf)
    y_new = create_zero_one(y_clf)

    # Add the stock to the dictionary; 'clf', 'accuracy' and 'best_param'
    # are empty placeholders at this point.
    stock_dict[stock] = {
        'data': X_clf_1,
        'label': y_new,
        'clf': '',
        'accuracy': '',
        'best_param': '',
    }
    X_clf_list.append(X_clf_1)
    y_clf_list.append(y_new)

# Show the labels of the last stock processed.
print(y_new)

[1. 0. 0. ... 1. 0. 1.]


In [20]:
# Inspect the 'clf' entry of one stock; it still holds the '' placeholder
# assigned when the dictionary was filled.
stock_dict['BNP']['clf']


''

In [21]:
# For each stock, select the best parameter C of the L2-regularized
# logistic regression.
from sklearn import model_selection

cvalues_list = np.logspace(-5, 10, 50)
classifier = linear_model.LogisticRegression(penalty='l2')
param_grid = {'C': cvalues_list}

# Grid search (3-fold cross-validation) to find the best parameter,
# run separately on every stock.
for stock in stock_names:
    X_clf_1 = stock_dict[stock]['data']
    y_new = stock_dict[stock]['label']

    # Hold out 30% of the samples; the search only sees the other 70%.
    Xtr, Xte, ytr, yte = model_selection.train_test_split(
        X_clf_1, y_new, test_size=0.3)

    # Standardize with statistics estimated on the training part only.
    # NOTE(review): Xte_scaled and yte are not used again in this cell.
    scaler = preprocessing.StandardScaler()
    Xtr_scaled = scaler.fit_transform(Xtr)
    Xte_scaled = scaler.transform(Xte)

    clf_logreg_l2_s_opt = model_selection.GridSearchCV(
        classifier, param_grid, cv=3)
    clf_logreg_l2_s_opt.fit(Xtr_scaled, ytr)

    # Keep in memory the best C value for this stock.
    stock_dict[stock]['best_param'] = clf_logreg_l2_s_opt.best_estimator_.C
    

In [22]:
# Dump the whole per-stock dictionary: features, labels, the still-empty
# 'clf' / 'accuracy' placeholders and the C value selected by the grid search.
print(stock_dict)

{'AC': {'data': array([[1.00044823, 1.00223314, 0.99185152, 0.81515882],
       [0.98130636, 0.97837011, 0.99013895, 1.18374539],
       [0.97637964, 0.98282156, 0.97700898, 1.36218281],
       ...,
       [1.01845018, 1.00795085, 1.00848708, 0.84524999],
       [1.00258972, 1.01318198, 1.00855973, 1.02474626],
       [0.99265516, 0.99671533, 1.01357978, 0.88673413]]), 'label': array([0., 0., 0., ..., 0., 0., 1.]), 'clf': '', 'accuracy': '', 'best_param': 0.7906043210907702}, 'ACA': {'data': array([[0.99622356, 1.        , 0.9992343 , 0.75480278],
       [0.98074074, 0.97209985, 0.99014405, 0.7602888 ],
       [0.98684211, 0.99343545, 0.97921307, 1.87082462],
       ...,
       [1.01244168, 0.99390244, 1.00316456, 0.85571415],
       [1.02388535, 1.00923077, 1.00797448, 0.64524262],
       [1.01618123, 1.02201258, 1.05378151, 0.67229941]]), 'label': array([1., 0., 0., ..., 1., 0., 1.]), 'clf': '', 'accuracy': '', 'best_param': 0.011513953993264481}, 'AI': {'data': array([[1.00810209, 1

In [23]:
from sklearn import metrics
from sklearn import model_selection

# For each stock: build the logistic regression with the C selected by the
# grid search, get out-of-fold predictions from a stratified 10-fold
# cross-validation (features standardized inside each fold), and store the
# resulting accuracy.
for stock in stock_names:
    stock_dict[stock]['clf']=linear_model.LogisticRegression(penalty='l2',C=stock_dict[stock]['best_param'])

    clf=stock_dict[stock]['clf']
    y_new=stock_dict[stock]['label']
    X_clf=stock_dict[stock]['data']

    # Use sklearn.model_selection (already used for the grid search) instead
    # of the deprecated sklearn.cross_validation module. The folds are
    # materialized as a list of (train_indices, test_indices) pairs so they
    # can be iterated more than once, like the former fold object.
    splitter = model_selection.StratifiedKFold(n_splits=10, shuffle=True)
    folds_clf = list(splitter.split(X_clf, y_new))

    y_pred=cross_validate_regr_with_scaling(X_clf,y_new,clf,folds_clf)

    # Map the predictions to integer 0/1 values (above 0.5 becomes 1).
    y_pred=np.where(y_pred>0.5,1,0)
    stock_dict[stock]['accuracy']=metrics.accuracy_score(y_new, y_pred)

In [24]:
# Print the cross-validated accuracy of each stock, one per line, in the
# order of stock_names.
for stock in stock_names:
    stock_accuracy = stock_dict[stock]['accuracy']
    print(stock_accuracy)

0.48589341692789967
0.5117554858934169
0.5321316614420063
0.4811083123425693
0.5410094637223974
0.5250783699059561
0.5054858934169278
0.48040752351097177
0.5109717868338558
0.5117554858934169
0.5297805642633229
0.49764890282131663
0.4882943143812709
0.5188087774294671
0.5308285163776493
0.4898119122257053
0.5051221434200157
0.49279538904899134
0.5
0.518025078369906
0.5023510971786834
0.5352664576802508
0.5219435736677116
0.5786561264822134
0.4842271293375394
0.5062695924764891
0.5188087774294671
0.5054858934169278
0.5376175548589341
0.5070532915360502
