In [42]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
from proj1_helpers import *
from helpers import *
from implementations import *
print("Importation complete")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Importation complete


## Load the training data into feature matrix, class labels, and event ids:

In [43]:
def load_data():
    """Read the training CSV and return what load_csv_data yields for it.

    Returns:
        The three values produced by load_csv_data, in order: the class
        labels, the feature matrix and the event ids.
    """
    train_csv = '../data/train.csv'
    labels, features, event_ids = load_csv_data(train_csv)
    print("training data is loaded")
    return labels, features, event_ids


y, tX, ids = load_data()

training data is loaded


## Data Analyzing

In [44]:
# As we can see here, y only takes value -1 or 1.
# The check is vectorised: one NumPy comparison over the whole label array
# instead of a Python-level loop with one assert per label.
assert np.all((y == 1) | (y == -1)), "y contains a value other than -1 or 1"
print("All value in y is equal either to 1 or -1.")


All value in y is equal either to 1 or -1.


This means that y is a binary variable. So should we modify y's domain to {0, 1} instead of {-1, 1} if we want the logistic regression methods to work?
Note that at first sight, logistic regression seems to be the best solution to fit the data since this method was designed for binary classification.
- We implemented two methods minus_one_to_zero() and zero_to_minus_one() in the helper methods section that translate y from one domain to the other.

## Data Cleaning
We have to handle:
- outliers:
A value is considered as an outlier if it does not fit in a range defined from quartiles. Outliers are replaced by the mean value of the observations.
- unassigned values (-999, 999):
We handle them the same way as outliers.
- We also standardize the data using the given standardize method. Note that this adds a column of ones in front of the data tX, whose dimensions change as we can see:


In [45]:
# NOTE(review): this overwrites tX with its cleaned version, so the cell is not
# idempotent: running it again would feed already-cleaned data to data_cleaning.
tX = data_cleaning(tX)

shape of tX before standardizing: (250000, 30)
shape of tX before standardizing: (250000, 31)
data cleaning completed


### Using PCA to get rid of features that don't give enough information

In [33]:
# PCA is currently disabled (the call is commented out), so the output recorded
# under this cell presumably comes from an earlier run - TODO confirm and clear it.
#tX = PCA(tX)

Previous number of features in tX: 31
New number of features in tX: 25
PCA completed


## Some Helper Functions


In [None]:
# Cross-validate ridge regression once per polynomial degree and collect the
# value returned by cross_validation_error for each of them.
degrees = [4, 5, 6, 7, 8, 9, 10, 11, 12]
errs = []
for degree in degrees:
    # lambda_ is held at 1e-11 and k_fold at 4, so only the degree varies.
    err = cross_validation_error(tX, y, degree, 1e-11, ridge_regression, k_fold=4)
    errs.append(err)
    print("fitting for degree",degree,": ",err,"%")

# Label the figure so it can be read on its own when the notebook is skimmed;
# plt.show() also keeps the list-of-Line2D repr out of the cell output.
plt.plot(degrees, errs)
plt.xlabel("polynomial degree")
plt.ylabel("cross_validation_error output (%)")
plt.title("Ridge regression, 4-fold cross-validation, lambda = 1e-11")
plt.show()

## Functions to implement

In [None]:
# Regularised logistic regression fitted on the first 20000 training samples.
# This cell reads tX_tr, y_tr, tX_te and y_te, which are created by the
# split_data cell of the "Reloading and splitting the data" section: run that
# cell before this one.
tX_subset = tX_tr[0:20000]
# Every other logistic-regression cell of this notebook trains on labels mapped
# to {0, 1} with minus_one_to_zero; do the same here instead of passing the raw
# {-1, 1} labels.
y_subset = minus_one_to_zero(y_tr[0:20000])
w_initial = np.zeros(tX_subset.shape[1])
w, loss = reg_logistic_regression(y_subset,tX_subset, 0, w_initial, 50, 1e-2)
# Scoring is done against the untouched {-1, 1} test labels.
print("data fitting:",error(y_te,predict_labels(w,tX_te)),"%")

## Use Polynomial Regression to find the optimal degree for least squares method

In [None]:
# Run the polynomial regression helper; it is called without any argument.
# NOTE(review): presumably provided by one of the star-imported project modules - confirm.
polynomial_regression()

### Results
Looking at the results, it seems like 3 is the optimal degree. However, we might be overfitting the data because there is no regularization step in polynomial_regression. Also the result is biased because the data is not split into training/testing subsets. Thus we'll use the Ridge regression, which uses a regularizer that depends on a parameter lambda.
We'll compute the RMSE for different lambda and degree values in order to determine the best ones.

## Use Ridge Regression to determine optimal lambda
This is a demo where we use ridge regression to deduce the loss function and the error percentage of the testing data for each (degree, lambda) pair. We iterate over different degree/lambda values to find the best ones. Recall that lambda is a coefficient penalizing the size of regression coefficients. Ridge regression introduces bias but reduces the variance of the estimate.

In [None]:

    
# Seed and split ratio handed to the ridge regression demo.
# NOTE(review): split_ratio is presumably the share of samples used for training - confirm.
seed = 1
split_ratio = 0.8
# w and loss are notebook-wide names; several later cells reassign them.
w, loss = ridge_regression_demo(tX, y, split_ratio, seed)

After running a few tests with ridge regression, it seems like we should not go above degree 10.

## Use Cross-Validation to get a more reliable error estimate

Here is a demo of the cross-validation function. We basically run cross-validation for different lambda/degree combinations to determine which one gives the smallest error, exactly as we did above with ridge_regression_demo.
The difference is that above it was a single train/test split of the data, but now we do a k-fold (here k=4) which gives us a less biased error.

In [41]:
# Candidate hyper-parameters for the cross-validation search.
# NOTE(review): degrees, k_fold and lambdas are assigned here but are not passed
# to cross_validation_demo below, which only receives tX and y - confirm whether
# the demo is meant to take them as arguments.
degrees = [5]
k_fold = 4
lambdas = np.logspace(-12, -9, 6)
cross_validation_demo(tX, y)

ValueError: all the input array dimensions except for the concatenation axis must match exactly

ridge regression: best degree seems to be 5
lambda_ between 1e-12 and 1e0
best lambda is 1.58489319246e-11

## Computing the weights with different methods

### Reloading and splitting the data

In [37]:
# The reload/clean steps are left disabled: tX is expected to already hold the
# cleaned features produced in the "Data Cleaning" section.
#y, tX, ids = load_data()
#tX = data_cleaning(tX)
# Split features and labels into a training part and a testing part; 3/4 is the
# ratio handed to split_data (presumably the training share - TODO confirm).
tX_tr, y_tr, tX_te, y_te = split_data(tX, y, 3/4)

### Least Squares

In [6]:
# Fit plain least squares on the training split, then score it on the test split.
w, loss = least_squares(y_tr, tX_tr)
# NOTE(review): the number printed as "Data fitting" comes from error(); confirm
# whether that helper returns an accuracy or an error rate.
print("Data fitting:",error(y_te,predict_labels(w,tX_te)),"%")
print("AMS:",compute_AMS(w, y_te, tX_te))

Data fitting: 77.3616 %
AMS: 698.749183579


### Least Squares with polynomial basis

In [7]:
# Expand both splits with the same degree (2) so that the weights learned on the
# training basis can be applied to the testing basis.
poly_basis_tr = build_poly(tX_tr, 2)
poly_basis_te = build_poly(tX_te, 2)
w, loss = least_squares(y_tr, poly_basis_tr)
# Score the fitted weights on the expanded test features.
print("data fitting:",error(y_te,predict_labels(w,poly_basis_te)),"%")
print("AMS:",compute_AMS(w, y_te, poly_basis_te))

data fitting: 79.58239999999999 %
AMS: 730.512848485


### Least Squares - Gradient Descent

In [13]:
# Step size and iteration budget handed to least_squares_GD.
gamma = 1e-7
# Size the starting weights from the matrix that is actually trained on (as the
# logistic-regression cell does) rather than from tX, so the shapes still agree
# if tX is modified again after the train/test split.
initial_w = np.zeros(tX_tr.shape[1]) #try changing initial w
max_iters = 20
w, loss = least_squares_GD(y_tr, tX_tr, initial_w, max_iters, gamma)
# Score the fitted weights on the test split.
print("data fitting",error(y_te,predict_labels(w,tX_te)),"%")
print("AMS:",compute_AMS(w, y_te, tX_te))

data fitting 72.52799999999999 %
AMS: 625.342987397


### Least Squares - Stochastic Gradient Descent

In [9]:
# Step size and iteration budget handed to least_squares_SGD.
gamma = 1e-7
# Size the starting weights from the matrix that is actually trained on (as the
# logistic-regression cell does) rather than from tX, so the shapes still agree
# if tX is modified again after the train/test split.
initial_w = np.zeros(tX_tr.shape[1]) #try changing initial w
max_iters = 10
w, loss = least_squares_SGD(y_tr, tX_tr, initial_w, max_iters, gamma)
# Score the fitted weights on the test split.
print("data fitting",error(y_te,predict_labels(w,tX_te)),"%")
print("AMS:",compute_AMS(w, y_te, tX_te))

data fitting 72.5264 %
AMS: 625.317583103


### Ridge Regression with polynomial basis

In [38]:
# For how degree and lambda_ were chosen, see the "Use Ridge Regression to
# determine optimal lambda" and cross-validation sections above.
degree = 5
poly_basis_tr = build_poly(tX_tr, degree)
lambda_ = 1e-11
poly_basis_te = build_poly(tX_te, degree)
w, loss = ridge_regression(y_tr, poly_basis_tr, lambda_)
# Score the fitted weights on the expanded test features.
print("data fitting:",error(y_te,predict_labels(w,poly_basis_te)),"%")
print("AMS:",compute_AMS(w, y_te, poly_basis_te))

data fitting: 79.81439999999999 %
AMS: 733.768544131


### Logistic Regression

In [35]:
# Iteration budget and step size handed to logistic_regression.
max_iter = 100
gamma = 1e-5
# Training labels translated with minus_one_to_zero (domain {-1, 1} -> {0, 1}).
y01 = minus_one_to_zero(y_tr)

# The whole training split is used here; the later logistic cells take a slice instead.
y_red = y01
x_red = tX_tr

w_initial = np.zeros(tX_tr.shape[1])
w,loss = logistic_regression(y_red, x_red, w_initial, max_iter, gamma)
# Scoring is done against the untranslated test labels y_te.
print("data fitting:",error(y_te,predict_labels(w,tX_te)),"%")
print("AMS:",compute_AMS(w, y_te, tX_te))

Current iteration=0, the loss=129965.09635472338
Current iteration=1, the loss=100863.09323185757
Current iteration=2, the loss=94521.44296147379
Current iteration=3, the loss=91895.31485633172
Current iteration=4, the loss=90614.02620289382
Current iteration=5, the loss=89949.2744407631
Current iteration=6, the loss=89566.84211172379
Current iteration=7, the loss=89329.42488536764
Current iteration=8, the loss=89172.26421085933
Current iteration=9, the loss=89063.382442311
Current iteration=10, the loss=88985.36215163303
Current iteration=11, the loss=88927.96562278115
Current iteration=12, the loss=88884.8168015312
Current iteration=13, the loss=88851.77593771191
Current iteration=14, the loss=88826.06959855967
Current iteration=15, the loss=88805.79117295034
Current iteration=16, the loss=88789.60027293592
Current iteration=17, the loss=88776.53547130805
Current iteration=18, the loss=88765.89449382323
Current iteration=19, the loss=88757.1558078994
Current iteration=20, the loss=88

### Logistic Regression with polynomial basis

In [24]:
# Degree search for logistic regression on a polynomial basis.
max_iter, gamma = 100, 1e-7
y01 = minus_one_to_zero(y_tr)

# Each model is fitted on the first 1000 training samples only.
best_fitting = 0
y_red, x_red = y01[:1000], tX_tr[:1000]
degrees = [1, 2, 3, 4, 5]
best_degree = degrees[0]
for degree in degrees:
    # Train on the expanded subset, starting from all-zero weights.
    poly_basis_tr = build_poly(x_red, degree)
    initial_w = np.zeros(poly_basis_tr.shape[1])
    w,loss = logistic_regression(y_red, poly_basis_tr, initial_w, max_iter, gamma)
    # Score on the full test split expanded with the same degree.
    poly_basis_te = build_poly(tX_te, degree)
    fitting = error(y_te,predict_labels(w,poly_basis_te))
    print("degree",degree,"-> fitting:",fitting)
    # Keep the degree with the highest score so far; ties keep the earlier degree.
    if fitting > best_fitting:
        best_fitting, best_degree = fitting, degree
print("best fitting:",best_fitting,"obtained with degree",best_degree)

Current iteration=0, the loss=693.1471805599322
Current iteration=1, the loss=693.1249363341697
The loss=693.1026952652824
degree 1 -> fitting: 73.56320000000001
Current iteration=0, the loss=693.1471805599322
Current iteration=1, the loss=693.035534374253
The loss=692.9241656477159
degree 2 -> fitting: 67.4624
Current iteration=0, the loss=693.1471805599322
Current iteration=1, the loss=691.2371115088397
Current iteration=2, the loss=690.569089351456
Current iteration=3, the loss=689.9891483731863
Current iteration=4, the loss=689.4507329775669
The loss=688.9387539970315
degree 3 -> fitting: 71.6928
Current iteration=0, the loss=693.1471805599322
Current iteration=1, the loss=697.6593250076747
Current iteration=2, the loss=692.3235602901568
Current iteration=3, the loss=688.1966916570093
Current iteration=4, the loss=684.8651450312649
Current iteration=5, the loss=682.227700078745
Current iteration=6, the loss=680.192520348591
Current iteration=7, the loss=678.4965446704783
Current it

  loss += np.log(1+np.exp(row.dot(w)))-y[index]*(row).dot(w)


OverflowError: cannot convert float infinity to integer

### Logistic Regression using Newton's method

In [40]:
# Iteration budget and step size handed to newton_logistic_regression.
max_iter = 100
gamma = 1e-4
y01 = minus_one_to_zero(y_tr)

# Training is restricted to the first 30000 samples.
# NOTE(review): the recorded run of this cell ends with a KeyboardInterrupt
# after the first iteration, so no score is available for this method.
y_red = y01[0:30000]
x_red = tX_tr[0:30000]

w,loss = newton_logistic_regression(y_red, x_red, max_iter, gamma)
print("data fitting:",error(y_te,predict_labels(w,tX_te)),"%")
print("AMS:",compute_AMS(w, y_te, tX_te))

Current iteration=0, the loss=20794.415416791326


KeyboardInterrupt: 

## Generate predictions and save output in csv format for submission:

In [None]:
DATA_TEST_PATH = '../data/test.csv' # location of the test set CSV
# The first value returned by load_csv_data is discarded for the test set.
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)
print("done")
tX_test = data_cleaning(tX_test)
# NOTE(review): PCA is applied to the test features on their own here, separately
# from the training features - confirm that both end up with the same columns,
# otherwise weights learned on tX cannot be applied to tX_test.
tX_test = PCA(tX_test)

In [None]:

# Rebuild the training features from the raw CSV before cleaning them.
# tX was already passed through data_cleaning in the "Data Cleaning" section
# (where its shape went from 30 to 31 columns); cleaning that result a second
# time would process already-standardised data. Reloading first also makes this
# cell safe to re-run.
y, tX, ids = load_data()
tX = data_cleaning(tX)
tX = PCA(tX)

In [None]:
# Fit ridge regression with a degree-5 polynomial basis on the whole training set.
# The original note here reads: "This is the optimal solution with cleaning and no PCA".
# NOTE(review): the two preceding cells apply PCA to tX and tX_test, and the
# submission file name below ends in "withPCA" - confirm which pipeline is intended.
lambda_ = 1e-11
degree = 5
#tX_test,_,_ = standardize(tX_test)
poly_basis_te = build_poly(tX_test, degree)
poly_basis_tr = build_poly(tX, degree)
w, loss = ridge_regression(y, poly_basis_tr, lambda_)
print("done")

In [None]:
# Alternative submission model: least squares with a degree-2 polynomial basis.
degree = 2
# NOTE(review): tX_test is overwritten with its standardised version, so every
# run of this cell standardises it again; the "Data Cleaning" notes say
# data_cleaning already standardises - confirm this extra call is wanted.
tX_test,_,_=standardize(tX_test)
poly_basis_te = build_poly(tX_test, degree)
poly_basis_tr = build_poly(tX, degree)
w, loss = least_squares(y, poly_basis_tr)

In [None]:
# Alternative submission model: ridge regression with a degree-2 polynomial basis.
lambda_ = 0.00138949549437
degree = 2
# NOTE(review): tX_test is overwritten with its standardised version, so every
# run of this cell standardises it again; the "Data Cleaning" notes say
# data_cleaning already standardises - confirm this extra call is wanted.
tX_test,_,_ = standardize(tX_test)
poly_basis_te = build_poly(tX_test, degree)
poly_basis_tr = build_poly(tX, degree)
w, loss = ridge_regression(y, poly_basis_tr, lambda_)
print("done")

In [None]:
# Alternative submission model: ridge regression directly on tX, without a polynomial basis.
# NOTE(review): the original label of this cell is "no data cleaning", yet tX is
# cleaned by earlier cells - confirm which tX this is meant to run on. The
# submission cell predicts from poly_basis_te, which this cell does not rebuild.
lambda_ = 0.000268269579528 
w, loss = ridge_regression(y, tX, lambda_)
#print(error(y_te,predict_labels(w,tX_te_poly)))

In [None]:
# Sanity check before predicting: show the weight shape, then the test feature shape.
print(w.shape, tX_test.shape, sep="\n")

In [None]:
OUTPUT_PATH = '../data/submissionData/RR_deg5_withPCA.csv' # file the submission is written to

#o = np.ones((tX_test.shape[0],1))

# Predict from the polynomial test basis with whichever weights w were fitted
# last, so exactly one of the model cells above must have been run before this one.
y_pred = predict_labels(w, poly_basis_te)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)
print("done")

In [None]:
# Sanity check after predicting: show the prediction shape, then the test feature shape.
print(y_pred.shape, tX_test.shape, sep="\n")