In [None]:
import numpy as np
import pandas as pd
from IPython.display import display, HTML
import matplotlib.pyplot as plt
%matplotlib inline

# Let pandas render up to 100 rows / 100 columns before it truncates output.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

# Read the case data set from the working directory and show it as a table.
data = pd.read_csv('Case1_Data.csv')

display(data)

In [None]:
# Split the data into training / prediction sets based on the Y value:
# rows with an observed Y are used for training, rows whose Y is missing
# are the ones to predict.
training = data.loc[data['Y'].notnull()]
prediction = data.loc[data['Y'].isnull()]

# Single pre-formatted string so the call prints the same text whether it is
# run as a Python 2 print statement or a Python 3 print function.
print('We have %d training observations and %d prediction observations'
      % (len(training), len(prediction)))

In [None]:
# Response vector and predictor matrix (every column except 'Y') as plain
# numpy arrays. `.values` is used instead of `as_matrix()`, which was
# deprecated in pandas 0.23 and removed in pandas 1.0.
train_y = training['Y'].values
train_X = training[[col for col in training.columns if col != 'Y']].values


In [None]:
# data preprocessing

def oneOutOfK(data, col_num, removeOriginal=True):
    '''One-out-of-K (one-hot) encode a single column of a 2-D array.

    For every distinct value found in column `col_num` (taken in sorted
    order) a 0/1 indicator column is appended to the right of `data`.

    Parameters
    ----------
    data : 2-D numpy array
        Input array; it is not modified.
    col_num : int
        Index of the column to encode. Negative indices count from the end
        of the *input* array.
    removeOriginal : bool, optional
        If True (default) the encoded column itself is removed from the
        result.

    Returns
    -------
    numpy array
        A new array holding the original columns (minus the encoded one when
        `removeOriginal` is True) followed by the indicator columns.

    Raises
    ------
    IndexError
        If `col_num` is outside the columns of `data`.

    Notes
    -----
    The distinct values are ordered with `sorted`, so they must be mutually
    comparable.
    '''
    n, p = data.shape
    if not -p <= col_num < p:
        raise IndexError('column index %d is out of bounds for %d columns'
                         % (col_num, p))
    if col_num < 0:
        # Resolve the index against the input width once. After indicator
        # columns are appended a negative index would otherwise point at an
        # indicator column instead of the column being encoded.
        col_num += p

    # Capture the source column up front so that every comparison below is
    # made against the original values, not against the growing result.
    values = data[:, col_num]
    levels = sorted(set(values))

    if levels:
        indicators = np.array(
            [[1 if value == level else 0 for level in levels]
             for value in values],
            dtype=int
        ).reshape((n, len(levels)))
        data = np.append(data, indicators, axis=1)

    if removeOriginal:
        data = np.delete(data, col_num, axis=1)
    return data
   
def missing_value_as_mean(training_data):
    '''Replace missing values (NaN) by the mean of their column, in place.

    Every column except the last one is processed: the NaN entries of a
    column are overwritten with the mean of that column's non-NaN entries.
    A column without any NaN, or consisting only of NaN (no mean can be
    computed), is left unchanged.

    Parameters
    ----------
    training_data : 2-D numpy array
        Each processed column must be convertible to float. The array is
        modified in place.

    Returns
    -------
    numpy array
        The same array object that was passed in.

    Notes
    -----
    NOTE(review): the last column is skipped by the `[:-1]` slice below;
    confirm that this is intended for every caller.
    '''
    data = training_data
    # Iterating over the transposed view yields one view per column, so the
    # masked assignment below writes straight through to `data`.
    for column in data.T[:-1, :]:
        missing = np.isnan(np.array(column, dtype=float))
        if not missing.any() or missing.all():
            # Nothing to fill, or nothing to compute a mean from.
            continue
        column[missing] = column[~missing].mean()
    return data

# One-out-of-K encode column 99, then fill the missing values with their
# column means. NOTE: `train_X` is overwritten, so re-running this cell would
# encode column 99 of the already transformed array.
train_X = missing_value_as_mean(oneOutOfK(train_X, 99))

In [None]:
import pylab 
import scipy.stats as stats

# Explore the distribution of y.

# y values in ascending order, one dot per observation.
fig, ax = plt.subplots()
ax.plot(sorted(train_y), '.')
ax.set_title('Scatterplot of y values')
ax.set_ylabel('y')
plt.show()

# Box plot of y.
fig, ax = plt.subplots()
ax.boxplot(train_y)
ax.set_title('Boxplot of y')
ax.set_ylabel('y')
plt.show()

# Probability plot of y against the normal distribution.
stats.probplot(train_y, dist="norm", plot=plt)
plt.show()

In [None]:
# correlation plots for top correlated attributes
top = 5

corr_array = []

# calculate the correlation between every predictor column and y
for col in train_X.T:
    corr_coef = np.corrcoef(col.astype(float), train_y)
    corr_array.append(corr_coef[0][1])

# Rank the columns by absolute correlation. A constant column yields a NaN
# correlation, and NaN is ordered after every number, so it would otherwise
# be selected as a "top" attribute; rank such columns as uncorrelated (0).
corr_strength = np.nan_to_num(np.abs(corr_array))

# select top correlation indices (not ordered among themselves)
corr_ind = np.argpartition(corr_strength, -top)[-top:]

# select top columns correlated with y
pairs_data = train_X[:, corr_ind]

[n] = train_y.shape

# add y as the last column; cast to float so plain numeric data is plotted
pairs_data = np.append(pairs_data, train_y.reshape((n, 1)), axis=1).astype(float)

[_, size] = pairs_data.shape

f, axarr = plt.subplots(size, size, figsize=(10,10))

# column header with correlation value to y (or y)
headers = [str(round(corr_array[ind], 2)) for ind in corr_ind] + ['y']

# plot: the subplot in row i / column j shows variable j on the x axis and
# variable i on the y axis, matching the labels on the bottom row (x) and
# the left column (y)
for i in range(size):
    for j in range(size):

        axarr[i, j].plot(pairs_data[:, j], pairs_data[:, i], '.')
        if i == size - 1:
            axarr[i, j].set_xlabel(headers[j], fontsize=10)
        else:
            axarr[i, j].set_xticks([])
        if j == 0:
            axarr[i, j].set_ylabel(headers[i], fontsize=10)
        else:
            axarr[i, j].set_yticks([])

In [None]:
# basic OLS

import scipy.linalg as lng

[n, p] = np.shape(train_X)

off = np.ones(n)
M = np.c_[off, train_X.astype('float')] # Include offset / intercept

# Least-squares solve of M * beta = y. Besides the coefficients, lstsq also
# returns its own residues, the rank of M and the singular values of M.
beta, res, rnk, s = lng.lstsq(M, train_y.astype('float'))

# Fitted values on the training data
yhat = np.matmul(M, beta)

# Squared residual of every training observation (this replaces the
# residues returned by lstsq above)
res = (train_y - yhat) ** 2

rss = np.sum(res)
mse = np.mean(res)
tss = np.sum((train_y - np.mean(train_y))** 2)
# R^2 in percent; computed for reference, it is not printed by this cell
r2 = (1 - rss / tss) * 100

# Single pre-formatted string so the call prints the same text whether it is
# run as a Python 2 print statement or a Python 3 print function.
print('For basic OLS we get MSE = %s' % mse)

