In [10]:
import numpy as np
import pandas as pd
import math
from math import sqrt

In [11]:
train_data = pd.read_csv('data/crime-train.txt',delimiter='\t')
test_data = pd.read_csv('data/crime-test.txt',delimiter='\t')
train_data.head()

Unnamed: 0,ViolentCrimesPerPop,population,householdsize,agePct12t21,agePct12t29,agePct16t24,agePct65up,numbUrban,pctUrban,medIncome,...,NumStreet,PctForeignBorn,PctBornSameState,PctSameHouse85,PctSameCity85,PctSameState85,LandArea,PopDens,PctUsePubTrans,LemasPctOfficDrugUn
0,0.67,-0.45,-1.85,-1.06,0.67,0.08,-0.85,-0.34,0.68,-0.24,...,-0.23,-0.02,-0.53,-1.08,-0.13,-0.66,-0.41,-0.56,1.26,-0.39
1,0.43,-0.45,-0.27,-0.22,-0.17,-0.34,-0.58,-0.5,-1.57,-0.29,...,-0.23,-0.33,-0.58,0.03,0.22,-0.46,-0.5,-0.11,-0.62,-0.39
2,0.12,-0.14,1.87,0.55,0.04,0.02,-1.19,-0.03,0.68,1.05,...,-0.23,-0.11,-1.51,1.07,0.07,-0.01,-0.41,0.77,0.52,-0.39
3,0.03,-0.38,0.53,-0.28,-0.79,-0.64,-0.35,-0.34,0.46,0.66,...,-0.23,-0.46,0.54,0.58,-0.08,-0.61,-0.23,-0.7,-0.62,-0.39
4,0.14,-0.3,-1.12,-0.74,-0.1,-0.4,-0.3,-0.19,0.68,0.76,...,-0.23,2.1,-0.92,-0.25,0.52,-0.06,-0.5,1.71,-0.27,-0.39


In [12]:
X_train = train_data.drop('ViolentCrimesPerPop',axis=1)
y_train = train_data['ViolentCrimesPerPop']
print (X_train.shape)
print (y_train.shape)

(1595, 95)
(1595,)


In [4]:
X_test = test_data.drop('ViolentCrimesPerPop',axis=1)
y_test = test_data['ViolentCrimesPerPop']
print (X_test.shape)
print (y_test.shape)

(399, 95)
(399,)


In [13]:
'''def rmse(prediction, actual):
    N = len(prediction)
    difference = prediction - actual
    total = 0
    
    for instance in difference:
        total += instance ** 2
        
    total_error = math.sqrt(total/N)
    return total_error'''

def rmse(y_true,y_pred):
    y_true = y_true.reshape(y_true.shape[0],1)    
    y_pred = y_pred.reshape(y_pred.shape[0],1)
    
    return (math.sqrt(np.dot((y_true-y_pred).T,
                        (y_true-y_pred))/
                 y_true.shape[0]))[0][0]

In [6]:
#RIDGE REGRESSION CLOSED FORM

class RR_model:
    
    def __init__(self,lbda):
        self.lbda = lbda
        
    def fit(self,X_train,y_train):
        X = X_train.copy()
        X['bias'] = np.ones(X.shape[0])
        
        self.w = np.dot(np.dot(np.linalg.inv(np.dot(X.T,X)+lbda*np.identity(X.shape[1])),X.T),y_train)
        return self
    
    def predict(self,X_test):
        X = X_test.copy()
        X['bias'] = np.ones(X.shape[0])
        
        return np.dot(self.w,X.T)


In [7]:
def cross_val(model,X,y,k):
    total = 0
    for ii in range(k):
        X_trn = pd.concat([X.iloc[:ii*len(X)/k], X.iloc[(ii+1)*len(X)/k:]])
        y_trn = pd.concat([y.iloc[:ii*len(y)/k], y.iloc[(ii+1)*len(y)/k:]])
        
        X_tst = X.iloc[ii*len(X)/k:(ii+1)*len(X)/k].copy()
        y_tst = y.iloc[ii*len(y)/k:(ii+1)*len(y)/k].copy()
        
        m = model.fit(X_trn,y_trn)
        y_tst_hat = m.predict(X_tst)
        
        total += rmse(y_tst.values,y_tst_hat)
    return total/k

In [8]:
lbda = 400 #initial lambda to iterate down from
k = 5 #for k-fold cross validation

#computing test RSME with the lambda from above function
rr = RR_model(25)
rr = rr.fit(X_train,y_train)
y_test_hat = rr.predict(X_test)

print (rmse(y_test.values, y_test_hat))

TypeError: 'float' object is not subscriptable

In [None]:
#Linear regression using gradient descent algorithm:
class OLS_model_gd:
    
    def fit(self,X_train,y_train,alpha):
        X = X_train.copy()
        X['bias'] = np.ones(X.shape[0])
        
        np.random.seed(42)
        w = np.random.normal(0,1,size=X.shape[1])
        
        epsilon = 1
        while epsilon > 10e-7:
            w_new = w - alpha/float(X.shape[0])*np.dot((np.dot(X,w) - y_train),X)
            epsilon = np.linalg.norm(w_new - w) #euclidian distance
            w = w_new
            
        self.w = w
        return self
    
    def predict(self,X_test):
        X = X_test.copy()
        X['bias'] = np.ones(X.shape[0])
        
        return np.dot(self.w,X.T)

In [None]:
#evaluating on training data:

lr = OLS_model_gd()
lr = lr.fit(X_train,y_train,0.001)
y_train_hat = lr.predict(X_train)

print ('Training data RMSE:', rmse(y_train.values,y_train_hat))

In [None]:
#evaluating on testing data:
y_test_hat = lr.predict(X_test)

print ('Testing data RMSE:', rmse(y_test.values,y_test_hat))

In [None]:
#Ridge regressing using the gradient descent algorithm:
class RR_model_gd:
    
    def __init__(self,lbda):
        self.lbda = lbda
        
    def fit(self,X_train,y_train,alpha):
        X = X_train.copy()
        X['bias'] = np.ones(X.shape[0])
        
        np.random.seed(42)
        w = np.random.normal(0,1,size=X.shape[1])
        
        epsilon = 1
        while epsilon > 10e-7:
            w_new = w - alpha/float(X.shape[0])*(np.dot((np.dot(X,w) - y_train),X) +
                                                 lbda*np.dot(np.identity(X.shape[1]),w))
            epsilon = np.linalg.norm(w_new - w) #euclidian distance
            w = w_new
            
        self.w = w
        return self
    
    def predict(self,X_test):
        X = X_test.copy()
        X['bias'] = np.ones(X.shape[0])
        
        return np.dot(self.w,X.T)

In [None]:
#evaluate on training data

rr = RR_model_gd(lbda=25)
rr = rr.fit(X_train,y_train,0.001)
y_train_hat = rr.predict(X_train)

print ('Training data RMSE:', rmse(y_train.values,y_train_hat))

In [None]:
#evaluate on testing data
y_test_hat = rr.predict(X_test)

print ('Test RMSE:', rmse(y_test.values,y_test_hat))