In [48]:
import numpy as np
import pandas as pd
import math 

In [49]:
# Both splits are tab-separated text files; read each one into its own DataFrame.
train = pd.read_csv('data/crime-train.txt', sep='\t')
test = pd.read_csv('data/crime-test.txt', sep='\t')

# Preview the first rows of the training split.
train.head()

Unnamed: 0,ViolentCrimesPerPop,population,householdsize,agePct12t21,agePct12t29,agePct16t24,agePct65up,numbUrban,pctUrban,medIncome,...,NumStreet,PctForeignBorn,PctBornSameState,PctSameHouse85,PctSameCity85,PctSameState85,LandArea,PopDens,PctUsePubTrans,LemasPctOfficDrugUn
0,0.67,-0.45,-1.85,-1.06,0.67,0.08,-0.85,-0.34,0.68,-0.24,...,-0.23,-0.02,-0.53,-1.08,-0.13,-0.66,-0.41,-0.56,1.26,-0.39
1,0.43,-0.45,-0.27,-0.22,-0.17,-0.34,-0.58,-0.5,-1.57,-0.29,...,-0.23,-0.33,-0.58,0.03,0.22,-0.46,-0.5,-0.11,-0.62,-0.39
2,0.12,-0.14,1.87,0.55,0.04,0.02,-1.19,-0.03,0.68,1.05,...,-0.23,-0.11,-1.51,1.07,0.07,-0.01,-0.41,0.77,0.52,-0.39
3,0.03,-0.38,0.53,-0.28,-0.79,-0.64,-0.35,-0.34,0.46,0.66,...,-0.23,-0.46,0.54,0.58,-0.08,-0.61,-0.23,-0.7,-0.62,-0.39
4,0.14,-0.3,-1.12,-0.74,-0.1,-0.4,-0.3,-0.19,0.68,0.76,...,-0.23,2.1,-0.92,-0.25,0.52,-0.06,-0.5,1.71,-0.27,-0.39


In [41]:
# Split the training frame: 'ViolentCrimesPerPop' is the regression target
# (y_train); every remaining column is a predictor (X_train).
y_train = train['ViolentCrimesPerPop']
X_train = train.drop(columns='ViolentCrimesPerPop')

# Shapes are reported before the bias column is appended below.
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)

# Cast the pandas objects to float64 NumPy arrays.
X_train = np.asarray(X_train, dtype=np.float64)
y_train = np.asarray(y_train, dtype=np.float64)

# Append a constant column of 1s as the last column of X_train so the
# fitted weight vector carries an intercept term.
ones = np.ones(X_train.shape[0])
X_train = np.column_stack((X_train, ones))

X_train shape: (1595, 95)
y_train shape: (1595,)


In [50]:
# Split the test frame the same way as the training frame: the
# 'ViolentCrimesPerPop' column is the target, the rest are predictors.
y_test = test['ViolentCrimesPerPop']
X_test = test.drop(columns='ViolentCrimesPerPop')

# Shapes are reported before the bias column is appended below.
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

# Cast the pandas objects to float64 NumPy arrays.
X_test = np.asarray(X_test, dtype=np.float64)
y_test = np.asarray(y_test, dtype=np.float64)

# Append a constant column of 1s as the last column of X_test.
ones = np.ones(X_test.shape[0])
X_test = np.column_stack((X_test, ones))

X_test shape: (399, 95)
y_test shape: (399,)


In [43]:
def RMSE(prediction, actual):
    """Return the root-mean-squared error between predictions and targets.

    Parameters
    ----------
    prediction : array-like of numbers
        Predicted values (list, tuple or NumPy array).
    actual : array-like of numbers, or scalar
        Observed values. Must have the same shape as ``prediction`` unless it
        is a scalar, in which case it is compared against every prediction.

    Returns
    -------
    float
        ``sqrt(mean((prediction - actual) ** 2))``.

    Raises
    ------
    ValueError
        If ``prediction`` is empty, or if the two inputs have different shapes.
    """
    # Convert up front so plain Python lists work for both arguments.
    predicted = np.asarray(prediction, dtype=float)
    observed = np.asarray(actual, dtype=float)

    if predicted.size == 0:
        raise ValueError("RMSE is undefined for empty input")

    # Refuse mismatched shapes: e.g. (N, 1) against (N,) would otherwise
    # broadcast to an (N, N) matrix of differences.
    if observed.ndim != 0 and predicted.shape != observed.shape:
        raise ValueError(
            "prediction and actual must have the same shape, got "
            "{} and {}".format(predicted.shape, observed.shape)
        )

    residual = predicted - observed
    return float(np.sqrt(np.mean(np.square(residual))))
    

In [51]:
#fit ordinary least squares and return a list of predicted outcomes, one per row of `samples`
def problem1(samples, X=None, y=None):
    """Fit linear regression by least squares and predict for ``samples``.

    Parameters
    ----------
    samples : array-like, shape (n_samples, n_features)
        Rows to predict for; columns must line up with those of ``X``.
    X : array-like, shape (n_train, n_features), optional
        Design matrix to fit on. Defaults to the notebook-level ``X_train``.
    y : array-like, shape (n_train,), optional
        Targets to fit on. Defaults to the notebook-level ``y_train``.

    Returns
    -------
    list
        One predicted value per row of ``samples`` (empty list for no rows).

    Raises
    ------
    numpy.linalg.LinAlgError
        If ``X.T @ X`` is singular.
    """
    if X is None:
        X = X_train
    if y is None:
        y = y_train
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)

    # Solve the normal equations (X^T X) w = X^T y directly rather than
    # forming an explicit inverse, which is slower and less accurate.
    w = np.linalg.solve(np.dot(X.T, X), np.dot(X.T, y))

    samples = np.asarray(samples, dtype=float)
    if samples.size == 0:
        return []

    # A single matrix-vector product yields every prediction at once.
    return list(np.dot(np.atleast_2d(samples), w))


In [52]:
# Linear regression: predict on both splits first, then score each set of
# predictions against its true targets with RMSE.
train_linear_prediction = problem1(X_train)
test_linear_prediction = problem1(X_test)

train_linear_RMSE = RMSE(train_linear_prediction, y_train)
test_linear_RMSE = RMSE(test_linear_prediction, y_test)

print("Training Linear Regression RMSE:", train_linear_RMSE)
print("Testing Linear Regression RMSE:", test_linear_RMSE)

Training Linear Regression RMSE: 0.127689674217622
Testing Linear Regression RMSE: 0.14583464490949097


In [53]:
#fit ridge regression and return a list of predicted outcomes, one per row of `samples`
def problem2(samples, lambda_value=100, X=None, y=None):
    """Fit ridge regression in closed form and predict for ``samples``.

    The weights solve ``(X^T X + lambda_value * I) w = X^T y``. The identity
    matrix spans every column of ``X``, so a constant bias column, if present,
    is penalized like any other feature.

    Parameters
    ----------
    samples : array-like, shape (n_samples, n_features)
        Rows to predict for; columns must line up with those of ``X``.
    lambda_value : float, optional
        Regularization strength. Defaults to 100.
    X : array-like, shape (n_train, n_features), optional
        Design matrix to fit on. Defaults to the notebook-level ``X_train``.
    y : array-like, shape (n_train,), optional
        Targets to fit on. Defaults to the notebook-level ``y_train``.

    Returns
    -------
    list
        One predicted value per row of ``samples`` (empty list for no rows).

    Raises
    ------
    numpy.linalg.LinAlgError
        If the regularized matrix is singular.
    """
    if X is None:
        X = X_train
    if y is None:
        y = y_train
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)

    n_features = X.shape[1]
    regularized_gram = np.dot(X.T, X) + lambda_value * np.identity(n_features)

    # Solve the linear system directly rather than forming an explicit
    # inverse, which is slower and less accurate.
    w = np.linalg.solve(regularized_gram, np.dot(X.T, y))

    samples = np.asarray(samples, dtype=float)
    if samples.size == 0:
        return []

    # A single matrix-vector product yields every prediction at once.
    return list(np.dot(np.atleast_2d(samples), w))

In [54]:
# Ridge regression: predict on both splits first, then score each set of
# predictions against its true targets with RMSE.
train_ridge_prediction = problem2(X_train)
test_ridge_prediction = problem2(X_test)

train_ridge_RMSE = RMSE(train_ridge_prediction, y_train)
test_ridge_RMSE = RMSE(test_ridge_prediction, y_test)

print("Training Ridge Regression RMSE: ", train_ridge_RMSE)
print("Testing Ridge Regression RMSE: ", test_ridge_RMSE)

Training Ridge Regression RMSE:  0.13134320424615784
Testing Ridge Regression RMSE:  0.14765698468526098
