In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the unnormalized crime table from the comma-separated text file;
# every '?' field is parsed as a missing value (NaN).
crime = pd.read_csv('CommViolPredUnnormalizedData.txt', na_values='?')

In [3]:
# Preview the first rows of the raw table to confirm it parsed as expected
# (fields that were '?' in the file show up as missing values).
crime.head()

Unnamed: 0,communityname,state,countyCode,communityCode,fold,population,householdsize,racepctblack,racePctWhite,racePctAsian,...,burglaries,burglPerPop,larcenies,larcPerPop,autoTheft,autoTheftPerPop,arsons,arsonsPerPop,ViolentCrimesPerPop,nonViolPerPop
0,BerkeleyHeightstownship,NJ,39.0,5320.0,1,11980,3.1,1.37,91.78,6.5,...,14.0,114.85,138.0,1132.08,16.0,131.26,2.0,16.41,41.02,1394.59
1,Marpletownship,PA,45.0,47616.0,1,23123,2.82,0.8,95.57,3.44,...,57.0,242.37,376.0,1598.78,26.0,110.55,1.0,4.25,127.56,1955.95
2,Tigardcity,OR,,,1,29344,2.43,0.74,94.33,3.43,...,274.0,758.14,1797.0,4972.19,136.0,376.3,22.0,60.87,218.59,6167.51
3,Gloversvillecity,NY,35.0,29443.0,1,16656,2.4,1.7,97.35,0.5,...,225.0,1301.78,716.0,4142.56,47.0,271.93,,,306.64,
4,Bemidjicity,MN,7.0,5068.0,1,11245,2.76,0.53,89.16,1.17,...,91.0,728.93,1060.0,8490.87,91.0,728.93,5.0,40.05,,9988.79


In [4]:
# Positional indices of the columns to retain: 5 and 6 (population,
# householdsize), 11-25 and 32-102 (the remaining predictors), and 145
# (ViolentCrimesPerPop, the regression target).
columns_to_keep = [5, 6, *range(11, 26), *range(32, 103), 145]

# Keep only those columns, then discard every row with a missing value.
# NOTE: `crime` is rebound here, so this cell cannot be re-run on its own;
# the narrowed frame no longer has a column at position 145. Reload first.
crime = (
    crime
    .iloc[:, columns_to_keep]
    .dropna()
)

In [5]:
# Re-inspect the frame after column selection and dropna(); gaps in the row
# index (e.g. 3 followed by 5) correspond to rows removed for missing values.
crime.head()

Unnamed: 0,population,householdsize,agePct12t21,agePct12t29,agePct16t24,agePct65up,numbUrban,pctUrban,medIncome,pctWWage,...,MedOwnCostPctInc,MedOwnCostPctIncNoMtg,NumInShelters,NumStreet,PctForeignBorn,PctBornSameState,PctSameHouse85,PctSameCity85,PctSameState85,ViolentCrimesPerPop
0,11980,3.1,12.47,21.44,10.93,11.33,11980,100.0,75122,89.24,...,21.1,14.0,11,0,10.66,53.72,65.29,78.09,89.14,41.02
1,23123,2.82,11.01,21.3,10.48,17.18,23123,100.0,47917,78.99,...,20.7,12.5,0,0,8.3,77.17,71.27,90.22,96.12,127.56
2,29344,2.43,11.36,25.88,11.01,10.28,29344,100.0,35669,82.0,...,21.7,11.6,16,0,5.0,44.77,36.6,61.26,82.85,218.59
3,16656,2.4,12.55,25.2,12.19,17.57,0,0.0,20580,68.15,...,20.6,14.5,0,0,2.04,88.71,56.7,90.17,96.24,306.64
5,140494,2.45,18.09,32.89,20.04,13.26,140494,100.0,21577,75.78,...,17.3,11.7,327,4,1.49,64.35,42.29,70.61,85.66,442.95


In [6]:
# Separate the predictors from the regression target.
# The target is removed by name instead of slicing a hard-coded number of
# leading columns, so the split stays correct (no target leakage, no lost
# feature) if the set of retained columns is ever changed.
X_crime = crime.drop(columns='ViolentCrimesPerPop')
y_crime = crime['ViolentCrimesPerPop']

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.preprocessing import MinMaxScaler

In [9]:
# Hold out a test set; random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_crime,
    y_crime,
    random_state=0,
)

# Learn the min/max scaling from the training rows only, then apply that same
# mapping to both partitions so the test rows never influence the scaling.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [20]:
# Fit a Lasso model (alpha=2.0) on the scaled training data.
linlasso = Lasso(alpha=2.0, max_iter=10000)
linlasso.fit(X_train_scaled, y_train)

# Summary: intercept, full coefficient vector, number of surviving features
# and the R-squared on both partitions.
n_nonzero = np.sum(linlasso.coef_ != 0)
train_r2 = linlasso.score(X_train_scaled, y_train)
test_r2 = linlasso.score(X_test_scaled, y_test)

print('crime dataset')
print('lasso regression linear model intercept: {}'.format(linlasso.intercept_))
print('lasso regression linear model coeff:\n{}'.format(linlasso.coef_))
print('Non- zero features:{}'.format(n_nonzero))
print('R-squared score (training): {:.3f}'.format(train_r2))
print('R-squared score(test): {:.3f}'.format(test_r2))
print('Features with non-zero weight (sorted by absolute magnitude):')

# Pair each column name with its coefficient, keep only the non-zero weights,
# and list them from largest to smallest absolute value.
kept_weights = [
    (feature, weight)
    for feature, weight in zip(X_crime.columns, linlasso.coef_)
    if weight != 0
]
kept_weights.sort(key=lambda pair: abs(pair[1]), reverse=True)
for feature, weight in kept_weights:
    print('\t{},{:.3f}'.format(feature, weight))

crime dataset
lasso regression linear model intercept: 1186.612061998579
lasso regression linear model coeff:
[    0.             0.            -0.          -168.18346054
    -0.            -0.             0.           119.6938194
     0.            -0.             0.          -169.67564456
    -0.             0.            -0.             0.
     0.             0.            -0.            -0.
     0.            -0.             0.             0.
   -57.52991966    -0.            -0.             0.
   259.32889226    -0.             0.             0.
     0.            -0.         -1188.7396867     -0.
    -0.            -0.          -231.42347299     0.
  1488.36512229     0.            -0.            -0.
    -0.             0.             0.             0.
     0.             0.            -0.             0.
    20.14419415     0.             0.             0.
     0.             0.           339.04468804     0.
     0.           459.53799903    -0.             0.
   122.69221826    

In [21]:
# Lasso regression: effect of the regularization parameter (alpha)

In [24]:
# Sweep the regularization strength and report, for each alpha, how many
# coefficients remain non-zero together with the train/test R-squared.
print('Lasso regression: effect of alpha regularization\n'
      'parameter on number of features kept in final model\n')

summary_template = ('Alpha = {:.2f}\n'
                    '    Features kept:{}, r-squared training: {:.2f},'
                    '    r-squared test: {:.2f}\n')

for alpha in (0.5, 1, 2, 3, 5, 10, 20, 50):
    # `linlasso` is rebound on every pass, so once the loop finishes it refers
    # to the model fitted with the last alpha in the sequence.
    linlasso = Lasso(alpha=alpha, max_iter=10000).fit(X_train_scaled, y_train)
    r2_train = linlasso.score(X_train_scaled, y_train)
    r2_test = linlasso.score(X_test_scaled, y_test)
    n_kept = np.sum(linlasso.coef_ != 0)

    print(summary_template.format(alpha, n_kept, r2_train, r2_test))

Lasso regression: effect of alpha regularization
parameter on number of features kept in final model

Alpha = 0.50
    Features kept:35, r-squared training: 0.65,    r-squared test: 0.58

Alpha = 1.00
    Features kept:25, r-squared training: 0.64,    r-squared test: 0.60

Alpha = 2.00
    Features kept:20, r-squared training: 0.63,    r-squared test: 0.62

Alpha = 3.00
    Features kept:17, r-squared training: 0.62,    r-squared test: 0.63

Alpha = 5.00
    Features kept:12, r-squared training: 0.60,    r-squared test: 0.61

Alpha = 10.00
    Features kept:6, r-squared training: 0.57,    r-squared test: 0.58

Alpha = 20.00
    Features kept:2, r-squared training: 0.51,    r-squared test: 0.50

Alpha = 50.00
    Features kept:1, r-squared training: 0.31,    r-squared test: 0.30



In [4]:
# Lasso regression with feature normalization.
# Self-contained cell: loads the data, selects columns, splits, scales, fits
# and reports, without relying on variables from any other cell.

from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Load the comma-separated table; '?' fields are parsed as missing values.
crime = pd.read_csv('CommViolPredUnnormalizedData.txt', na_values='?')

# Keep the selected predictor columns plus the target (position 145), then
# drop every row that still contains a missing value.
columns_to_keep = [5, 6] + list(range(11, 26)) + list(range(32, 103)) + [145]
crime = crime.iloc[:, columns_to_keep].dropna()

# Split predictors from the target by column name rather than by a hard-coded
# count of leading columns, so the split stays correct if columns_to_keep changes.
X_crime = crime.drop(columns='ViolentCrimesPerPop')
y_crime = crime['ViolentCrimesPerPop']

X_train, X_test, y_train, y_test = train_test_split(X_crime, y_crime, random_state=0)

# Fit the scaler on the training rows only and reuse it for the test rows.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

linlasso = Lasso(alpha=2.0, max_iter=10000).fit(X_train_scaled, y_train)

print('lasso regression linear model intercept: {}'.format(linlasso.intercept_))
print('lasso regression linear model coeff: {}'.format(linlasso.coef_))
print('R-Squared Score (training) :{:.3f}'.format(linlasso.score(X_train_scaled, y_train)))
print('R-Squared score (test) :{:.3f}'.format(linlasso.score(X_test_scaled, y_test)))


lasso regression linear model intercept: 1186.612061998579
lasso regression linear model coeff: [    0.             0.            -0.          -168.18346054
    -0.            -0.             0.           119.6938194
     0.            -0.             0.          -169.67564456
    -0.             0.            -0.             0.
     0.             0.            -0.            -0.
     0.            -0.             0.             0.
   -57.52991966    -0.            -0.             0.
   259.32889226    -0.             0.             0.
     0.            -0.         -1188.7396867     -0.
    -0.            -0.          -231.42347299     0.
  1488.36512229     0.            -0.            -0.
    -0.             0.             0.             0.
     0.             0.            -0.             0.
    20.14419415     0.             0.             0.
     0.             0.           339.04468804     0.
     0.           459.53799903    -0.             0.
   122.69221826    -0.           