### Dataset Overview
This dataset contains the FBI's 2013 counts of offenses known to law enforcement for cities in Virginia. Specifically, the dataset includes each city's population along with counts of violent crime, murder, rape, robbery, aggravated assault, property crime, burglary, larceny-theft, motor vehicle theft, and arson.

A link to the dataset can be found here: https://ucr.fbi.gov/crime-in-the-u.s/2013/crime-in-the-u.s.-2013/tables/table-8/table-8-state-cuts/table_8_offenses_known_to_law_enforcement_virginia_by_city_2013.xls

After cleaning, the dataset contains 149 observations across 12 columns.

In [92]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import ensemble
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
%matplotlib inline

### Read data in, take a look, and clean it

In [81]:
# Path of the CSV export, relative to the notebook's working directory
raw_csv_path = 'va_crime_2013.csv'
# skiprows=4 makes pandas ignore the first four lines of the file
va_crime = pd.read_csv(raw_csv_path, skiprows=4)

In [82]:
# Preview the first 10 rows of the freshly loaded table
va_crime.head(10)

Unnamed: 0,City,Population,Violent_Crime,Murder,Rape,Robbery,Aggravated_Assault,Property_Crime,Burglary,Larceny_Theft,Motor_Vehicle_Theft,Arson,Unnamed: 12
0,Abingdon,8186,10,0.0,3.0,1.0,6.0,233,20,198,15.0,4.0,
1,Alexandria,148519,258,5.0,21.0,118.0,114.0,2967,249,2427,291.0,13.0,
2,Altavista,3486,8,0.0,0.0,2.0,6.0,56,4,52,0.0,0.0,
3,Amherst,2223,2,0.0,2.0,0.0,0.0,27,6,19,2.0,0.0,
4,Appalachia,1728,12,0.0,2.0,2.0,8.0,77,25,51,1.0,0.0,
5,Ashland,7310,26,0.0,1.0,8.0,17.0,246,14,221,11.0,1.0,
6,Bedford,5894,12,0.0,4.0,3.0,5.0,237,26,199,12.0,0.0,
7,Berryville,4290,5,0.0,2.0,1.0,2.0,80,7,72,1.0,0.0,
8,Big Stone Gap,5568,17,0.0,5.0,0.0,12.0,203,21,176,6.0,2.0,
9,Blacksburg,42603,31,0.0,7.0,4.0,20.0,523,91,417,15.0,8.0,


In [83]:
#Drop empty column
# The empty trailing column is the one pandas auto-named 'Unnamed: 12'.
# Select it by name and rebind the frame (rather than dropping "the last
# column" in place) so that re-running this cell is a no-op instead of
# silently removing a real column such as Arson.
unnamed_columns = [col for col in va_crime.columns if str(col).startswith('Unnamed')]
va_crime = va_crime.drop(unnamed_columns, axis=1)

In [84]:
# Tally the NaN entries found in each column
missing_values_count = va_crime.isna().sum()
print(missing_values_count)

City                   0
Population             2
Violent_Crime          2
Murder                 2
Rape                   2
Robbery                2
Aggravated_Assault     2
Property_Crime         2
Burglary               2
Larceny_Theft          2
Motor_Vehicle_Theft    2
Arson                  2
dtype: int64


In [85]:
# Discard every row that has at least one missing value
va_crime = va_crime.dropna(axis=0, how='any')

In [86]:
#Describe the data
# Only the float64 columns are summarised here; the other count columns are
# still object dtype at this point and are therefore left out by describe().
va_crime.describe()

Unnamed: 0,Murder,Rape,Robbery,Aggravated_Assault,Motor_Vehicle_Theft,Arson
count,149.0,149.0,149.0,149.0,149.0,149.0
mean,1.295302,6.919463,20.234899,36.194631,32.496644,3.255034
std,4.78102,18.423882,72.760008,111.625561,115.290973,11.478625
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,1.0,0.0,4.0,2.0,0.0
75%,0.0,4.0,6.0,13.0,11.0,1.0
max,37.0,140.0,624.0,842.0,938.0,99.0


In [87]:
# Inspect the dtype of every column; the object columns are converted to int further down
va_crime.dtypes

City                    object
Population              object
Violent_Crime           object
Murder                 float64
Rape                   float64
Robbery                float64
Aggravated_Assault     float64
Property_Crime          object
Burglary                object
Larceny_Theft           object
Motor_Vehicle_Theft    float64
Arson                  float64
dtype: object

In [88]:
#Define function to strip comma when there is one
def remove_comma_convert_int(x):
    """Convert a count such as '8,186' or '233' to an int.

    Parameters
    ----------
    x : str, int or float
        A value read from the CSV. Strings may contain commas as thousands
        separators; values that are already numeric are accepted as well, so
        re-applying the function to a converted column does not fail.

    Returns
    -------
    int
        The numeric value with any thousands separators removed.

    Raises
    ------
    ValueError
        If the value cannot be interpreted as an integer (for example a
        non-numeric string or NaN).
    """
    if isinstance(x, str):
        # replace() is a no-op when there is no comma, so no separate check is needed
        return int(x.replace(',', ''))
    # Already numeric (int / float / numpy scalar): nothing to strip
    return int(x)

In [89]:
#Apply remove_comma_convert_int function to all appropriate columns

# Columns that were read in as strings (object dtype)
comma_formatted_columns = ['Population', 'Violent_Crime', 'Property_Crime', 'Burglary', 'Larceny_Theft']
for column_name in comma_formatted_columns:
    va_crime[column_name] = va_crime[column_name].apply(remove_comma_convert_int)

In [90]:
#Convert float columns to int

# Columns that pandas parsed as float64
float_count_columns = ['Murder', 'Rape', 'Robbery', 'Aggravated_Assault', 'Motor_Vehicle_Theft', 'Arson']
for column_name in float_count_columns:
    va_crime[column_name] = va_crime[column_name].astype(int)

### Engineer additional features

In [None]:
#Population_Squared

In [None]:
#Theft - Sum or Multiplication of Robbery, Larceny_Theft, Motor_Vehicle_Theft

In [None]:
#Log of Population

In [None]:
#Establish outcome variable (convert to binary) and independent variables

### Define our model equation


### Regular Logistic Regression Model

In [None]:
# Declare a "regular" logistic regression classifier. scikit-learn always
# applies a penalty by default, so regularization is effectively switched off
# by making C (the inverse of the regularization strength) very large.
lr = LogisticRegression(C=1e9)

# FIXME: `df` and the 'admit', 'gpa' and 'gre' columns are not defined anywhere
# in the visible notebook (leftover placeholders). Replace them with the
# outcome and feature columns derived from `va_crime` before running this cell.
y = df['admit']
X = df[['gpa', 'gre']]

# Fit the model.
fit = lr.fit(X, y)

# Display.
print('Coefficients')
print(fit.coef_)
print(fit.intercept_)
# In-sample predictions: the model is scored on the same rows it was fitted on
pred_y_sklearn = lr.predict(X)

print('\n Accuracy by admission status')
print(pd.crosstab(pred_y_sklearn, y))

print('\n Percentage accuracy')
print(lr.score(X, y))

In [None]:
#Cross-Validation



### Lasso Logistic Regression Model

In [None]:
# Declare a logistic regression classifier, using penalty 'l1' to indicate lasso.
# The solver is named explicitly because not every scikit-learn solver
# supports the l1 penalty; 'liblinear' does.
lr = LogisticRegression(penalty='l1', solver='liblinear')

# FIXME: `df` and the 'admit', 'gpa' and 'gre' columns are not defined anywhere
# in the visible notebook (leftover placeholders). Replace them with the
# outcome and feature columns derived from `va_crime` before running this cell.
y = df['admit']
X = df[['gpa', 'gre']]

# Fit the model.
fit = lr.fit(X, y)

# Display.
print('Coefficients')
print(fit.coef_)
print(fit.intercept_)
# In-sample predictions: the model is scored on the same rows it was fitted on
pred_y_sklearn = lr.predict(X)

print('\n Accuracy by admission status')
print(pd.crosstab(pred_y_sklearn, y))

print('\n Percentage accuracy')
print(lr.score(X, y))

In [None]:
#Cross-Validation



### Ridge Logistic Regression Model

In [None]:
# Ridge variant: logistic regression classifier with the 'l2' penalty
lr = LogisticRegression(penalty='l2')

# NOTE(review): `df`, 'admit', 'gpa' and 'gre' are not defined in the visible
# part of this notebook - confirm where they are supposed to come from.
X, y = df[['gpa', 'gre']], df['admit']

# Train on the full data set
fit = lr.fit(X, y)

# Report the fitted parameters
print('Coefficients', fit.coef_, fit.intercept_, sep='\n')

# Predictions on the same rows used for fitting, tabulated against the truth
pred_y_sklearn = lr.predict(X)
confusion_table = pd.crosstab(pred_y_sklearn, y)
print('\n Accuracy by admission status')
print(confusion_table)

# Mean accuracy on the training data
training_accuracy = lr.score(X, y)
print('\n Percentage accuracy')
print(training_accuracy)

In [None]:
#Cross-Validation



### Evaluation of All Three Models