# CS 584 Machine Learning


# Homework Project: Crime Prediction


---

#### Importing all libraries

In [1]:
import pandas as pd
import numpy as np
import heapq
import sys
from sklearn import metrics
from sklearn.model_selection import cross_val_score, KFold
from sklearn import model_selection
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import Imputer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

####  Adds a highCrime column to the input data set and prints the percentage of highCrime and low crime.

In [2]:
def AddHighCrimeFieldAndDumpStatistics(file_name):
    """Print the share of high-crime and low-crime rows in a crime CSV.

    A row counts as high crime when its ViolentCrimesPerPop value is at
    least 0.1. The derived 'highCrime' column is added to the in-memory
    frame only; nothing is written back to disk and nothing is returned.

    Args:
        file_name: Path of the CSV file to load with pandas.

    Raises:
        KeyError: If the file has no 'ViolentCrimesPerPop' column.
    """
    # Only the read itself is guarded, so that a genuine data problem (for
    # example a missing column) is not misreported as an unreadable file.
    try:
        crime_data = pd.read_csv(file_name)
    except (OSError, ValueError):
        # OSError covers missing/unreadable files; pandas parser and
        # empty-data errors derive from ValueError.
        print("unable to read ", file_name)
        exit()

    # highCrime is 'true' when ViolentCrimesPerPop is at least 0.1.
    crime_data['highCrime'] = np.where(crime_data['ViolentCrimesPerPop'] >= 0.1, 'true', 'false')
    high_crime_count = np.sum(crime_data['highCrime'] == 'true')
    low_crime_count = np.sum(crime_data['highCrime'] == 'false')
    total_count = high_crime_count + low_crime_count

    high_crime_percentage = (high_crime_count / total_count) * 100
    low_crime_percentage = (low_crime_count / total_count) * 100

    print("HighCrime positive % is ", high_crime_percentage)
    print("HighCrime negative % is ", low_crime_percentage)
    

#### This class provides base functionality for processing and classifying crime data
##### The classifiers are passed in by derived classes to the relevant methods of this class.
##### The functionality provided includes
##### 1.Running classification and dumping stats on the whole data set.
##### 2.Running KFold validation on the data set.
##### 3.cross_val_score() function to use the python libraries to score the classifier.
##### 4.get_top_features() which provides a list of top features.

In [3]:
class CrimePredictorBase:
    """Shared loading, preprocessing and evaluation logic for crime predictors.

    Derived classes create an estimator and pass it to the methods here.
    After construction the instance exposes:

    * input_crime_data  -- feature DataFrame (identifier and target columns removed)
    * actual_crime      -- the ViolentCrimesPerPop column (regression target)
    * actual_crime_class -- array of 'true'/'false' labels (rate >= 0.1 is 'true')
    * high_crime_input / low_crime_input   -- feature rows of each class
    * high_crime_output / low_crime_output -- crime rates of each class
    * classification    -- True for classifiers, False for regressors
    """

    def __init__(self, file_name, clean):
        """Load and preprocess |file_name|.

        |clean| is True when the CSV has no missing values; when False, '?'
        markers are mean-imputed during preprocessing.
        """
        self.file_name = file_name
        self.clean = clean
        self.read_crime_data()
        self.process_crime_data()
        self.classification = True

    def read_crime_data(self):
        """Read the CSV into |input_crime_data|; print a message and exit on failure."""
        try:
            self.input_crime_data = pd.read_csv(self.file_name)
        except (OSError, ValueError):
            # OSError: missing/unreadable file. ValueError: pandas parser and
            # empty-data errors derive from it.
            print("unable to read ", self.file_name)
            exit()
        print("Read: ", len(self.input_crime_data), " records")

    def process_crime_data(self):
        """Split the raw frame into features, targets and per-class subsets."""
        frame = self.input_crime_data

        # communityname, fold and state are identifiers, not predictors.
        # Additionally the GaussianNB model does not support strings.
        for column_name in ('communityname', 'fold', 'state'):
            del frame[column_name]

        crime_rate = frame['ViolentCrimesPerPop']
        # Plain boolean arrays so the masks can be applied positionally even
        # after imputation rebuilds the DataFrame.
        is_high_crime = (crime_rate >= 0.1).values
        is_low_crime = (crime_rate < 0.1).values

        self.actual_crime_class = np.where(is_high_crime, 'true', 'false')
        self.actual_crime = crime_rate
        self.high_crime_output = crime_rate[is_high_crime]
        self.low_crime_output = crime_rate[is_low_crime]

        del frame['ViolentCrimesPerPop']

        if not self.clean:
            # The imputer only understands NaN as a missing-value marker, so
            # turn every '?' into NaN before imputing.
            frame = frame.replace('?', np.nan)
            imputer = Imputer(strategy="mean", axis=0)
            imputed_values = imputer.fit_transform(frame)
            # fit_transform returns a bare array; rebuild the DataFrame so the
            # column names are retained.
            frame = pd.DataFrame(imputed_values, columns=frame.columns)

        self.input_crime_data = frame
        # The per-class subsets are taken from the final (possibly imputed)
        # features so they never contain '?' placeholders. They are copies, so
        # later changes to them do not touch |input_crime_data|.
        self.high_crime_input = frame.loc[is_high_crime]
        self.low_crime_input = frame.loc[is_low_crime]

    def _encoded_targets(self):
        """Return integer-encoded class labels, or raw crime rates for regression."""
        if self.classification:
            return LabelEncoder().fit_transform(self.actual_crime_class)
        return self.actual_crime

    def _select_rows(self, row_positions):
        """Return the feature rows at the given integer positions."""
        features = self.input_crime_data
        if isinstance(features, pd.DataFrame):
            return features.iloc[row_positions]
        # After set_poly() the features are a plain array.
        return features[row_positions]

    def get_crime_rate_statistics(self):
        """Return (high count, low count, high %, low %) from |actual_crime_class|."""
        high_crime_count = (self.actual_crime_class == 'true').sum()
        low_crime_count = (self.actual_crime_class == 'false').sum()
        total_count = high_crime_count + low_crime_count

        high_crime_percentage = (high_crime_count / total_count) * 100
        low_crime_percentage = (low_crime_count / total_count) * 100

        return high_crime_count, low_crime_count, high_crime_percentage, low_crime_percentage

    def dump_crime_statistics(self):
        """Print the high/low crime counts and percentages."""
        high_crime_count, low_crime_count, high_crime_percentage, low_crime_percentage = self.get_crime_rate_statistics()

        # Every record must fall into exactly one of the two classes.
        assert(high_crime_count + low_crime_count == len(self.actual_crime))

        print("High crime count:", high_crime_count)
        print("Low crime count:", low_crime_count)

        print("High crime percentage:", high_crime_percentage)
        print("Low crime percentage:", low_crime_percentage)

    def run_classifier_on_whole_dataset(self, classifier):
        """Fit |classifier| on all data, print training statistics, return predictions."""
        encoded_output = self._encoded_targets()

        classifier.fit(self.input_crime_data, encoded_output)
        predicted_crime = classifier.predict(self.input_crime_data)
        self.dump_statistics(encoded_output, predicted_crime)
        return predicted_crime

    def run_classifier_with_train_test_split(self, classifier, size):
        """Fit on a train split and print statistics for a test split of fraction |size|."""
        encoded_output = self._encoded_targets()

        X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
            self.input_crime_data, encoded_output, test_size=size, random_state=0)
        classifier.fit(X_train, Y_train)
        predicted = classifier.predict(X_test)
        self.dump_statistics(Y_test, predicted)

    def run_classifier_with_cross_validation(self, classifier, folds):
        """Run shuffled K-fold validation by hand and print per-fold statistics.

        Returns a list with one array of predictions per fold, each aligned
        with that fold's test rows.
        """
        print("Running classifier with cross validation for ", folds, " folds")

        # A plain array lets both label arrays and Series be indexed positionally.
        targets = np.asarray(self._encoded_targets())
        splitter = KFold(n_splits=folds, shuffle=True, random_state=None)

        fold_predictions = []
        for train_index, test_index in splitter.split(self.input_crime_data):
            classifier.fit(self._select_rows(train_index), targets[train_index])
            predicted_crime = classifier.predict(self._select_rows(test_index))
            # Score each fold against that fold's own (encoded) targets.
            self.dump_statistics(targets[test_index], predicted_crime)
            fold_predictions.append(predicted_crime)
        return fold_predictions

    def cross_val_score(self, classifier, folds):
        """Print and return mean cross-validated scores from scikit-learn.

        Returns (accuracy, precision, recall) for classification, or the
        absolute mean squared error for regression.
        """
        print("Cross Validation scores for ", folds, "folds")
        if self.classification:
            actual_crime = np.where(self.actual_crime_class == "true", 1, 0)

            def mean_score(metric_name):
                return np.mean(model_selection.cross_val_score(
                    classifier, self.input_crime_data, actual_crime, cv=folds, scoring=metric_name))

            precision = mean_score('precision')
            accuracy = mean_score('accuracy')
            recall = mean_score('recall')

            print("Accuracy: ", accuracy)
            print("Precision: ", precision)
            print("Recall: ", recall)
            return accuracy, precision, recall

        mean_square_error = np.mean(model_selection.cross_val_score(
            classifier, self.input_crime_data, self.actual_crime, cv=folds, scoring='neg_mean_squared_error'))
        # The scorer reports the negated MSE; report the magnitude.
        print("MSE is ", abs(mean_square_error))
        return abs(mean_square_error)

    def get_top_features(self, feature_count, feature_scores, feature_ranker):
        """Return the names of the |feature_count| best-ranked features.

        |feature_scores| holds one score per column of |input_crime_data|.
        |feature_ranker| is called as feature_ranker(score, name) and must
        return an object that supports '<' (the best feature compares
        smallest) and has a 'feature_name' attribute. If fewer features exist
        than requested, all of them are returned.
        """
        coefs_with_feature_names = sorted(zip(feature_scores, self.input_crime_data.dtypes.index))

        predictive_feature_heap = []
        for coefficient, feature_name in coefs_with_feature_names:
            heapq.heappush(predictive_feature_heap, feature_ranker(coefficient, feature_name))

        # Pop a fixed number of times; iterating the heap while popping from
        # it would stop after roughly half of the entries.
        wanted = max(0, min(feature_count, len(predictive_feature_heap)))
        return [heapq.heappop(predictive_feature_heap).feature_name for _ in range(wanted)]

    def dump_statistics(self, actual_crime, predicted_crime):
        """Print MSE (regression) or confusion matrix, accuracy, precision and recall."""
        if not self.classification:
            print("MSE is", metrics.mean_squared_error(actual_crime, predicted_crime))
            return
        cm = confusion_matrix(actual_crime, predicted_crime)
        print("Confusion matrix", cm)
        accuracy = accuracy_score(actual_crime, predicted_crime)
        print("Accuracy", accuracy)
        prec = precision_score(actual_crime, predicted_crime)
        print("Precision", prec)
        rec = recall_score(actual_crime, predicted_crime)
        print("Recall", rec)

    def set_poly(self, use_poly, poly_degree):
        """Optionally replace the features with their polynomial expansion."""
        self.use_poly = use_poly
        self.degree = poly_degree
        if (self.use_poly):
            self.poly = PolynomialFeatures(degree=poly_degree)
            # NOTE(review): the expansion is stored as returned by
            # fit_transform, not as a DataFrame, so get_top_features() (which
            # reads the column names) presumably cannot be used afterwards.
            self.input_crime_data = self.poly.fit_transform(self.input_crime_data)


We use a max heap to get the top 10 most predictive features of a classifier.
The heap key is the absolute value of the feature weight, so the feature with the largest absolute weight is popped first.

In [4]:
class MaxHeapFeatures(object):
    """Heap entry that makes heapq behave as a max heap on |weight|.

    heapq always pops the smallest element, so '<' is inverted here: the
    entry with the larger absolute weight compares as smaller and is popped
    first. The feature name is available as the 'feature_name' attribute.
    """

    def __init__(self, feature_weight, feature_name):
        self.absolute_feature_weight = abs(feature_weight)
        self.feature_name = feature_name

    def __lt__(self, other):
        try:
            other_weight = other.absolute_feature_weight
        except AttributeError:
            # Not a comparable heap entry; let Python try the reflected operation.
            return NotImplemented
        # Deliberately inverted to obtain max-heap ordering from heapq.
        return self.absolute_feature_weight > other_weight

    def __eq__(self, other):
        try:
            other_weight = other.absolute_feature_weight
        except AttributeError:
            return NotImplemented
        return self.absolute_feature_weight == other_weight

    def __str__(self):
        return str(self.absolute_feature_weight)

---

## 1. Decision Tree

Provides DecisionTree classification functionality.                    

In [5]:
class DecisionTreeCrimePredictor(CrimePredictorBase):
    """Predicts highCrime with an entropy-based DecisionTreeClassifier."""

    def __init__(self, file_name, clean):
        CrimePredictorBase.__init__(self, file_name, clean)
        # random_state is fixed so repeated runs build the same tree.
        self.classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)

    def cross_val_score(self, folds):
        """Print and return the (accuracy, precision, recall) cross-validation means."""
        print("DecisionTree crossvalidation score for ", folds, " folds is")
        # Hand the scores back to the caller instead of discarding them.
        return CrimePredictorBase.cross_val_score(self, self.classifier, folds)

    def run_classifier_on_whole_dataset(self):
        """Train and evaluate on the full data set; returns the predictions."""
        print("Running DecisionTreeClassifier on whole data set")
        return CrimePredictorBase.run_classifier_on_whole_dataset(self, self.classifier)

    def run_classifier_with_train_test_split(self, size):
        """Train on a split and evaluate on a held-out fraction |size|."""
        print("Running DecisionTreeClassifier with train test split: ", size)
        return CrimePredictorBase.run_classifier_with_train_test_split(self, self.classifier, size)

    def dump_top_features(self, feature_count):
        """Fit the tree on all data and print its |feature_count| most important features.

        Features are ranked by the fitted tree's feature_importances_.
        """
        self.classifier.fit(self.input_crime_data, self.actual_crime_class)
        top_feature_list = CrimePredictorBase.get_top_features(
            self, feature_count, self.classifier.feature_importances_, MaxHeapFeatures)
        print("\n\nTop feature list for DecisionTreeClassifier is")

        for feature in top_feature_list:
            print(feature)


#### Question 1 a. Create a new field “highCrime” which is true if the crime rate per capita (ViolentCrimesPerPop) is greater than 0.1, and false otherwise. What are the percentage of positive and negative instances in the dataset?


In [6]:
# Question 1a: report the highCrime class balance of the clean data set.
print("\n ****** High crime statistics test ******\n")
AddHighCrimeFieldAndDumpStatistics("communities-crime-clean.csv")




 ****** High crime statistics test ******

HighCrime positive % is  65.9307576518
HighCrime negative % is  34.0692423482


### b. DecisionTreeClassifier to learn a decision tree to predict highCrime on the entire dataset

In [7]:
print("\n****** DecisionTreeClassifier tests ******\n")

# The second argument (clean=True) means the CSV has no missing values, so no imputation runs.
decision_tree_crime_predictor = DecisionTreeCrimePredictor("communities-crime-clean.csv", True)





****** DecisionTreeClassifier tests ******

Read:  1993  records


#### Question 1b i. What are the training accuracy, precision, and recall for this tree? 


In [8]:
# Train and evaluate on the same (entire) data set, i.e. training metrics.
decision_tree_crime_predictor.run_classifier_on_whole_dataset()

Running DecisionTreeClassifier on whole data set
Confusion matrix [[ 679    0]
 [   0 1314]]
Accuracy 1.0
Precision 1.0
Recall 1.0


array([0, 1, 1, ..., 1, 1, 0])

#### Decision Tree Classification on Train-Test split with 25% for test set

In [9]:
# Hold out 25% of the records as the test set.
decision_tree_crime_predictor.run_classifier_with_train_test_split(0.25)

Running DecisionTreeClassifier with train test split:  0.25
Confusion matrix [[115  65]
 [ 46 273]]
Accuracy 0.77755511022
Precision 0.807692307692
Recall 0.855799373041


#### Top 10 features

In [10]:
# Print the 10 features with the highest importance in the fitted tree.
decision_tree_crime_predictor.dump_top_features(10)



Top feature list for DecisionTreeClassifier is
PctKids2Par
racePctWhite
racePctHisp
pctWInvInc
HousVacant
pctWRetire
householdsize
agePct65up
HispPerCap
AsianPerCap


#### Question 1b ii. What are the main features used for classification?. Can you explain why they make sense or not.
Answer. The top 10 features used for classification are as below in order of importance.
1. PctKids2Par
2. racePctWhite
3. racePctHisp
4. pctWInvInc
5. HousVacant
6. pctWRetire
7. householdsize
8. agePct65up
9. HispPerCap
10. AsianPerCap

They make sense due to the following reasons.
1. Crime rate is generally high in poorer neighborhoods. Here education is generally low,
   unemployment is high, etc. Sadly these neighborhoods are mostly black neighborhoods, people of color, etc. Hence it makes sense that racePctHisp, HispPerCap, PctKids2Par are in the top 10 features.

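   PctKids2Par
   This is the percentage of kids living in two-parent family housing. Communities with fewer two-parent households tend to have fewer resources and less supervision per child, which is associated with higher crime.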
   
   racePctHisp,HispPerCap
   Hispanic neighborhoods associate with poorer neighborhoods with people of color
   Sadly associated with high crime.

2. White neighborhoods are generally affluent neighborhoods. Hence low crime. Thus makes sense
   that racePctWhite is in the top 10 features.

3. AsianPerCap
   Asians are generally educated immigrants. While there are always exceptions, in the general
   case it makes sense that neighborhoods with more asians are low in crime.


### c. Cross validation 

#### Question 1c i. What are the 10-fold cross-validation accuracy, precision, and recall?

In [11]:
# Question 1c i: 10-fold cross-validated accuracy, precision and recall.
decision_tree_crime_predictor.cross_val_score(10)


DecisionTree crossvalidation score for  10  folds is
Cross Validation scores for  10 folds
Accuracy:  0.754604360185
Precision:  0.817083652408
Recall:  0.812618551932


### Question 1c ii: Why are they different from the results in the previous test?
Answer: The first test which uses the same dataset for training and testing reports results as Accuracy = 1.0, Precision = 1.0, Recall = 1.0. The 100% success results are expected because  we are training and testing on the same data, which is cheating. The cross validation tests randomly split the dataset into training and test sets over a number of folds. As a result it generally makes sense that the results you get from cross validation are a better predictor of the reliability of the classifier. Makes sense that the cross validation test accuracy/precision/recall are lower than training and testing on the same dataset.



---

## 2. Linear Classification

Provides GaussianNB classification functionality.

In [12]:
class GaussianNBCrimePredictor(CrimePredictorBase):
    """Predicts highCrime with a Gaussian Naive Bayes classifier."""

    def __init__(self, file_name, clean):
        CrimePredictorBase.__init__(self, file_name, clean)
        self.classifier = GaussianNB()

    def run_classifier_on_whole_dataset(self):
        """Train and evaluate on the full data set; returns the predictions."""
        print("Running GaussianNB on whole data set")
        return CrimePredictorBase.run_classifier_on_whole_dataset(self, self.classifier)

    def run_classifier_with_cross_validation(self, folds):
        """Run the base class's manual K-fold validation with this classifier."""
        return CrimePredictorBase.run_classifier_with_cross_validation(self, self.classifier, folds)

    def cross_val_score(self, folds):
        """Print and return the (accuracy, precision, recall) cross-validation means."""
        print("GaussianNB crossvalidation score for ", folds, " folds is")
        # Hand the scores back to the caller instead of discarding them.
        return CrimePredictorBase.cross_val_score(self, self.classifier, folds)

    class MaxHeapFeatures(object):
        """Heap entry ordering features by descending difference of class means.

        heapq pops the smallest element, so '<' is inverted to make the
        feature with the largest difference come out first. The feature name
        is available as the 'feature_name' attribute.
        """

        def __init__(self, absolute_mean_difference, feature_name):
            self.absolute_mean_difference = absolute_mean_difference
            self.feature_name = feature_name

        def __lt__(self, other):
            # Deliberately inverted to obtain max-heap ordering from heapq.
            return self.absolute_mean_difference > other.absolute_mean_difference

        def __eq__(self, other):
            return self.absolute_mean_difference == other.absolute_mean_difference

        def __str__(self):
            return str(self.absolute_mean_difference)

    def dump_top_features(self, feature_count):
        """Print the |feature_count| features whose class means differ the most.

        The ranking key is the absolute difference between the feature's mean
        over the high-crime rows and its mean over the low-crime rows.
        NOTE(review): the difference is not divided by the class standard
        deviations, i.e. it is not normalized -- confirm this is intended.
        """
        print("\n\nTop feature list for GaussianNB is")

        predictive_feature_heap = []
        for feature_index in self.input_crime_data.dtypes.index:
            high_values = self.high_crime_input[feature_index]
            low_values = self.low_crime_input[feature_index]
            mean_positive_crime_for_feature = np.sum(high_values) / len(high_values)
            mean_negative_crime_for_feature = np.sum(low_values) / len(low_values)
            diff = abs(mean_positive_crime_for_feature - mean_negative_crime_for_feature)
            heapq.heappush(predictive_feature_heap, self.MaxHeapFeatures(diff, feature_index))

        # Pop a fixed number of times; iterating the heap while popping from
        # it would stop after roughly half of the entries.
        wanted = max(0, min(feature_count, len(predictive_feature_heap)))
        for _ in range(wanted):
            print(heapq.heappop(predictive_feature_heap).feature_name)

In [13]:
print("\n****** GaussianNB tests ******\n")

# clean=True: the CSV has no missing values, so no imputation runs.
gaussian_crime_predictor = GaussianNBCrimePredictor("communities-crime-clean.csv", True)
# Train and evaluate on the same (entire) data set, i.e. training metrics.
gaussian_crime_predictor.run_classifier_on_whole_dataset()



****** GaussianNB tests ******

Read:  1993  records
Running GaussianNB on whole data set
Confusion matrix [[619  60]
 [394 920]]
Accuracy 0.772202709483
Precision 0.938775510204
Recall 0.700152207002


array([0, 1, 1, ..., 0, 0, 0])

#### Question 2a i. What is the 10-fold cross-validation accuracy, precision, and recall for this method?


In [14]:
# Question 2a i: 10-fold cross-validated accuracy, precision and recall.
gaussian_crime_predictor.cross_val_score(10)

GaussianNB crossvalidation score for  10  folds is
Cross Validation scores for  10 folds
Accuracy:  0.761584386579
Precision:  0.925010462289
Recall:  0.699178811011


#### 10 most predictive features

In [15]:
# Print the 10 features whose means differ most between the two classes.
gaussian_crime_predictor.dump_top_features(10)



Top feature list for GaussianNB is
PctKids2Par
racePctWhite
PctFam2Par
PctYoungKids2Par
PctIlleg
PctHousNoPhone
TotalPctDiv
pctWPubAsst
racepctblack
FemalePctDiv


#### Question 2a ii. What are the 10 most predictive features? This can be measured by the normalized absolute difference of means for the feature between the two classes:
Answer: The 10 most predictive features for Gaussian Naive Bayes are as below:
1. PctKids2Par
2. racePctWhite
3. PctFam2Par
4. PctYoungKids2Par
5. PctIlleg
6. PctHousNoPhone
7. TotalPctDiv
8. pctWPubAsst
9. racepctblack
10. FemalePctDiv

Measuring this as the difference between the means of a feature for high-crime and low-crime communities makes sense, as it measures how strongly the feature separates the two classes. For example, the top 2 features are PctKids2Par and racePctWhite. We generally know that white neighborhoods are richer, hence they are generally low crime, so the difference between the percentage of white people in high-crime and low-crime communities makes a lot of sense: if the percentage of white people is higher, crime tends to be low, and vice versa. We can argue along similar lines for PctKids2Par (a higher share of kids living in two-parent households is associated with lower crime, and vice versa).



#### Question 2a iii: How do these results compare with Decision trees above?
The top 2 features, PctKids2Par and racePctWhite, are common to Decision trees and Gaussian Naive Bayes.
However Gaussian Naive Bayes has these features in the top 6 

PctFam2Par
Neighborhoods where more kids have 2 parents could classify as low crime. Because 2 parents
could mean more income to the family. Hence kids are better cared for.

PctYoungKids2Par
This is the percentage of young kids (4 and under) living in two-parent households. It makes sense that if this percentage is higher, then crime is lower.

PctIlleg
This is the percentage of kids born to parents who never married. It is associated with single-parent households and poorer neighborhoods, hence it makes sense that a higher value goes with more crime.

PctHousNoPhone
No phone equals poorer families. Makes sense that more crime here.

All in all Gaussian Naive Bayes appears to have features which better predict crime at least for this dataset
in the top 10.



---

### b. LinearSVC to learn a linear Support Vector Machine model to predict highCrime.


Provides SVM classification functionality.    

In [16]:
class SVMCrimePredictor(CrimePredictorBase):
    """Support vector machine predictor for highCrime or the raw crime rate.

    With classification=True the model is LinearSVC for the 'linear' kernel
    and SVC(kernel=...) otherwise; with classification=False it is an SVR
    with the given kernel.
    """

    def __init__(self, file_name, svm_kernel, classification, clean):
        CrimePredictorBase.__init__(self, file_name, clean)
        self.kernel = svm_kernel
        self.classification = classification
        if self.classification:
            if svm_kernel == 'linear':
                self.classifier = svm.LinearSVC()
            else:
                self.classifier = svm.SVC(kernel=svm_kernel)
        else:
            self.classifier = svm.SVR(kernel=svm_kernel)

    def run_classifier_on_whole_dataset(self):
        """Train and evaluate on the full data set; returns the predictions."""
        if self.kernel == "linear":
            print("Running LinearSVC on whole data set")
        else:
            print("Running ", self.kernel, "on whole data set")
        return CrimePredictorBase.run_classifier_on_whole_dataset(self, self.classifier)

    def run_classifier_with_cross_validation(self, folds):
        """Run the base class's manual K-fold validation with this model."""
        return CrimePredictorBase.run_classifier_with_cross_validation(self, self.classifier, folds)

    def cross_val_score(self, folds):
        """Print and return the cross-validation result computed by the base class."""
        if self.kernel == "linear":
            print("LinearSVC crossvalidation score for ", folds, " folds is")
        else:
            print("SVM kernel ", self.kernel, " crossvalidation score for ", folds, " folds is")
        # Hand the scores back to the caller instead of discarding them.
        return CrimePredictorBase.cross_val_score(self, self.classifier, folds)

    class MaxHeapFeatures(object):
        """Heap entry ordering features by descending absolute feature weight.

        heapq pops the smallest element, so '<' is inverted to make the
        feature with the largest absolute weight come out first. The feature
        name is available as the 'feature_name' attribute.
        """

        def __init__(self, feature_weight, feature_name):
            self.absolute_feature_weight = abs(feature_weight)
            self.feature_name = feature_name

        def __lt__(self, other):
            # Deliberately inverted to obtain max-heap ordering from heapq.
            return self.absolute_feature_weight > other.absolute_feature_weight

        def __eq__(self, other):
            return self.absolute_feature_weight == other.absolute_feature_weight

        def __str__(self):
            return str(self.absolute_feature_weight)

    def dump_top_features(self, feature_count):
        """Print the |feature_count| features with the largest absolute weights.

        Only the 'linear' kernel is handled; for any other kernel this method
        does nothing.
        """
        if self.kernel != "linear":
            return

        self.classifier.fit(self.input_crime_data, self.actual_crime_class)
        # coef_[0] is the weight vector of the single (binary) decision function.
        top_feature_list = CrimePredictorBase.get_top_features(
            self, feature_count, self.classifier.coef_[0], self.MaxHeapFeatures)
        print("\n\nTop feature list for kernel ", self.kernel, " is")

        for feature in top_feature_list:
            print(feature)



In [17]:
print("\n****** LinearSVC tests ******\n")

# Arguments: kernel "linear" (selects LinearSVC), classification=True, clean=True.
linear_svc_crime_predictor = SVMCrimePredictor("communities-crime-clean.csv", "linear", True, True)
linear_svc_crime_predictor.dump_crime_statistics()
# Train and evaluate on the same (entire) data set, i.e. training metrics.
linear_svc_crime_predictor.run_classifier_on_whole_dataset()



****** LinearSVC tests ******

Read:  1993  records
High crime count: 1314
Low crime count: 679
High crime percentage: 65.9307576518
Low crime percentage: 34.0692423482
Running LinearSVC on whole data set
Confusion matrix [[ 539  140]
 [ 151 1163]]
Accuracy 0.853988961365
Precision 0.892555640829
Recall 0.885083713851


array([1, 1, 1, ..., 0, 1, 1])

#### Question 2b i. What is the 10-fold cross-validation accuracy, precision, and recall for this method?


In [18]:
# Question 2b i: 10-fold cross-validated accuracy, precision and recall.
linear_svc_crime_predictor.cross_val_score(10)

LinearSVC crossvalidation score for  10  folds is
Cross Validation scores for  10 folds
Accuracy:  0.796212603421
Precision:  0.854739828064
Recall:  0.84141221374


#### Question 2b ii.What are the 10 most predictive features? This can be measured by the absolute feature weights in the model. Why do these make sense (or not)?


In [19]:

# Print the 10 features with the largest absolute LinearSVC weights.
linear_svc_crime_predictor.dump_top_features(10)




Top feature list for kernel  linear  is
pctWInvInc
PctKids2Par
RentHighQ
PersPerOccupHous
MalePctDivorce
racePctWhite
agePct65up
HousVacant
racePctHisp
MedRent


Answer: The top 10 predictive features for LinearSVC are as below:
1. pctWInvInc
2. PctKids2Par
3. RentHighQ
4. PersPerOccupHous
5. MalePctDivorce
6. racePctWhite
7. agePct65up
8. HousVacant
9. racePctHisp
10. MedRent

A linear support vector machine attempts to find a linear boundary between the two classes. The decision boundary (hyperplane)
which separates the two classes is determined by the values of the weights (coefficients) associated with the features.
The larger the absolute value of a weight, the better a predictor the feature is. This method of measuring feature importance
by the feature weight (coefficient) makes sense, provided the features are on comparable scales.



#### Question  2b iii. How do these results compare with your results from decision trees, above?
Answer: pctWInvInc, PctKids2Par, racePctWhite, racePctHisp, agePct65up are common in both
classifiers. The ordering is different though. For e.g. PctKids2Par is the top feature in
Decision tree and is the second most important feature in LinearSVC and so on.


---

## 3. Regression


#### a. LinearRegression to learn a linear model directly predicting the crime rate per capita (ViolentCrimesPerPop).


Provides LinearRegression prediction.   

In [20]:
class LinearRegressionCrimePredictor(CrimePredictorBase):
    """Crime-rate predictor backed by scikit-learn's ``LinearRegression``.

    Data handling and scoring are inherited from ``CrimePredictorBase``; this
    class only supplies the estimator and the console banners.
    """

    def __init__(self, file_name, use_poly, degree, clean):
        """Set up the base predictor and attach a ``LinearRegression`` estimator.

        ``file_name`` and ``clean`` are handed to the base-class constructor;
        ``use_poly`` and ``degree`` are handed to ``set_poly``.
        """
        super().__init__(file_name, clean)
        super().set_poly(use_poly, degree)
        # Regression task rather than classification (presumably consulted by
        # the base class when choosing which metrics to report).
        self.classification = False
        self.classifier = LinearRegression()

    def cross_val_score(self, folds):
        """Print a banner, then delegate ``folds``-fold cross-validation to the base class."""
        print("LinearRegression crossvalidation score for ", folds, " folds is")
        super().cross_val_score(self.classifier, folds)

    def run_classifier_on_whole_dataset(self):
        """Print a banner and return the result of the base-class whole-data-set run."""
        print("Running LinearRegresion on whole data set")
        return super().run_classifier_on_whole_dataset(self.classifier)

    def _fit_and_list_top_features(self, inputs, outputs, feature_count, heading):
        """Fit the estimator on one subset, rank its coefficients and print the names.

        ``heading`` is printed after the ranking has been computed and before
        the feature names, one name per line.
        """
        self.classifier.fit(inputs, outputs)
        ranked = super().get_top_features(
            feature_count,
            self.classifier.coef_,
            SVMCrimePredictor.MaxHeapFeatures,
        )
        print(heading)

        for name in ranked:
            print(name)

    def dump_top_features(self, feature_count):
        """Print the ``feature_count`` top features for the high- and low-crime subsets.

        The estimator is refitted twice: first on the high-crime inputs and
        outputs, then on the low-crime ones, so it is left fitted on the
        low-crime subset when this method returns.
        """
        self._fit_and_list_top_features(
            self.high_crime_input,
            self.high_crime_output,
            feature_count,
            "\n\nTop feature list for LinearRegression high crime rate is",
        )
        self._fit_and_list_top_features(
            self.low_crime_input,
            self.low_crime_output,
            feature_count,
            "\n\nTop feature list for LinearRegression low crime rate is",
        )


In [21]:
# Banner for the recorded output, then build the plain (non-polynomial)
# linear regression predictor.
print("\n****** LinearRegression tests ******\n")

linear_regression_crime_predictor = LinearRegressionCrimePredictor(
    file_name="communities-crime-clean.csv",
    use_poly=False,
    degree=0,
    clean=True,
)




****** LinearRegression tests ******

Read:  1993  records


#### Question 3a i.Using 10-fold cross-validation, what is the estimated mean-squared-error (MSE) of the model?


In [22]:
# 10-fold cross-validation of the plain linear model (reported as MSE).
linear_regression_crime_predictor.cross_val_score(folds=10)

LinearRegression crossvalidation score for  10  folds is
Cross Validation scores for  10 folds
MSE is  0.0200939693044


#### Question 3a ii.What is the MSE on the training set (train on all the data then test on it all)?


In [23]:
# Train on the whole data set and evaluate on that same data (training-set MSE).
# The returned array (presumably the predictions) is echoed by the notebook.
linear_regression_crime_predictor.run_classifier_on_whole_dataset()



Running LinearRegresion on whole data set
MSE is 0.0165167748803


array([ 0.20829089,  0.3241512 ,  0.5673497 , ...,  0.08907789,
        0.15564742,  0.17669898])

#### Question 3a iii.What features are most predictive of a high crime rate? A low crime rate?


In [24]:
# List the 10 top-ranked features for the high-crime and the low-crime subsets.
linear_regression_crime_predictor.dump_top_features(feature_count=10)



Top feature list for LinearRegression high crime rate is
PctPersOwnOccup
PctHousOwnOcc
MedRent
PersPerOccupHous
RentLowQ
PctRecImmig8
PersPerRentOccHous
OwnOccLowQuart
population
TotalPctDiv


Top feature list for LinearRegression low crime rate is
population
TotalPctDiv
numbUrban
PersPerFam
FemalePctDiv
MalePctDivorce
PctLargHouseFam
racePctWhite
NumImmig
PctLargHouseOccup


### b. Ridge regression

Provides RidgeRegression prediction.  

In [25]:
class RidgeRegressionCrimePredictor(CrimePredictorBase):
    """Crime-rate predictor backed by scikit-learn's ``RidgeCV``.

    Data handling and scoring are inherited from ``CrimePredictorBase``; this
    class supplies the estimator, the alpha lookup and the console banners.
    """

    def __init__(self, file_name, alphas_array, clean):
        """Set up the base predictor and attach a ``RidgeCV`` estimator.

        ``file_name`` and ``clean`` are handed to the base-class constructor;
        ``alphas_array`` holds the candidate regularisation strengths given
        to ``RidgeCV``.
        """
        super().__init__(file_name, clean)
        # Regression task rather than classification (presumably consulted by
        # the base class when choosing which metrics to report).
        self.classification = False
        self.alphas = alphas_array
        self.classifier = RidgeCV(alphas=alphas_array)

    def cross_val_score(self, folds):
        """Print a banner, then delegate ``folds``-fold cross-validation to the base class."""
        print("RidgeCV crossvalidation score for ", folds, " folds is")
        super().cross_val_score(self.classifier, folds)

    def get_best_alpha(self):
        """Fit on the full input data and print the alpha that ``RidgeCV`` selected."""
        estimator = self.classifier
        estimator.fit(self.input_crime_data, self.actual_crime)
        print("Best Alpha is ", estimator.alpha_)

    def run_classifier_on_whole_dataset(self):
        """Print a banner and return the result of the base-class whole-data-set run."""
        print("Running RidgeCV on whole data set")
        return super().run_classifier_on_whole_dataset(self.classifier)

    def _fit_and_list_top_features(self, inputs, outputs, feature_count, heading):
        """Fit the estimator on one subset, rank its coefficients and print the names.

        ``heading`` is printed after the ranking has been computed and before
        the feature names, one name per line.
        """
        self.classifier.fit(inputs, outputs)
        ranked = super().get_top_features(
            feature_count,
            self.classifier.coef_,
            SVMCrimePredictor.MaxHeapFeatures,
        )
        print(heading)

        for name in ranked:
            print(name)

    def dump_top_features(self, feature_count):
        """Print the ``feature_count`` top features for the high- and low-crime subsets.

        The estimator is refitted twice: first on the high-crime inputs and
        outputs, then on the low-crime ones, so it is left fitted on the
        low-crime subset when this method returns.
        """
        self._fit_and_list_top_features(
            self.high_crime_input,
            self.high_crime_output,
            feature_count,
            "\n\nTop feature list for RidgeCV high crime rate is",
        )
        self._fit_and_list_top_features(
            self.low_crime_input,
            self.low_crime_output,
            feature_count,
            "\n\nTop feature list for RidgeCV low crime rate is",
        )


In [26]:
# Banner for the recorded output.
print("\n****** RidgeRegression tests ******\n")

# Candidate regularisation strengths offered to RidgeCV.
alphas = np.array([10, 1, 0.1, 0.01, 0.001])
ridge_regression_crime_predictor = RidgeRegressionCrimePredictor(
    file_name="communities-crime-clean.csv",
    alphas_array=alphas,
    clean=True,
)





****** RidgeRegression tests ******

Read:  1993  records


#### Question 3b i. What is the estimated MSE of the model under 10-fold CV?

In [27]:
# 10-fold cross-validation of the RidgeCV model (reported as MSE).
ridge_regression_crime_predictor.cross_val_score(folds=10)

RidgeCV crossvalidation score for  10  folds is
Cross Validation scores for  10 folds
MSE is  0.0197950213482


#### Question 3b ii. What is the MSE on the training set (train on all the data then test on it all)?


In [28]:
# Train on the whole data set and evaluate on that same data (training-set MSE).
# The returned array (presumably the predictions) is echoed by the notebook.
ridge_regression_crime_predictor.run_classifier_on_whole_dataset()

Running RidgeCV on whole data set
MSE is 0.0167635291552


array([ 0.17213058,  0.30878933,  0.55261082, ...,  0.10295043,
        0.16031921,  0.16959889])

#### 3b iii. What is the best alpha?


In [29]:
# Refit RidgeCV on the full input data and print the alpha it selected
# from the candidate list.
ridge_regression_crime_predictor.get_best_alpha()

Best Alpha is  1.0


#### Question 3b iv: What does this say about the amount of overfitting in linear regression for this problem?
Cross validation Mean square error for Linear regression is 0.0200939693044
Mean square error on whole data set is 0.0165167748803

Cross validation Mean square error for Ridge regression is 0.0197950213482
Mean square error on whole data set is 0.0167635291552

The cross validation scores are a better predictor of how the model will
perform against real world data. At least for this dataset, the cross validation MSE for Linear regression
(0.0201) is higher than for Ridge regression (0.0198), and the gap between its training MSE and its
cross validation MSE is also larger, which indicates that there is some amount of overfitting
in the Linear regression model.



### c. Polynomial regression

In [30]:
# Banner for the recorded output, then rebuild the predictor with
# second-degree polynomial features enabled.
print("\n****** LinearRegression second order polynomial features tests ******\n")

linear_regression_crime_predictor = LinearRegressionCrimePredictor(
    file_name="communities-crime-clean.csv",
    use_poly=True,
    degree=2,
    clean=True,
)



****** LinearRegression second order polynomial features tests ******

Read:  1993  records


#### Question 3c i. What is the estimated MSE of the model under 10-fold CV?


In [31]:
# 10-fold cross-validation of the second-degree polynomial model (reported as MSE).
linear_regression_crime_predictor.cross_val_score(folds=10)

LinearRegression crossvalidation score for  10  folds is
Cross Validation scores for  10 folds
MSE is  0.129898157392


#### Question 3c ii. What is the MSE on the training set (train on all the data then test on it all)?


In [38]:
# Train the polynomial model on the whole data set and evaluate on that same
# data (training-set MSE). The returned array is echoed by the notebook.
linear_regression_crime_predictor.run_classifier_on_whole_dataset()

Running LinearRegresion on whole data set
MSE is 1.39689621363e-28


array([ 0.06,  0.14,  1.  , ...,  0.12,  0.27,  0.06])

#### Question 3c iii. Does this mean the quadratic model is better than the linear model for this problem?
Answer:
No. The cross validation mean square error for the second order polynomial model is
0.129898157392, which is much higher than the 0.0200939693044 obtained with the plain linear model.
On the whole dataset (the training set) the MSE is essentially zero: 1.39689621363e-28.
A near-zero training error combined with a much larger cross validation error means the
quadratic model is overfitting the training data. It is therefore not better than the linear
model for this problem and would perform poorly when exposed to real world data.



---

## 4. Dirty Data


In [39]:
# Banner marking the start of the full-dataset decision tree run.
print("\n****** DecisionTreeClassifier tests on full dataset ******\n")

# NOTE(review): the second positional argument looks like the `clean` flag used
# by the other predictors (False for the full, uncleaned file) - confirm
# against the DecisionTreeCrimePredictor constructor.
decision_tree_crime_predictor = DecisionTreeCrimePredictor("communities-crime-full.csv", False)
# Print the 10 top-ranked features of the tree.
decision_tree_crime_predictor.dump_top_features(10)

# Banner marking the end of this run.
print("\n****** End of DecisionTreeClassifier tests ******\n")



****** DecisionTreeClassifier tests on full dataset ******

Read:  1994  records


Top feature list for DecisionTreeClassifier is
PctKids2Par
racePctWhite
pctWInvInc
racePctHisp
PctWOFullPlumb
HousVacant
MalePctNevMarr
pctWRetire
perCapInc
AsianPerCap

****** End of DecisionTreeClassifier tests ******



#### Decision Tree Classification on whole data set

In [40]:
# Train and evaluate on the same full data set; the recorded output reports the
# confusion matrix, accuracy, precision and recall.
decision_tree_crime_predictor.run_classifier_on_whole_dataset()

Running DecisionTreeClassifier on whole data set
Confusion matrix [[ 679    0]
 [   0 1315]]
Accuracy 1.0
Precision 1.0
Recall 1.0


array([1, 1, 1, ..., 1, 1, 1])

#### Decision Tree Classification on Train-Test split with 25% for test set

In [41]:
# Evaluate with a train/test split; 0.25 is the fraction held out for testing
# (see the section heading above).
decision_tree_crime_predictor.run_classifier_with_train_test_split(0.25)

Running DecisionTreeClassifier with train test split:  0.25
Confusion matrix [[123  61]
 [ 40 275]]
Accuracy 0.797595190381
Precision 0.818452380952
Recall 0.873015873016


#### CV results for full (non-clean) dataset

In [42]:
# 10-fold cross-validation of the decision tree on the full (non-clean) data set.
decision_tree_crime_predictor.cross_val_score(10)

DecisionTree crossvalidation score for  10  folds is
Cross Validation scores for  10 folds
Accuracy:  0.775355692604
Precision:  0.83444175454
Recall:  0.823577376822


#### Question 4a. Are the CV results better or worse? What does this say about the effect of missing values?
Answer:
The cross validation results on the full dataset were better than on the clean dataset.
This indicates that the missing values had a significant effect on the performance of the classifier.
Additionally, it also means that imputing the missing values with the average
may not be desirable, because averages are generally sensitive to outliers. Perhaps
the median would be a better approach here.

---

## 5.Teams 

### Non-linear SVM with RBF kernel- clean dataset

In [43]:
# Banner for the recorded output.
print("\n****** Nonlinear SVM with RBF kernel tests ******\n")

# NOTE(review): the SVMCrimePredictor constructor is not shown in this section.
# The two trailing booleans appear to be "run as classification" and "clean
# data set" - confirm against the class definition.
svm_rbf_crime_predictor = SVMCrimePredictor("communities-crime-clean.csv", "rbf", True, True)

# Whole-data-set evaluation first, then 10-fold cross-validation.
svm_rbf_crime_predictor.run_classifier_on_whole_dataset()
svm_rbf_crime_predictor.cross_val_score(10)


****** Nonlinear SVM with RBF kernel tests ******

Read:  1993  records
Running  rbf on whole data set
Confusion matrix [[ 502  177]
 [ 160 1154]]
Accuracy 0.830908178625
Precision 0.86701728024
Recall 0.878234398782
SVM kernel  rbf  crossvalidation score for  10  folds is
Cross Validation scores for  10 folds
Accuracy:  0.81072493782
Precision:  0.854344127903
Recall:  0.866568355309


### Non-linear SVM with RBF kernel- full dataset

In [44]:
# Banner for the recorded output.
print("\n****** Nonlinear SVM with RBF kernel tests on full dataset ******\n")

# Same RBF classification setup as the clean-dataset run, but on the full file.
# NOTE(review): the final False appears to be the "clean" flag - confirm
# against the SVMCrimePredictor constructor.
svm_rbf_crime_predictor = SVMCrimePredictor("communities-crime-full.csv", "rbf", True, False)
# Whole-data-set evaluation first, then 10-fold cross-validation.
svm_rbf_crime_predictor.run_classifier_on_whole_dataset()
svm_rbf_crime_predictor.cross_val_score(10)


****** Nonlinear SVM with RBF kernel tests on full dataset ******

Read:  1994  records
Running  rbf on whole data set
Confusion matrix [[ 462  217]
 [   0 1315]]
Accuracy 0.891173520562
Precision 0.858355091384
Recall 1.0
SVM kernel  rbf  crossvalidation score for  10  folds is
Cross Validation scores for  10 folds
Accuracy:  0.69862915588
Precision:  0.800246316994
Recall:  0.72323617858


### Logistic Regression

In [45]:
class LogisticRegressionCrimePredictor(CrimePredictorBase):
    """High-crime classifier backed by scikit-learn's ``LogisticRegression``.

    Data handling and scoring are inherited from ``CrimePredictorBase``.
    """

    def __init__(self, file_name, clean):
        """Set up the base predictor and attach a default ``LogisticRegression``."""
        super().__init__(file_name, clean)
        self.classifier = LogisticRegression()

    def cross_val_score(self, folds):
        """Print a banner, then delegate ``folds``-fold cross-validation to the base class."""
        print("Logistic regression crossvalidation score for ", folds, " folds is")
        super().cross_val_score(self.classifier, folds)

    def dump_top_features(self, feature_count):
        """Fit on the full input data and print the ``feature_count`` top features.

        The class labels are compared against the string "true" and encoded
        as 1 (match) or 0 (anything else) before fitting. Features are ranked
        from the first row of the fitted coefficient matrix.
        """
        encoded_labels = np.where(self.actual_crime_class == "true", 1, 0)
        self.classifier.fit(self.input_crime_data, encoded_labels)
        ranked = super().get_top_features(
            feature_count,
            self.classifier.coef_[0],
            SVMCrimePredictor.MaxHeapFeatures,
        )
        print("\n\nTop feature list for LogisticRegression are")

        for name in ranked:
            print(name)


#### Logistic Regression for clean dataset

In [46]:
# Banner for the recorded output, then build the predictor on the clean file.
print("\n****** LogisticRegression tests ******\n")
logistic_regression = LogisticRegressionCrimePredictor(
    file_name="communities-crime-clean.csv",
    clean=True,
)

# Top 10 features first, then 10-fold cross-validation.
logistic_regression.dump_top_features(feature_count=10)

logistic_regression.cross_val_score(folds=10)


****** LogisticRegression tests ******

Read:  1993  records


Top feature list for LogisticRegression are
racepctblack
pctWInvInc
racePctWhite
racePctHisp
MalePctDivorce
PctKids2Par
PersPerRentOccHous
PctIlleg
HousVacant
agePct65up
Logistic regression crossvalidation score for  10  folds is
Cross Validation scores for  10 folds
Accuracy:  0.810742754175
Precision:  0.863707911957
Recall:  0.854354614851


#### Logistic Regression with full dataset

In [47]:

# Banner for the recorded output, then build the predictor on the full file.
print("\n****** LogisticRegression tests on full dataset ******\n")
logistic_regression = LogisticRegressionCrimePredictor(
    file_name="communities-crime-full.csv",
    clean=False,
)

# Top 10 features first, then 10-fold cross-validation.
logistic_regression.dump_top_features(feature_count=10)

logistic_regression.cross_val_score(folds=10)


****** LogisticRegression tests on full dataset ******

Read:  1994  records


Top feature list for LogisticRegression are
TotalPctDiv
FemalePctDiv
PctIlleg
MalePctDivorce
pctWPubAsst
racepctblack
PctHousNoPhone
PctKids2Par
racePctWhite
PctPopUnderPov
Logistic regression crossvalidation score for  10  folds is
Cross Validation scores for  10 folds
Accuracy:  0.73917222476
Precision:  0.7515699745
Recall:  0.940718251214


#### Question 5 2. What method gives the best results.
Answer: 
Results from the SVM rbf kernel are as below for cross validation.
Accuracy:  0.81072493782
Precision:  0.854344127903
Recall:  0.866568355309

We tried logistic regression as second option
Accuracy:  0.810742754175
Precision:  0.863707911957
Recall:  0.854354614851

The SVM rbf kernel and logistic regression classifiers both report higher
accuracy/precision/recall when compared with the other classifiers used above.

#### Question 5 3. What feature(s) seem to be most consistently predictive of high crime rates? How reliable is this conclusion?
Answer:

We got the top 10 features for Logistic regression:
racepctblack,
pctWInvInc,
racePctWhite,
racePctHisp,
MalePctDivorce,
PctKids2Par,
PersPerRentOccHous,
PctIlleg,
HousVacant,
agePct65up

Comparing them with the top features for Decision tree, Gaussian Naive Bayes, Linear SVM,
it looks like the top features consistently predictive of high crime rates are as below:
1. racepctblack
2. racePctWhite
3. racePctHisp
4. PctKids2Par
5. PctIlleg
6. MalePctDivorce

Almost all the classifiers used report these features in their top feature list. Based on
the description of these features, it is very safe to suggest that values of these features
are a very good predictor of crime rate.

---

## 6.Extra Credit

### Support Vector Machine

#### SVM regression with RBF kernel on clean dataset

In [48]:
# Banner for the recorded output.
print("\n****** SVM regression with RBF kernel tests ******\n")

# NOTE(review): the third positional argument (False here, True in the
# classification runs) appears to switch the predictor to regression; the
# recorded output reports MSE. Confirm against the SVMCrimePredictor constructor.
svm_rbf_regresion = SVMCrimePredictor("communities-crime-clean.csv", "rbf", False, True)
# 10-fold cross-validation first, then the whole-data-set run, whose returned
# array is echoed by the notebook.
svm_rbf_regresion.cross_val_score(10)
svm_rbf_regresion.run_classifier_on_whole_dataset()




****** SVM regression with RBF kernel tests ******

Read:  1993  records
SVM kernel  rbf  crossvalidation score for  10  folds is
Cross Validation scores for  10 folds
MSE is  0.0198258400241
Running  rbf on whole data set
MSE is 0.0172768123349


array([ 0.13682078,  0.2706237 ,  0.54588162, ...,  0.11856709,
        0.16033124,  0.15946845])

#### SVM regression with RBF kernel on full dataset

In [49]:
# Banner for the recorded output.
print("\n****** SVM regression with RBF kernel tests on full dataset ******\n")

# Same RBF regression setup as the clean-dataset run, but on the full file.
# NOTE(review): the final False appears to be the "clean" flag - confirm
# against the SVMCrimePredictor constructor.
svm_rbf_regresion = SVMCrimePredictor("communities-crime-full.csv", "rbf", False, False)
# 10-fold cross-validation first, then the whole-data-set run, whose returned
# array is echoed by the notebook.
svm_rbf_regresion.cross_val_score(10)
svm_rbf_regresion.run_classifier_on_whole_dataset()



****** SVM regression with RBF kernel tests on full dataset ******

Read:  1994  records
SVM kernel  rbf  crossvalidation score for  10  folds is
Cross Validation scores for  10 folds
MSE is  0.031435408819
Running  rbf on whole data set
MSE is 0.0158430959027


array([ 0.20832106,  0.30741923,  0.35682014, ...,  0.21330389,
        0.21330389,  0.48060249])

---

### K-Nearest Neighbor

In [50]:

class KNNCrimePredictor(CrimePredictorBase):
    """High-crime classifier backed by scikit-learn's ``KNeighborsClassifier``.

    Data handling and scoring are inherited from ``CrimePredictorBase``.
    """

    def __init__(self, file_name, clean):
        """Set up the base predictor and attach a 5-nearest-neighbour estimator."""
        super().__init__(file_name, clean)
        # Minkowski metric with p=2, i.e. Euclidean distance.
        self.classifier = KNeighborsClassifier(
            n_neighbors=5,
            metric='minkowski',
            p=2,
        )

    def cross_val_score(self, folds):
        """Print a banner, then delegate ``folds``-fold cross-validation to the base class."""
        print("KNN crossvalidation score for ", folds, " folds is")
        super().cross_val_score(self.classifier, folds)

    def run_classifier_on_whole_dataset(self):
        """Print a banner and return the result of the base-class whole-data-set run."""
        print("Running KNN on whole data set")
        return super().run_classifier_on_whole_dataset(self.classifier)

    def run_classifier_with_train_test_split(self, size):
        """Print a banner and return the result of the base-class train/test-split run.

        ``size`` is forwarded unchanged to the base class.
        """
        print("Running KNN with train test split: ", size)
        return super().run_classifier_with_train_test_split(self.classifier, size)
        

In [51]:
# Banner for the recorded output, then build the predictor on the clean file.
print("\n****** KNN tests ******\n")
KNN_crime_predictor = KNNCrimePredictor(
    file_name="communities-crime-clean.csv",
    clean=True,
)
# Whole-data-set evaluation first, then 10-fold cross-validation.
KNN_crime_predictor.run_classifier_on_whole_dataset()
KNN_crime_predictor.cross_val_score(folds=10)



****** KNN tests ******

Read:  1993  records
Running KNN on whole data set
Confusion matrix [[ 529  150]
 [ 110 1204]]
Accuracy 0.869543401907
Precision 0.889217134417
Recall 0.916286149163
KNN crossvalidation score for  10  folds is
Cross Validation scores for  10 folds
Accuracy:  0.756549033044
Precision:  0.809046141881
Recall:  0.828492944714


----

### Random Forest Classifier

In [52]:
class RandomForestCrimePredictor(CrimePredictorBase):
    """High-crime classifier backed by scikit-learn's ``RandomForestClassifier``.

    Data handling and scoring are inherited from ``CrimePredictorBase``.
    """

    def __init__(self, file_name, clean):
        """Set up the base predictor and attach a 10-tree entropy-based forest."""
        super().__init__(file_name, clean)
        # random_state is fixed so repeated runs build the same forest.
        self.classifier = RandomForestClassifier(
            n_estimators=10,
            criterion='entropy',
            random_state=0,
        )

    def cross_val_score(self, folds):
        """Print a banner, then delegate ``folds``-fold cross-validation to the base class."""
        print("Random Forest crossvalidation score for ", folds, " folds is")
        super().cross_val_score(self.classifier, folds)

    def run_classifier_on_whole_dataset(self):
        """Print a banner and return the result of the base-class whole-data-set run."""
        print("Running RandomForestClassifier on whole data set")
        return super().run_classifier_on_whole_dataset(self.classifier)

    def run_classifier_with_train_test_split(self, size):
        """Print a banner and return the result of the base-class train/test-split run.

        ``size`` is forwarded unchanged to the base class.
        """
        print("Running Random Forest with train test split: ", size)
        return super().run_classifier_with_train_test_split(self.classifier, size)

    def dump_top_features(self, feature_count):
        """Fit on the full input data and print the ``feature_count`` top features.

        Features are ranked from the forest's ``feature_importances_``.
        """
        self.classifier.fit(self.input_crime_data, self.actual_crime_class)
        ranked = super().get_top_features(
            feature_count,
            self.classifier.feature_importances_,
            SVMCrimePredictor.MaxHeapFeatures,
        )
        print("\n\nTop feature list for RandomForestClassifier is")

        for name in ranked:
            print(name)
               

In [53]:
# Banner for the recorded output, then build the predictor on the clean file.
print("\n****** RandomForestClassifier tests ******\n")
RFC_crime_predictor = RandomForestCrimePredictor(
    file_name="communities-crime-clean.csv",
    clean=True,
)
# Whole-data-set evaluation, 10-fold cross-validation, then the top 10 features.
RFC_crime_predictor.run_classifier_on_whole_dataset()
RFC_crime_predictor.cross_val_score(folds=10)
RFC_crime_predictor.dump_top_features(feature_count=10)



****** RandomForestClassifier tests ******

Read:  1993  records
Running RandomForestClassifier on whole data set
Confusion matrix [[ 678    1]
 [  12 1302]]
Accuracy 0.993477170095
Precision 0.999232540292
Recall 0.990867579909
Random Forest crossvalidation score for  10  folds is
Cross Validation scores for  10 folds
Accuracy:  0.798205065733
Precision:  0.861442892349
Recall:  0.831534813787


Top feature list for RandomForestClassifier is
PctFam2Par
PctKids2Par
FemalePctDiv
racePctWhite
PctYoungKids2Par
PctPopUnderPov
PctIlleg
pctWInvInc
racepctblack
NumUnderPov


### Question 6b i. What method gives the best results?
We tried SVM regression with RBF kernel, K nearest neighbor and Random forest classifier.

Cross validation MSE for SVM regression is 0.0198258400241

Cross validation scores for K Nearest neighbor
Accuracy:  0.756549033044
Precision:  0.809046141881
Recall:  0.828492944714

Cross validation scores for RandomForestClassifier:
Accuracy:  0.798205065733
Precision:  0.861442892349
Recall:  0.831534813787

The SVM regression with RBF kernel gives similar results to Ridge regression. But using a
non-linear kernel could potentially lead to overfitting. 

It looks like RandomForestClassifier gives the best results in the above list.



### Question 6b ii. What features seem to be consistently predictive of high crime rates? How reliable is this conclusion?
Answer:
The top feature list reported by RandomForestClassifier is as below:
PctFam2Par
PctKids2Par
FemalePctDiv
racePctWhite
PctYoungKids2Par
PctPopUnderPov
PctIlleg
pctWInvInc
racepctblack
NumUnderPov

Comparing these with the top features reported by Decision tree, LinearSVM, Gaussian NB, LogisticRegression, it looks like the top features consistently predictive of high crime rates are as below:
1. PctKids2Par
2. racePctWhite
3. PctIlleg
4. racepctblack

This is purely based on an intersection between the top features across all classifiers. These are common across all the classifiers we tried and hence are reliable predictors of crime rates.

