# <center>Machine Learning Project - Crime Prediction</center>

<li>
Jacob Jain Kallukalam (A20360754)
</li>
<li>FNU Vidhya (A20356005)
</li>

In [363]:
import zipfile
import pydot
import numpy as np
import subprocess
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier,export_graphviz
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LinearRegression
from sklearn.linear_model.ridge import Ridge,RidgeCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
import warnings

Extracting files from the zip folder

In [364]:
def extract():
    fh = open('Crime Prediction Data.zip', 'rb')
    z = zipfile.ZipFile(fh)
    for name in z.namelist():
        z.extract(name)
    fh.close()
extract()

# <u>DECISION TREES</u>

1. a) Calculating the percentage of positive and negative instances in the clean dataset.

In [365]:
crime_clean = pd.read_csv('Crime Prediction Data/communities-crime-clean.csv')
highCrime=[]
true_cnt=0
for x in crime_clean['ViolentCrimesPerPop']:
    if x>0.1:
        highCrime.append('True')
        true_cnt+=1
    else:
        highCrime.append('False')
#Creating the new column highCrime and appending to existing document
crime_clean['highCrime']=pd.Series(highCrime)
crime_clean.to_csv('Crime Prediction Data/communities-crime-clean.csv', sep=',', encoding='utf-8')
print('Percentage of positive instances:'+str((true_cnt/len(crime_clean['ViolentCrimesPerPop']))*100))
print('Percentage of negative instances:'+str(((len(crime_clean['ViolentCrimesPerPop'])-true_cnt)/len(crime_clean['ViolentCrimesPerPop']))*100))

Percentage of positive instances:62.719518314099346
Percentage of negative instances:37.280481685900654


In [366]:
features = list(crime_clean.columns[:len(crime_clean.columns)-2])

In [367]:
idx=0
comm={}
community_name=[]
for x in crime_clean['communityname']:
    if x not in comm:
        comm[x]=idx
        idx+=1
    community_name.append(comm[x])
crime_clean['communityname']=pd.Series(community_name)

### <u>Decision Tree Classifier</u>

The minimum number of samples required to split an internal node is kept as 20 to avoid overfitting

Saving the labels, gini values, samples and values into dt.dot file.

Plotting the decision tree from the data saved in dt.dot file on the basis of gini index and information gain.

In [368]:
y = crime_clean["highCrime"]
X = crime_clean[features]
dt = DecisionTreeClassifier(min_samples_split=20, random_state=99)
dt.fit(X, y)
#Requires graphviz to print the decision tree
with open("dt.dot", 'w') as f:
    export_graphviz(dt, out_file=f,feature_names=features)


command = ["dot", "-Tpng", "dt.dot", "-o", "dt.png"]
try:
    subprocess.check_call(command)
except:
    exit("Could not run dot, ie graphviz, to produce visualization")


<img src="dt.png">


1. b) i. Calculating the training Accuracy, Precision, and Recall for the plotted decision tree.

In [369]:
ypredict=dt.predict(X)

TP=0
TN=0
FP=0
FN=0

for x in zip(ypredict,y):
    if(x[0]=='True' and x[1]=='True'):
        TP+=1
    elif x[0]=='False' and x[1]=='False':
        TN+=1
    elif x[0]=='True' and x[1]=='False':
        FP+=1
    elif x[0]=='False' and x[1]=='True':
        FN+=1

acc=(TP + TN)/(TP + TN + FP + FN)
pre=TP/(TP + FP)
rec=TP/(TP + FN)

print('accuracy:',acc)
print('precision:',pre)
print('recall:',rec)

accuracy: 0.9357752132463623
precision: 0.946656050955414
recall: 0.9512


1. b) ii. Finding the main features used for the decision tree classification.

The features with higher importance value are treated as the main features.

We are printing the top 10 features that contribute the most in decision making.

This makes sense as these are the features that contribute the maximum information gain for the splitting in the decision tree. The gini index is used to calculate the feature importance field which inturn is also a measure of information gain achieved by splitting through a particular feature at a particular instance in decision tree formation.

In [370]:
print(sorted(list(zip(features,dt.feature_importances_)),key=lambda x: -x[1])[:10])

[('PctKids2Par', 0.44176010471803651), ('racePctWhite', 0.10649545734925057), ('racePctHisp', 0.059843685126445598), ('PctEmplManu', 0.020406385475740262), ('HousVacant', 0.01755812277747543), ('communityname', 0.016744977095863738), ('HispPerCap', 0.013349381238338801), ('MedOwnCostPctInc', 0.012869430272866069), ('blackPerCap', 0.012404392394867894), ('state', 0.011674778724944698)]


### <u>Cross Val Score</u>
1. c) i. Calculating the 10-fold cross validation Accuracy, Precision and Recall for the above decision tree.

In [371]:
dt = DecisionTreeClassifier(min_samples_split=20, random_state=99)
print('accuracy:',end="")
print(np.mean(cross_val_score(dt, X, y, cv=10, scoring='accuracy')))
ybin=[]
for x in y:
    if x=='True':
        ybin.append(1)
    else:
        ybin.append(0)
print('precision:',end="")
print(np.mean(cross_val_score(dt, X, ybin, cv=10, scoring='precision')))
print('recall:',end="")
print(np.mean(cross_val_score(dt, X, ybin, cv=10, scoring='recall')))

accuracy:0.716487437186
precision:0.78942069483
recall:0.7648


1. c) ii.Why are they different from the results in the previous test?

The cross validation does not train the model on entire dataset. It partition the data as training and testing sections. The train data fits the model and test data is used to check the model properties. Since the sample count available to the model is reduced and new unseen samples need to be predicted(test data), the model accuracy, precision and recall might decrease based on the correlation between the samples.
    The previous test was conducted on the very same data which was used to train the model and thus there is no unseen data to be predicted.


## <u>Linear Classification</u>

### <u>Gausian Naive Bayes</u>
2.a)ii.Calculating the 10-fold cross validation Accuracy, Precision and Recall for the Gaussian Naive Bayes.

In [372]:
gaus_full = GaussianNB()
gaus_full.fit(X, ybin)

ypredict = gaus_full.predict(X)
gaus = GaussianNB()
print('accuracy:',end="")
print(np.mean(cross_val_score(gaus, X, ybin, cv=10, scoring='accuracy')))
print('precision:',end="")
print(np.mean(cross_val_score(gaus, X, ybin, cv=10, scoring='precision')))
print('recall:',end="")
print(np.mean(cross_val_score(gaus, X, ybin, cv=10, scoring='recall')))



accuracy:0.759600502513
precision:0.913411959617
recall:0.6872


2.a) ii. Finding the 10 most predictive features. 

The larger this different, the more predictive the feature. Why do these make sense (or not)?

The larger the difference between mean value of each class for a feature, it is easier to predict the class of the feature. If they are close the margin of choosing the class become tedious.

In [373]:
uT=np.array(gaus_full.theta_[0])
uF=np.array(gaus_full.theta_[1])
sT=np.array(gaus_full.sigma_[0])
sF=np.array(gaus_full.sigma_[1])

fpredict=abs(uT-uF)/(sT+sF)

print(sorted(list(zip(features,fpredict)),key=lambda x: -x[1])[:10])

[('PctKids2Par', 4.9465652122871377), ('FemalePctDiv', 4.6954189323204307), ('PctFam2Par', 4.5446144318138764), ('pctWInvInc', 4.3415293287236265), ('TotalPctDiv', 4.3349519037702082), ('PctTeen2Par', 3.9552407232282452), ('MalePctDivorce', 3.9060868313950698), ('PctYoungKids2Par', 3.6166737824465711), ('PctIlleg', 3.4411529842773367), ('racePctWhite', 3.4174474764677996)]


 2.a) iii.	How do these results compare with your results from decision trees, above?

The Accuracy and Precision of the samples by Gaussian Naive Bayes is higher than higher than the results from decision tree method. But the Recall value is much lower than the desicion tree classification.

Gausian Naive Bayes works better when there is less samples to train. Decision Trees are generally efficient when more data is available. So the cross validation result for Accuracy and Precision sfavors naive bayes. Also decision trees tend to overfit the data and we have not done pruning to reduce it either in the above case.

### <u>Linear Support Vector Machine</u>

2.b) i. Calculating the 10-fold cross validation Accuracy, Precision and Recall for LinearSVC

In [374]:
svc_full = LinearSVC()
svc_full.fit(X, ybin)

ypredict = svc_full.predict(X)

svc = LinearSVC()
print('accuracy:',end="")
print(np.mean(cross_val_score(svc, X, ybin, cv=10, scoring='accuracy')))
print('precision:',end="")
print(np.mean(cross_val_score(svc, X, ybin, cv=10, scoring='precision')))
print('recall:',end="")
print(np.mean(cross_val_score(svc, X, ybin, cv=10, scoring='recall')))

f_weight=svc_full.coef_[0]

accuracy:0.684738693467
precision:0.761773787969
recall:0.788


2.b) ii. Finding the 10 most predictive features. How does this make sense?

If the svm finds one feature useful for separating the data, then the hyperplane would be orthogonal to that axis. So, you could say that the absolute size of the coefficient relative to the other ones gives an indication of how important the feature was for the separation. This theory is used for the weights present in the coefficient attribute of LinearSVC library function.

In [375]:
print(sorted(list(zip(features,np.absolute(f_weight))),key=lambda x: -x[1])[:10])

[('racepctblack', 0.91635858991918573), ('racePctWhite', 0.82037816377439088), ('MedYrHousBuilt', 0.81152837672404332), ('pctWFarmSelf', 0.6303891362610563), ('AsianPerCap', 0.60067105571661028), ('PctVacMore6Mos', 0.53640676211354621), ('MedNumBR', 0.51546971915518291), ('PctWOFullPlumb', 0.51097945627214858), ('PctWorkMomYoungKids', 0.48940494191015782), ('PctSameCity85', 0.48474757015134257)]


2.b) iii.	How do these results compare with your results from decision trees, above?

The Precision and Recall of the samples by SVC method is higher than the results from decision tree method.

But the Accuracy value is much lower than the desicion tree classification.

We are using a linear kernel svm with L2 regularization which is not the highly efficient way nor the complex way of classification. If the data set is in such way that the linear plane can't split the data into various classes, the efficiency of this method is at stake. May be with a better kernel (non linear) the accuracy may be improved.

# <u>Regression</u>

## <u>Linear Regression</u>
Directly predicting the Crime rate per capita.

3.a) ii.Calculating the estimated mean squared error by Linear Regression model (training on all the sample data and then testing on all of them).

In [376]:
lm_full = LinearRegression()
lm_full.fit(X, ybin)

ypredict=lm_full.predict(X)

print("Mean Squared Error:",end="")
print(np.mean((ypredict-ybin)**2))


Mean Squared Error:0.115663412575


3.a) i. Calculating the mean squared error for 10 fold cross validation

In [377]:
lm = LinearRegression()
print("10 Fold Cross Val MSE:",end="")
print(np.mean(cross_val_score(lm_full, X, ybin, cv=10, scoring='neg_mean_squared_error'))*-1)

10 Fold Cross Val MSE:0.132441066083


3.a) iii. Finding the features that are most predictive of high crime rate.

In [378]:
print("best features(High Crime Rate):",end="")
print(sorted(list(zip(features,lm_full.coef_)),key=lambda x: -x[1])[:10])


best features(High Crime Rate):[('population', 3.8997444222376658), ('PersPerOccupHous', 1.196471116775389), ('medFamInc', 0.92585261186539869), ('PctRecImmig8', 0.74239608782952038), ('PctOccupMgmtProf', 0.72483250979751523), ('MalePctDivorce', 0.65950634306271172), ('RentHighQ', 0.64678614475478835), ('PctRecImmig5', 0.59612562572439343), ('PctHousOwnOcc', 0.51592477449600938), ('FemalePctDiv', 0.5031651871844991)]


In [379]:
print("best features(Low Crime Rate):",end="")
print(sorted(list(zip(features,lm_full.coef_)),key=lambda x: x[1])[:10])

best features(Low Crime Rate):[('numbUrban', -2.7818491648335746), ('PctRecImmig10', -1.2374823539527968), ('PctKids2Par', -1.2159286728821308), ('medIncome', -1.0131426320284938), ('pctWInvInc', -0.93624836910933107), ('NumIlleg', -0.84323649082412888), ('TotalPctDiv', -0.77387187167161109), ('PctRecentImmig', -0.58270110254631569), ('MedRent', -0.57298131417125142), ('PctBSorMore', -0.47860448618568341)]


## <u>Ridge Regression</u>

3.b) i.Calucating the mean squared error using RidgeCV model under 10-fold CV.

In [380]:
rdcv=RidgeCV(alphas=(10, 1, 0.1, 0.01, 0.001),cv=10)

print("10 Fold Cross Val MSE:",end="")
print(np.mean(cross_val_score(rdcv, X, ybin, cv=10, scoring='neg_mean_squared_error'))*-1)

10 Fold Cross Val MSE:0.131520953897


3.b) ii. Calucating the mean squared error using RidgeCV model by training on all the sample data and testing on all of them.

In [381]:
rdcv.fit(X,ybin)
rd=Ridge(alpha=rdcv.alpha_)
rd.fit(X,ybin)
ypredict=rd.predict(X)
print("Mean Squared Error on training set:",end="")
print(np.mean((ypredict-ybin)**2))

Mean Squared Error on training set:0.117495628407


3.b) iii. Calculating the Best Alpha among the given values using RidgeCV model.

In [382]:
print("Best alpha:",rdcv.alpha_)

Best alpha: 1


3.b) iv.	What does this say about the amount of overfitting in linear regression for this problem?

A regularized linear regression model is Ridge Regression. This adds the L2 norm of the coefficients to the ordinary least squares objective. If alpha is 0, the coefficients won't be penalized and is similar to linear regression. 

AS value of alpha increases, the model complexity reduces. Though higher values of alpha reduce overfitting, significantly high values can cause underfitting as well. Since our alpha is not between 0 and 1, the coefficient weights get decreased i.e. it reduces overfitting.

OVERFITTING REDUCED.............!

## <u>Polynomial Features</u>

3.c) i. Calculating the mean squared error using Polynomial Regression model under 10-fold CV.

In [383]:
poly=PolynomialFeatures(2)
X_new=poly.fit_transform(X,ybin)


lm_full=LinearRegression()
lm_full.fit(X_new,ybin)
ypredict==lm_full.predict(X_new)

lm=LinearRegression()
print("10 Fold Cross Val MSE:",end="")
print(np.mean(cross_val_score(lm, X_new, ybin, cv=10, scoring='neg_mean_squared_error'))*-1)

10 Fold Cross Val MSE:1.24362859175


3.c) ii. Calucating the mean squared error using Polynomial Regression model by training on all the sample data and testing on all of them.

In [384]:
print("Mean Squared Error:",end="")
print(np.mean((ypredict.flatten()-ybin)**2))

Mean Squared Error:0.117495628407


The MSE under 10-fold CV for Polynomial Regression model is greater than that of the Linear Model. This means that for the given sample of data Linear Model is better as compared to the Polynomial Regression.

The MSE value of training on all the samples and testing on all of them is greater for polynomial regression. The polynomial regression with degree 2 tend to overfit the data. So the linear model is better in this case.


# <u>DIRTY DATA</u>

In [385]:
crime_full = pd.read_csv('Crime Prediction Data/communities-crime-full.csv',sep=',', na_values=["?"])
highCrime=[]
true_cnt=0
for x in crime_full['ViolentCrimesPerPop']:
    if x>0.1:
        highCrime.append('True')
        true_cnt+=1
    else:
        highCrime.append('False')
crime_full['highCrime']=pd.Series(highCrime)
crime_full.to_csv('Crime Prediction Data/communities-crime-full.csv', sep=',', encoding='utf-8')

In [386]:
features = list(crime_full.columns[:len(crime_full.columns)-2])

In [387]:
idx=0
comm={}
community_name=[]
for x in crime_full['communityname']:
    if x not in comm:
        comm[x]=idx
        idx+=1
    community_name.append(comm[x])
crime_full['communityname']=pd.Series(community_name)

Saving the labels, gini values, samples and vaues into ddt.dot file.

Plotting the decision tree from the data saved in dt.dot file on the basis of gini index and information gain.

In [388]:
y = crime_full["highCrime"]
X = crime_full[features]
X.is_copy = False


# filling the missing values with mean
fill_NaN = Imputer(missing_values=np.nan, strategy='mean')
X = pd.DataFrame(fill_NaN.fit_transform(X))
# fill the missing values with zero commented
# X.fillna(0,inplace=True)

dt = DecisionTreeClassifier(min_samples_split=20, random_state=99)
dt.fit(X, y)

with open("ddt.dot", 'w') as f:
    export_graphviz(dt, out_file=f,feature_names=features)


command = ["dot", "-Tpng", "ddt.dot", "-o", "ddt.png"]
try:
    subprocess.check_call(command)
except:
    exit("Could not run dot, ie graphviz, to produce visualization")


Calculating the training Accuracy, Precision, and Recall for the plotted decision tree.

In [389]:
ypredict=dt.predict(X)

TP=0
TN=0
FP=0
FN=0

for x in zip(ypredict,y):
    if(x[0]=='True' and x[1]=='True'):
        TP+=1
    elif x[0]=='False' and x[1]=='False':
        TN+=1
    elif x[0]=='True' and x[1]=='False':
        FP+=1
    elif x[0]=='False' and x[1]=='True':
        FN+=1

acc=(TP + TN)/(TP + TN + FP + FN)
pre=TP/(TP + FP)
rec=TP/(TP + FN)

print('accuracy:',acc)
print('precision:',pre)
print('recall:',rec)


accuracy: 0.9433299899699097
precision: 0.9473270440251572
recall: 0.9632294164668266


Finding the main features used for the decision tree classification.

The features with higher importance value are treated as the main features.

We are printing the top 10 features that contribute the most in decision making.

In [390]:
print(sorted(list(zip(features,dt.feature_importances_)),key=lambda x: -x[1])[:10])

[('PctKids2Par', 0.42698574130639683), ('racePctWhite', 0.10414025388676976), ('racePctHisp', 0.054317738929504394), ('communityname', 0.021353588305276783), ('HousVacant', 0.020309976203017808), ('PctEmplManu', 0.014656348313214289), ('blackPerCap', 0.014527159561124818), ('HispPerCap', 0.012827799541958205), ('PctHousOwnOcc', 0.012141343625738019), ('PctNotHSGrad', 0.012111657460089353)]


Calculating the 10-fold cross validation Accuracy, Precision and Recall for the above decision tree.

In [391]:
dt = DecisionTreeClassifier(min_samples_split=20, random_state=99)
print('accuracy:',end="")
print(np.mean(cross_val_score(dt, X, y, cv=10, scoring='accuracy')))
ybin=[]
for x in y:
    if x=='True':
        ybin.append(1)
    else:
        ybin.append(0)
print('precision:',end="")
print(np.mean(cross_val_score(dt, X, ybin, cv=10, scoring='precision')))
print('recall:',end="")
print(np.mean(cross_val_score(dt, X, ybin, cv=10, scoring='recall')))

accuracy:0.762255281382
precision:0.821636589773
recall:0.79453968254


The CV results for Accuracy, Precision and Recall are greater for Dirty data(full file) than the communities-crime-clean.csv file even though there are many missing values in the communities-crime-full.csv file.

This happens because in this method we are imputing the missing values with the mean values of the feature. 

Since the mean values are almost higher than the original sample values, the CV results on this file gives better result in terms of Accuracy, Precision and Recall.

The count of missing values in few of the columns is comparatively low against the entire dataset and it makes very little effect on training the model. If the missing data is sufficiently high, the model efficiency would be clearly affected.

# Team Extras - 2 Members

## K nearest neighbor

This is one of the classification we learnt in the course work. The euclidean distance between samples is considered as the similarity measure between two records in this model. Here we implement KNN with n(neighbors) values as 20

In [392]:
features_clean = list(crime_clean.columns[:len(crime_clean.columns)-2])
features_full = list(crime_full.columns[:len(crime_full.columns)-2])
X_clean = crime_clean[features_clean]
y_clean= crime_clean["highCrime"]
ybin_clean=[]
for x in y_clean:
    if x=='True':
        ybin_clean.append(1)
    else:
        ybin_clean.append(0)

X_full = crime_full[features]
y_full = crime_full["highCrime"]
X_full.is_copy = False
ybin_full=[]
for x in y_full:
    if x=='True':
        ybin_full.append(1)
    else:
        ybin_full.append(0)


# filling the missing values with mean
fill_NaN = Imputer(missing_values=np.nan, strategy='mean')
X_full = pd.DataFrame(fill_NaN.fit_transform(X_full))


#fitting for clean file
knn_clean=KNeighborsClassifier(n_neighbors=20)
print("Clean file")
print('accuracy:',end="")
print(np.mean(cross_val_score(knn_clean, X_clean, ybin_clean, cv=10, scoring='accuracy')))
print('precision:',end="")
print(np.mean(cross_val_score(knn_clean, X_clean, ybin_clean, cv=10, scoring='precision')))
print('recall:',end="")
print(np.mean(cross_val_score(knn_clean, X_clean, ybin_clean, cv=10, scoring='recall')))



#fitting for full file
knn_full=KNeighborsClassifier(n_neighbors=20)
print("Full file")
print('accuracy:',end="")
print(np.mean(cross_val_score(knn_full, X_full, ybin_full, cv=10, scoring='accuracy')))
print('precision:',end="")
print(np.mean(cross_val_score(knn_full, X_full, ybin_full, cv=10, scoring='precision')))
print('recall:',end="")
print(np.mean(cross_val_score(knn_full, X_full, ybin_full, cv=10, scoring='recall')))

Clean file
accuracy:0.283376884422
precision:0.405180128929
recall:0.3384
Full file
accuracy:0.673008550214
precision:0.733024607316
recall:0.75373968254


## Linear Discriminant Analysis

The basic idea of LDA is, for each class to be identified, calculate a linear function of the attributes. The class function yielding the highest score represents the predicted class. LDA bears some resemblance to principal components analysis (PCA) which made us choose this classifier for prediction. After projection of the data on the linear discriminant dimension, a classification threshold is placed at the midpoint between the two class means. This is equivalent to placing a decision hyperplane orthogonal to the discriminant dimension in response pattern space.

In [393]:
#fitting for clean file
warnings.filterwarnings('ignore')
lda_clean=LinearDiscriminantAnalysis()
print("Clean file")
print('accuracy:',end="")
print(np.mean(cross_val_score(lda_clean, X_clean, ybin_clean, cv=10, scoring='accuracy')))
print('precision:',end="")
print(np.mean(cross_val_score(lda_clean, X_clean, ybin_clean, cv=10, scoring='precision')))
print('recall:',end="")
print(np.mean(cross_val_score(lda_clean, X_clean, ybin_clean, cv=10, scoring='recall')))



#fitting for full file
lda_full=LinearDiscriminantAnalysis()
print("Full file")
print('accuracy:',end="")
print(np.mean(cross_val_score(lda_full, X_full, ybin_full, cv=10, scoring='accuracy')))
print('precision:',end="")
print(np.mean(cross_val_score(lda_full, X_full, ybin_full, cv=10, scoring='precision')))
print('recall:',end="")
print(np.mean(cross_val_score(lda_full, X_full, ybin_full, cv=10, scoring='recall')))

Clean file
accuracy:0.787716080402
precision:0.838948220569
recall:0.8344
Full file
accuracy:0.822964074102
precision:0.859766972476
recall:0.858520634921


ii.	What method gives the best results?
   
   The linear discriminant analysis have better accuracy , precision and recall over the K nearest neighbor. So LDA is best classifier

iii.What feature(s) seem to be most consistently predictive of high crime rates? How reliable is this conclusion?

In [394]:
lda_clean.fit(X_clean,ybin_clean)
lda_full.fit(X_full,ybin_full)


print("Best features in clean file")
print(sorted(list(zip(features_clean,np.absolute(lda_clean.coef_[0]))),key=lambda x: -x[1])[:10])

print("Best features in full file")
print(sorted(list(zip(features_full,np.absolute(lda_full.coef_[0]))),key=lambda x: -x[1])[:10])

Best features in clean file
[('population', 33.682483458062947), ('numbUrban', 24.027110069840454), ('PctRecImmig10', 10.688259127697631), ('PctKids2Par', 10.502097823910718), ('PersPerOccupHous', 10.334040961529821), ('medIncome', 8.7506144631829628), ('pctWInvInc', 8.0864710070066419), ('medFamInc', 7.99668180968537), ('NumIlleg', 7.2831180913914313), ('TotalPctDiv', 6.6840089231070472)]
Best features in full file
[('population', 34.304785589750345), ('LemasSwFTPerPop', 33.299263703943552), ('PolicPerPop', 27.560157633147412), ('numbUrban', 25.259002184021409), ('LemasSwFTFieldOps', 12.361383605515076), ('LemasSwornFT', 11.638960241056196), ('PersPerOccupHous', 10.999772290326963), ('PctRecImmig10', 10.626951764114063), ('PctKids2Par', 10.197234171221925), ('medIncome', 8.737217384745037)]


The weight coefficient of features provide the magnitude of relevance of itself in the orthogonal hyperplane. So the absolute value is the measure of its importance with respect to each other.

# <u>Extra Credit</u>

## <u>Random Forest</u>
Random forest is an extension of bagged decision trees. Bagging means building multiple models (typically of the same type) from different subsamples of the training dataset.

It is an ensemble methods in which a group of “weak learners,i.e, "decision trees", come together to form a “strong learner”,i.e, "random forest". Each classifier, individually, is a “weak learner,” while all the classifiers taken together are a “strong learner”. A random subset of  features are considered for each split.

In [395]:
rdm = RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=None, min_samples_split=20, 
min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_split=1e-07, bootstrap=True, oob_score=False, n_jobs=1, random_state=None, verbose=0, warm_start=False, class_weight=None)

print('accuracy:',end="")
print(np.mean(cross_val_score(rdm, X_clean, ybin_clean, cv=10, scoring='accuracy')))
print('precision:',end="")
print(np.mean(cross_val_score(rdm, X_clean, ybin_clean, cv=10, scoring='precision')))
print('recall:',end="")
print(np.mean(cross_val_score(rdm, X_clean, ybin_clean, cv=10, scoring='recall')))

accuracy:0.797266331658
precision:0.828171562398
recall:0.8352


## <u>ADABOOST</u> 

In this method equal weights are assigned to all the training examples and a base algorithm is chosen. At each step of iteration, we apply the base algorithm to the training set and increase the weights of the incorrectly classified examples. We iterate n times, each time applying base learner on the training set with updated weights. The final model is the weighted sum of the n learners. This method provides a solution to the supervised classification learning task.

Boosting means building multiple models (typically of the same type) each of which learns to fix the prediction errors of a prior model in the chain. The default 50 estimators are used for classification.

In [396]:
adab = AdaBoostClassifier(n_estimators=50)

print('accuracy:',end="")
print(np.mean(cross_val_score(adab, X_clean, ybin_clean, cv=10, scoring='accuracy')))
print('precision:',end="")
print(np.mean(cross_val_score(adab, X_clean, ybin_clean, cv=10, scoring='precision')))
print('recall:',end="")
print(np.mean(cross_val_score(adab, X_clean, ybin_clean, cv=10, scoring='recall')))

accuracy:0.71348241206
precision:0.80440745523
recall:0.76


Clearly the Random forest classifier gives better accuracy, precision and recall values compared to ADABOOST. So Random Forest is the better method

In [397]:
adab.fit(X_clean,ybin_clean)
print("Best Features as per ADABOOST:")
print(sorted(list(zip(features_clean,adab.feature_importances_)),key=lambda x: -x[1])[:10])

Best Features as per ADABOOST:
[('racePctWhite', 0.10000000000000001), ('PctKids2Par', 0.080000000000000002), ('state', 0.040000000000000001), ('agePct12t21', 0.040000000000000001), ('pctWInvInc', 0.040000000000000001), ('blackPerCap', 0.040000000000000001), ('TotalPctDiv', 0.040000000000000001), ('PctTeen2Par', 0.040000000000000001), ('communityname', 0.02), ('householdsize', 0.02)]


In [398]:
rdm.fit(X_clean,ybin_clean)
print("Best Features as per Random Forest:")
print(sorted(list(zip(features_clean,rdm.feature_importances_)),key=lambda x: -x[1])[:10])

Best Features as per Random Forest:
[('PctIlleg', 0.11692400153579441), ('racePctWhite', 0.10719537972840823), ('PctFam2Par', 0.074748793355397275), ('PctYoungKids2Par', 0.068519771510293259), ('PctPersDenseHous', 0.059557169163832878), ('FemalePctDiv', 0.057884656497844925), ('PctKids2Par', 0.046253507605939416), ('racepctblack', 0.026920332443789628), ('medIncome', 0.023572072066926998), ('TotalPctDiv', 0.015738470436034116)]


The feature importance values specify the weights of each features which when sorted in descending order provides the features aligned in decreasing order of its importance for classification

## Merge clean data with temperature and military data

The merging of two dataset is done on state column. The primary data set is left joined to the tempandmil table to generate the new sample dataset.

In [399]:
tempandmil = pd.read_csv('tempandmilitary.csv',sep=',', na_values=[""])
tempandmil_feat=list(tempandmil.columns[1:len(tempandmil.columns)])
X_clean_new=X_clean.merge(tempandmil[tempandmil_feat],on='state',how='left')
fill_NaN = Imputer(missing_values=np.nan, strategy='mean')
X_clean_new = pd.DataFrame(fill_NaN.fit_transform(X_clean_new))

In [400]:
dt = DecisionTreeClassifier(min_samples_split=20, random_state=99)
print('accuracy:',end="")
print(np.mean(cross_val_score(dt, X_clean_new, ybin_clean, cv=10, scoring='accuracy')))
print('precision:',end="")
print(np.mean(cross_val_score(dt, X_clean_new, ybin_clean, cv=10, scoring='precision')))
print('recall:',end="")
print(np.mean(cross_val_score(dt, X_clean_new, ybin_clean, cv=10, scoring='recall')))


accuracy:0.717962311558
precision:0.785802973192
recall:0.7664


Without the new data appended(from above)

accuracy:0.716487437186

precision:0.78942069483

recall:0.7648

In [401]:
dt.fit(X_clean_new, ybin_clean)
features_clean = list(crime_clean.columns[:len(crime_clean.columns)-2])
features_clean=features_clean+tempandmil_feat

print(sorted(list(zip(features_clean,dt.feature_importances_)),key=lambda x: -x[1])[:20])

[('PctKids2Par', 0.4462728762775991), ('racePctWhite', 0.11318607323731686), ('racePctHisp', 0.070631724672208643), ('communityname', 0.021033239187560431), ('PctSpeakEnglOnly', 0.017337167470066289), ('Air Force', 0.012953894867130522), ('HispPerCap', 0.012692628975730438), ('HousVacant', 0.012129605438407137), ('TotalPctDiv', 0.011829159070899707), ('Total Active Duty', 0.011758283512180626), ('PctYoungKids2Par', 0.01101534563750451), ('PctEmplManu', 0.010673474527515827), ('PctVacantBoarded', 0.010512322953415776), ('pctWFarmSelf', 0.010163430450656269), ('FemalePctDiv', 0.009669278225232756), ('state', 0.0085472482515365749), ('PctPopUnderPov', 0.0081378531728904802), ('PctWOFullPlumb', 0.0076534001004611198), ('PctSameState85', 0.0075994535912209515), ('PctHousOccup', 0.0073611773256511727)]


Air Force , Total Active Duty columns are top features for the decision tree created