In [None]:
'''
In this module, we experiment with various machine learning techniques 
1. Base Estimators
    a. Random Forest
    b. KNN
    c. QDA
    d. SVC
    e. Naive Baeyes
2. Voting Algorithm
3. Boosting 
    a. ADA Boost
    b. Gradient Boosting
    c. XG Boost
    d. Hyper Parameter tuning


'''

In [35]:
#### Importing the libraries and the data
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.svm import SVC
from IPython.core.display import display, HTML
import re as re
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC

from sklearn.cross_validation import KFold
display(HTML("<style>.container { width:100% !important; }</style>"))
import matplotlib.pyplot as plt
%matplotlib inline

# 1. Base Estimators

In [26]:
### Initial Processing
## Importing the dataset
train = pd.read_csv('../data/train_with_features.csv')
train = train.dropna()

In [65]:

## 
from sklearn.preprocessing import StandardScaler, LabelEncoder, LabelBinarizer
features = ['num_victims', 'location_type', 'weekday', 'weekofyear',\
       'hourofday', 'workhour', 'sunlight', 'clustertype',\
       'crime_rate_LARCENY-FROM VEHICLE', 'crime_rate_LARCENY-NON VEHICLE',\
       'crime_rate_AUTO THEFT', 'crime_rate_ROBBERY-PEDESTRIAN',\
       'crime_rate_AGG ASSAULT', 'crime_rate_BURGLARY-RESIDENCE',\
       'crime_rate_BURGLARY-NONRES', 'crime_rate_ROBBERY-RESIDENCE',\
       'crime_rate_ROBBERY-COMMERCIAL', 'crime_rate_RAPE',\
        'crime_rate_HOMICIDE', 'count_crimes_lag_1',\
       'count_crimes_lag_7']


## Importing features
X = train[features].values
Y = train['Crime_Type']
## Encoding the target variable
le = LabelEncoder()
le.fit(Y)
Y = le.transform(Y)

# Splitting the dataset into the Training set and Test set
from sklearn.cross_validation import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.25, random_state = 0)



# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

def test_classifier(classifier,X_train,X_test,y_train,y_test):
    print ("")
    print ("===============================================")
    classifier_name = str(type(classifier).__name__)
    print ("Testing " + classifier_name)

    list_of_labels = sorted(list(set(y_train)))
    model = classifier.fit(X_train, y_train)

    predictions = model.predict(X_test)

    from sklearn.metrics import confusion_matrix
    cm = confusion_matrix(y_test, predictions,labels = list_of_labels)
    pred_df = pd.crosstab(y_test,predictions)

    recall = recall_score(y_test, predictions, average='weighted', pos_label=None, labels=list_of_labels)
    accuracy = accuracy_score(y_test, predictions)
    print ("=================== Results ===================")

    print ("Recall: {0:.2f}%".format(recall*100))
    print ("Accuracy: {0:.2f}%".format(accuracy*100))
    print ("===============================================")
    return (pred_df)

##  1a.Random Forest

In [66]:
## Runing random forest with default parameters
rf_clf = RandomForestClassifier()
rf_df = test_classifier(rf_clf,x_train,x_test,y_train,y_test)


Testing RandomForestClassifier
Recall: 45.14%
Accuracy: 45.14%


In [30]:
## tuning Random Forest
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}


# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf_clf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# # Fit the random search model
rf_random.fit(x_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] min_samples_split=5, bootstrap=True, max_features=sqrt, max_depth=30, n_estimators=400, min_samples_leaf=1 
[CV] min_samples_split=5, bootstrap=True, max_features=sqrt, max_depth=30, n_estimators=400, min_samples_leaf=1 
[CV] min_samples_split=5, bootstrap=True, max_features=sqrt, max_depth=30, n_estimators=400, min_samples_leaf=1 
[CV] min_samples_split=5, bootstrap=True, max_features=sqrt, max_depth=10, n_estimators=2000, min_samples_leaf=1 


KeyboardInterrupt: 

In [44]:
## Runing random forest with tuned parameters 
rf_clf = RandomForestClassifier(n_estimators = 200, criterion = 'entropy', random_state = 0,max_depth=10)
rf_df = test_classifier(rf_clf,x_train,x_test,y_train,y_test)


Testing RandomForestClassifier
Recall: 50.20%
Accuracy: 50.20%


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


##  1b.KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import cross_val_score

k_range = range(30,50)
k_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X, Y, cv=10, scoring='accuracy')
    k_scores.append(scores.mean())
print(" results of KNN: ",k_scores)
# plot the value of K for KNN (x-axis) versus the cross-validated accuracy (y-axis)
plt.plot(k_range, k_scores)
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross-Validated Accuracy')

In [46]:
### Fit K from above to get the base KNN
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier(n_neighbors=33)
knn_df = test_classifier(knn_clf,x_train,x_test,y_train,y_test)


Testing KNeighborsClassifier
Recall: 44.00%
Accuracy: 44.00%


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


## 1c. QDA

In [47]:

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
qda_clf = QuadraticDiscriminantAnalysis()
qda_df = test_classifier(qda_clf,x_train,x_test,y_train,y_test)


Testing QuadraticDiscriminantAnalysis
Recall: 26.78%
Accuracy: 26.78%


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


## 1d. SVC

In [64]:

from sklearn.svm import SVC
svc_clf = SVC(kernel="linear", C=0.025)
svc_df = test_classifier(svc_clf,x_train,x_test,y_train,y_test)


Testing SVC
Recall: 42.03%
Accuracy: 42.03%


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


## 1e Naive Baeyes

In [49]:

from sklearn.naive_bayes import GaussianNB
nb_clf = GaussianNB()
nb_df = test_classifier(nb_clf,x_train,x_test,y_train, y_test)




Testing GaussianNB
Recall: 16.11%
Accuracy: 16.11%


# 2. Voting Algorithm

#####  These classifiers perform good on different target types. For example Random Forest and SVC tend to predict the majority classes well. On the other hand techniques like KNN , QDA perform well on the minor classes.  So we can do a voting to select the best predictions out of all these classifiers. 

In [None]:
 ## checking Correlation for different Classifiers
import seaborn as sns
test_rf = pd.Series(rf_clf.predict(x_test),name ='RF')
test_qda = pd.Series(qda_clf.predict(x_test),name ='QDA')
test_knn = pd.Series(knn_clf.predict(x_test),name ='KNN')
test_svc = pd.Series(svc_clf.predict(x_test),name ='SVC')
test_nb = pd.Series(nb_clf.predict(x_test),name ='NB')

ensemble_results = pd.concat([test_rf,test_qda,test_knn,test_svc,test_nb],axis=1)
g= sns.heatmap(ensemble_results.corr(),annot=True)

In [None]:
## trying out voting using combinations of various classifiers
votingC = VotingClassifier(estimators=[('RF', rf_clf),
('QDA', qda_clf),('NB', nb_clf)], voting='soft', weights=[2,1,1], n_jobs=4)
vote_df = test_classifier(votingC,x_train,x_test,y_train,y_test)

In [None]:
## Tuning the voting parameters

import itertools as it   
df_five = pd.DataFrame(columns=('w1', 'w2', 'w3','w4','w5', 'accuracy'))
v_clfs = [('RF', rf_clf),('QDA', qda_clf),('NB', nb_clf),('SVC',svc_clf),('KNN',knn_clf)]
# combs = list(it.combinations(v_clfs, 3))+ list(it.combinations(v_clfs, 4)) + 
combs_5 = list(it.combinations(v_clfs, 5))




i = 0
for w1 in range(1,4):
    for w2 in range(1,4):
        for w3 in range(1,4):
            for w4 in range(1,4):
                for w5 in range(1,4):
                    for j,clf_comb in enumerate(combs_5):
                        if len(set((w1,w2,w3,w4,w5))) == 1: # skip if all weights are equal
                            continue   
                        eclf = VotingClassifier(estimators=list(clf_comb),voting= 'soft', weights=[w1,w2,w3,w4,w5])
                        model = eclf.fit(x_train, y_train)
                        predictions = model.predict(x_test)
                        accuracy = accuracy_score(y_test, predictions)
                        print (w1,w2,w3,w4,w5,accuracy)
                        df_five.loc[i] = [w1, w2, w3,w4,w5, accuracy]
                        i += 1

actual = pd.Series(y_test,name ='Actual')
voting_results = pd.Series(votingC.predict(x_test),name ='Voting')
ensemble_results = pd.concat([test_rf,test_qda,test_knn,test_svc,test_nb,actual,voting_results],axis=1)
ensemble_results.to_csv('../data/voting_results.csv')

# 3. Boosting Algorithms

## 3a. ADA Boosting

In [63]:
seed =7
adaboost = AdaBoostClassifier()

ada_df = test_classifier(adaboost,x_train,x_test,y_train, y_test)


Testing AdaBoostClassifier
Recall: 44.24%
Accuracy: 44.24%


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


## 3b. Gradient Boosting Classifier

In [58]:
gbc = GradientBoostingClassifier(
    max_depth=1,
    n_estimators=500,
    warm_start=True,
    random_state=seed)

gbc_df = test_classifier(gbc,x_train,x_test,y_train, y_test)


Testing GradientBoostingClassifier
Recall: 51.37%
Accuracy: 51.37%


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


## 3c. XG Boosting

In [57]:
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
xgb_clf = XGBClassifier()

xgb_clf_df = test_classifier(xgb_clf,x_train,x_test,y_train, y_test)


Testing XGBClassifier
Recall: 52.19%
Accuracy: 52.19%


  if diff:
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


## 3d. Hyper Parameter Tuning

In [34]:
from sklearn.model_selection import cross_val_score

from xgboost import XGBClassifier
xgb_clf_1 = XGBClassifier(
                    learning_rate =0.05,\
                    n_estimators=150,max_depth=5,min_child_weight=5,reg_lambda=0.05)

scores = cross_val_score(xgb_clf_1, X,Y, cv=5, scoring='accuracy')
print (np.mean(scores))



KeyboardInterrupt: 

## Final Model

In [67]:

xgb_clf_1 = XGBClassifier(
                    learning_rate =0.05,\
                    n_estimators=150,max_depth=5,min_child_weight=5,reg_lambda=0.05)


xgb_clf_df = test_classifier(xgb_clf_1,x_train,x_test,y_train, y_test)


Testing XGBClassifier
Recall: 53.09%
Accuracy: 53.09%


  if diff:
