In [3]:
import numpy as np
import pandas as pd
from dateutil import parser
from datetime import datetime
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.decomposition import PCA

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import lightgbm
import xgboost as xgb

from sklearn.metrics import log_loss
import pickle

from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn import neighbors
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors.classification import KNeighborsClassifier
from sklearn.feature_extraction.text import HashingVectorizer


from matplotlib import cm
import matplotlib.pyplot as mt
import matplotlib.image as mpimg
from pandas.plotting import scatter_matrix
import string
import re
from sklearn.preprocessing import Imputer,LabelEncoder,OneHotEncoder
import matplotlib.ticker as ticker

In [4]:
class SFOCrimeClass1(object):
    
    def __init__(self, trainFile, testFile):
        self.trainFile = trainFile
        self.testFile = testFile
        
        self.lr = LogisticRegression(multi_class='multinomial', solver='lbfgs')
        self.rf = RandomForestClassifier(n_estimators = 500, max_depth=15,oob_score =True)
        self.lgbm = lightgbm.LGBMClassifier(objective = 'multiclass',n_estimators = 200,max_depth = 12)
        self.xgb_model = None
        
        self.X_train = None
        self.y_train = None
        self.X_val = None
        self.y_val = None
        
        self.prob_val_lr = None
        self.prob_val_rf = None
        self.prob_val_lgbm = None
        self.prob_val_xgb = None
        
        self.prob_test_lr = None
        self.prob_test_rf = None
        self.prob_test_lgbm = None
        self.prob_test_xgb = None
        
        self.test_data = None
        self.result =  None
        self.testset_ID = None
        
    def trainingData(self):
        tr_dataset = pd.read_csv(self.trainFile,parse_dates=['Dates'])
        
        #Label Encoding of Crime Category
        cat_le            =  preprocessing.LabelEncoder()
        crimecat          =  cat_le.fit_transform(tr_dataset.Category)
        
        #Datetime parser
        tr_hour  = tr_dataset.Dates.dt.hour
        tr_year  = tr_dataset.Dates.dt.year

        #Applying PCA on X and Y cordinate
        xy_scaler_tr = preprocessing.StandardScaler()
        xy_scaler_tr.fit(tr_dataset[["X","Y"]])
        tr_dataset[["X","Y"]] = xy_scaler_tr.transform(tr_dataset[["X","Y"]])
        pca_tr = PCA(2)
        tr_dataset.loc[:,["X","Y"]]=pca_tr.fit_transform(pd.DataFrame(tr_dataset.loc[:,["X","Y"]]))
        tr_dataset["rot30_X"]  = (1.732/2)* tr_dataset["X"] + (1./2)* tr_dataset["Y"] 
        tr_dataset["rot30_Y"]  = (1.732/2)* tr_dataset["Y"] - (1./2)* tr_dataset["X"]
        tr_dataset["rot45_X"]  = .707* tr_dataset["Y"] + .707* tr_dataset["X"] 
        tr_dataset["rot45_Y"]  = .707* tr_dataset["Y"] - .707* tr_dataset["X"]
        tr_dataset["rot60_X"]  = (1./2)* tr_dataset["X"] + (1.732/2)* tr_dataset["Y"] 
        tr_dataset["rot60_Y"]  = (1./2)* tr_dataset["Y"] - (1.732/2)* tr_dataset["X"]
        tr_dataset["radial_r"] = np.sqrt( np.power(tr_dataset["Y"],2) + np.power(tr_dataset["X"],2) )
        
        #Extracting whther crime happening at juntion or in block
        tr_add           = tr_dataset["Address"]
        tr_add_ser       = tr_add.str.contains('.?of.?')
        #Onehot Encoding categorical features
        tr_hour_ser      = pd.get_dummies(tr_hour,prefix = "H")
        tr_year_ser      = pd.get_dummies(tr_year)
        tr_days_ser      = pd.get_dummies(tr_dataset.DayOfWeek)
        tr_pdistrict_ser = pd.get_dummies(tr_dataset.PdDistrict)
        
        trainingset      = pd.concat([tr_hour_ser,
                                      tr_year_ser,
                                      tr_days_ser,
                                      tr_pdistrict_ser,
                                      tr_add_ser,
                                      tr_dataset.X,tr_dataset.Y,
                                      tr_dataset.rot30_X,tr_dataset.rot30_Y,
                                      tr_dataset.rot45_X,tr_dataset.rot45_Y,
                                      tr_dataset.rot60_X,tr_dataset.rot60_Y,
                                      tr_dataset.radial_r],axis=1)
        #split data into train and validation
        seed = 7
        test_size = 0.2
        self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(trainingset, crimecat, 
                                                                              test_size=test_size, 
                                                                              random_state=seed)
   
    def testingData(self):
        te_dataset = pd.read_csv(self.testFile,parse_dates=['Dates'])
     
        #Datetime parsering from test data
        te_hour  = te_dataset.Dates.dt.hour
        te_year  = te_dataset.Dates.dt.year
        
        #Applying PCA on X and Y cordinates from test
        xy_scaler_te = preprocessing.StandardScaler()
        xy_scaler_te.fit(te_dataset[["X","Y"]])
        te_dataset[["X","Y"]] = xy_scaler_te.transform(te_dataset[["X","Y"]])
        pca_te =PCA(2)
        te_dataset.loc[:,["X","Y"]] = pca_te.fit_transform(pd.DataFrame(te_dataset.loc[:,["X","Y"]]))
        te_dataset["rot30_X"]  = (1.732/2)* te_dataset["X"] + (1./2)* te_dataset["Y"] 
        te_dataset["rot30_Y"]  = (1.732/2)* te_dataset["Y"] - (1./2)* te_dataset["X"]
        te_dataset["rot45_X"]  = .707* te_dataset["Y"] + .707* te_dataset["X"] 
        te_dataset["rot45_Y"]  = .707* te_dataset["Y"] - .707* te_dataset["X"]
        te_dataset["rot60_X"]  = (1./2)* te_dataset["X"] + (1.732/2)* te_dataset["Y"] 
        te_dataset["rot60_Y"]  = (1./2)* te_dataset["Y"] - (1.732/2)* te_dataset["X"]
        te_dataset["radial_r"] = np.sqrt( np.power(te_dataset["Y"],2) + np.power(te_dataset["X"],2) )
        #Extracting whther crime happening at juntion or in block
        te_add           = te_dataset["Address"]
        te_add_ser       = te_add.str.contains('.?of.?')
        #Onehot Encoding categorical features
        te_hour_ser      = pd.get_dummies(te_hour,prefix = "H")
        te_year_ser      = pd.get_dummies(te_year)
        te_days_ser      = pd.get_dummies(te_dataset.DayOfWeek)
        te_pdistrict_ser = pd.get_dummies(te_dataset.PdDistrict)
        
        testset          = pd.concat([te_hour_ser,
                                      te_year_ser,
                                      te_days_ser,
                                      te_pdistrict_ser,
                                      te_add_ser,
                                      te_dataset.X,te_dataset.Y,
                                      te_dataset.rot30_X,te_dataset.rot30_Y,
                                      te_dataset.rot45_X,te_dataset.rot45_Y,
                                      te_dataset.rot60_X,te_dataset.rot60_Y,
                                      te_dataset.radial_r],axis=1)
        
        self.test_data = testset
        self.testset_ID = te_dataset['Id']
        
    def data(self):
        self.trainingData()
        self.testingData()
#===========Logistic Regression================================================#
    def trainLogisticRegression(self):
        self.lr.fit(self.X_train,self.y_train)
        self.valLogisticRegression()

    def valLogisticRegression(self):
        self.prob_val_lr =  self.lr.predict_proba(self.X_val) 
        print("Logistic Regression logloss " + str(log_loss(self.y_val,self.prob_val_lr)))
    
    def testLogisticRegression(self):
        self.prob_test_lr = self.lr.predict(self.test_data)
#===========Random Forest======================================================#        
    def trainRandomForest(self):
        self.rf.fit(self.X_train,self.y_train)
        self.valRandomForest()

    def valRandomForest(self):
        self.prob_val_rf = self.rf.predict_proba(self.X_val)
        print("Random Forest logloss " + str(log_loss(self.y_val,self.prob_val_rf)))
    
    def testRandomForest(self):
        self.prob_test_rf = self.rf.predict(self.test_data)
#===========LightGBM============================================================#        
    def trainLGBM(self):
        self.lgbm.fit(self.X_train,self.y_train)
        self.valLGBM()

    def valLGBM(self):
        self.prob_val_lgbm = self.lgbm.predict_proba(self.X_val)
        print("LightGBM logloss " + str(log_loss(self.y_val,self.prob_val_lgbm)))
    
    def testLGBM(self):
        self.prob_test_rf = self.lgbm.predict(self.test_data)
#===========XGBoost=============================================================#
    def trainXGB(self):
        #setting parameter for xgboost
        param = {}
        param['booster']               = 'gbtree'
        param['objective']             = 'multi:softprob'
        param['num_class']             = 36
        param['eval_metric']           = 'mlogloss'
        # param['scale_pos_weight']    = 1.0
        param['eta']                   = .3
        # param['max_depth']           = 6
        param['bst:colsample_bytree']  = 0.6
        # param['gamma']               = 0.5
        # param['min_child_weight']    = .5
        param['max_delta_step']        = 1
        param['silent']                = 1
        param['early_stopping_rounds'] = 30
        param['nthread']               = 4
        param['verbose_eval']          = 1
        num_round = 200
        plst = list(param.items())
        
        #training model
        xgb_training   = xgb.DMatrix(self.X_train, label = self.y_train)
        xgb_valset     = xgb.DMatrix(self.X_val,label = self.y_val)
        watchlist      = [(xgb_training ,'train'),(xgb_valset,'validation')]
        self.xgb_model = xgb.train(plst,xgb_training, num_round,watchlist)
        self.valXGB()
        
    def valXGB(self):
        losses = []
        xgb_valset        = xgb.DMatrix(self.X_val,label = self.y_val)
        self.prob_val_xgb = self.xgb_model.predict(xgb_valset)
        ll = log_loss(self.y_val, self.prob_val_xgb)
        losses.append(ll)
        print("XGBoost logloss "+ str(np.mean(losses)))
        
    def testXGB(self):
        xgb_testset        = xgb.DMatrix(self.test_data)
        self.prob_test_xgb = xgb_model.predict(xgb_testset)

In [5]:
class SFOCrimeClass2(object):
    
    def __init__(self, trainFile, testFile):
        self.trainFile = trainFile
        self.testFile = testFile
        
        self.knn = neighbors.KNeighborsRegressor(n_neighbors=1200,n_jobs=500,weights='distance')
        self.kmeans = None
        self.xgb_model = None
        self.gnb = GaussianNB()
        self.bnb = BernoulliNB()
        self.gnbmix = GaussianNB()
        self.bnbmix = BernoulliNB()
        
        self.features1 = None
        self.features2 = None
        self.training = None
        self.validation = None
        self.valcrime = None
        self.traincrime = None
        
        self.hour = None
        self.dis_le = None
        self.crimelist = None
        
        self.prob_val_knn = None
        self.prob_val_gnb = None
        self.prob_val_bnb = None
        self.prob_val_bgnbmix = None
        
        self.prob_test_knn = None
        self.prob_test_lgbm = None
        self.prob_test_gnb = None
        self.prob_test_bnb = None
        self.prob_test_bgnbmix = None
        
        self.test_data_1 = None
        self.test_data = None
        self.K_test = None
        self.result =  None
        
    def trainingData(self):
        tr_dataset = pd.read_csv(self.trainFile,parse_dates=['Dates'])
        tr_data = tr_dataset
        #Label Encoding of Crime Category
        cat_le            =  preprocessing.LabelEncoder()
        crimecat          =  cat_le.fit_transform(tr_dataset.Category)
        self.crimelist    =  pd.get_dummies(tr_dataset.Category)
        self.dis_le       =  preprocessing.LabelEncoder()
        pddis             =  self.dis_le.fit_transform(tr_dataset.PdDistrict)
        tr_dataset["crime"] = crimecat
        #Datetime parser
        tr_hour  = tr_dataset.Dates.dt.hour
        tr_year  = tr_dataset.Dates.dt.year
        tr_minu = tr_dataset.Dates.dt.minute
        
        #Generating K-Mean
        
        kmeans               = KMeans(n_clusters=120,max_iter=300)
        df     = pd.DataFrame(pd.concat([pd.DataFrame(tr_dataset.X),pd.DataFrame(tr_dataset.Y)],axis=1))
        kmeans.fit(df)
        labels               = kmeans.predict(df)
        centroids            = kmeans.cluster_centers_
        tr_dataset["Klabels"]=labels
        self.kmeans = kmeans
        
        #Applying PCA on X and Y cordinate
        X = tr_dataset.X
        Y = tr_dataset.Y
        
        
        #Generating SVD of Address to use them as numerical values
        vectorizer = HashingVectorizer(n_features=5)
        vector = vectorizer.transform(tr_dataset.Address)
        svd=TruncatedSVD(n_components=2, random_state=42)
        tr_dataset["AddSVD"]=pd.DataFrame(svd.fit_transform(vector)).loc[:,1]
        
        #Onehot Encoding categorical features
        tr_hour_ser      = pd.get_dummies(tr_hour,prefix = "H")
        tr_days_ser      = pd.get_dummies(tr_dataset.DayOfWeek)
        tr_pdistrict_ser = pd.get_dummies(tr_dataset.PdDistrict)
        tr_min_ser       = pd.get_dummies(tr_minu)
        self.hour = tr_hour_ser
        # training set created for Naive Bayes and KNN 
        train_data       = pd.concat([tr_hour_ser,
                                      tr_days_ser,
                                      tr_pdistrict_ser,
                                      tr_dataset.AddSVD,
                                      tr_dataset.Klabels,
                                      tr_min_ser,
                                      X,Y,tr_dataset.crime],axis=1)
    
        tr_data["Hour"]       = tr_hour
        tr_data["Minutes"]    = tr_minu
        tr_data["X"]          = X
        tr_data["Y"]          = Y
        tr_data["PdDistrict"] = pddis

        tr_data = pd.concat([tr_data,self.crimelist],axis =1)
        #split data into train and validation
        seed = 7
        test_size = 0.2
        self.training, self.validation = train_test_split(train_data, test_size=test_size,random_state=seed)
        self.K_train,self.K_val = train_test_split(tr_data, test_size= test_size, random_state=seed)
   
    def testingData(self):
        te_dataset = pd.read_csv(self.testFile,parse_dates=['Dates'])
        test_KNN = te_dataset
        #Datetime parsering from test data
        te_hour  = te_dataset.Dates.dt.hour
        te_year  = te_dataset.Dates.dt.year
        te_min   = te_dataset.Dates.dt.minute
        pddis    = self.dis_le.transform(te_dataset.PdDistrict)
        # Computing K-Means on X and Y cordinates
        df     = pd.DataFrame(pd.concat([pd.DataFrame(te_dataset.X),pd.DataFrame(te_dataset.Y)],axis=1))
        labels               = self.kmeans.predict(df)
        centroids            = self.kmeans.cluster_centers_
        te_dataset["Klabels"]=labels
        
        #Applying PCA on X and Y cordinates from test
        X = te_dataset.X
        Y = te_dataset.Y
                
        #Generating SVD of Address to use it as numerical values
        vectorizer = HashingVectorizer(n_features=10)
        vector = vectorizer.transform(te_dataset.Address)
        svd=TruncatedSVD(n_components=2, random_state=42)
        te_dataset["AddSVD"]=pd.DataFrame(svd.fit_transform(vector)).loc[:,1]
        
        #Onehot Encoding categorical features
        te_hour_ser      = pd.get_dummies(te_hour,prefix = "H")
        te_days_ser      = pd.get_dummies(te_dataset.DayOfWeek)
        te_pdistrict_ser = pd.get_dummies(te_dataset.PdDistrict)
        te_min_ser       = pd.get_dummies(te_dataset.Dates.dt.minute)
        testset_ID = te_dataset['Id']
            
        # training set created for Naive Bayes and KNN 
        test_data_1       = pd.concat([te_hour_ser,
                                      te_days_ser,
                                      te_pdistrict_ser,
                                      te_dataset.AddSVD,
                                      te_dataset.Klabels,
                                      te_min_ser,
                                      X,Y],axis=1)
        self.test_data_1=test_data_1
        test_KNN["Hour"]       = te_hour
        test_KNN["Minutes"]    = te_min
        test_KNN["PdDistrict"] = pddis    
        self.K_test = test_KNN
        
    def data(self):
        self.trainingData()
        self.testingData()
#===========KNN========================================================#
    def trainKNN(self):
        features = ["Hour","Minutes","X","Y","PdDistrict"]
        self.knn.fit(self.K_train[features],self.K_train[list(pd.DataFrame(self.crimelist).columns)])
        self.valKNN()

    def valKNN(self):
        features = ["Hour","Minutes","X","Y","PdDistrict"]
        self.prob_val_knn =  self.knn.predict(self.K_val[features]) 
        print("KNN logloss " + str(log_loss(self.K_val[list(pd.DataFrame(self.crimelist).columns)],self.prob_val_knn)))
    
    def testKNN(self):
        features = ["Hour","Minutes","X","Y","PdDistrict"]
        self.prob_test_knn = self.knn.predict(self.K_test[features])
#===========Naive Bayes================================================#
    def trainNaiveBayes(self):
        self.features1 = ['Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday','Wednesday', 'BAYVIEW', 
                     'CENTRAL', 'INGLESIDE', 'MISSION','NORTHERN', 'PARK', 'RICHMOND','SOUTHERN', 'TARAVAL',
                     'TENDERLOIN'] + [x for x in range(0, 59)]+ [x for x in (set(pd.DataFrame(self.hour).columns))]
        
        self.features2 = ['AddSVD','Klabels','X','Y']
        
        # training on GaussionNaiveBayes
        self.gnb.fit(self.training[self.features1+self.features2],self.training["crime"])
        # training on BernoulliNaiveBayes
        self.bnb.fit(self.training[self.features1+self.features2],self.training["crime"])
        # training on mixture of Bernoulli and Gaussian Naive Bayes
        self.bnbmix.fit(self.training[self.features1],self.training["crime"])
        self.gnbmix.fit(self.training[self.features2],self.training["crime"])
        self.valNaiveBayes()

    def valNaiveBayes(self):
        self.prob_val_gnb =  self.gnb.predict_proba(self.validation[self.features1+self.features2]) 
        print("Gaussian Naive Bayes logloss " + str(log_loss(self.validation["crime"],self.prob_val_gnb)))
#      def valBNaiveBayes(self):
        self.prob_val_bnb =  self.bnb.predict_proba(self.validation[self.features1+self.features2]) 
        print("Bernoulli Naive Bayes logloss " + str(log_loss(self.validation["crime"],self.prob_val_bnb)))
#      def valMixNaiveBayes(self):
        pos_prior = self.gnbmix.class_prior_
        predictedBNB = np.array(self.bnbmix.predict_proba(self.validation[self.features1]))
        predictedGNB = np.array(self.gnbmix.predict_proba(self.validation[self.features2]))
        self.prob_val_bgnbmix = (predictedBNB)* (predictedGNB)
        print("Bernoulli and Gaussian mix Naive Bayes logloss " + str(log_loss(self.validation["crime"],self.prob_val_bgnbmix)))
        
    def testNaiveBayes(self):
        self.prob_test_gnb = self.gnb.predict(self.test_data_1[self.features1+self.features2])
        self.prob_test_bnb = self.bnb.predict(self.test_data_1[self.features1+self.features2])
        predictedBNB = np.array(self.bnbmix.predict_proba(self.test_data_1[self.features1]))
        predictedGNB = np.array(self.gnbmix.predict_proba(self.test_data_1[self.features2]))
        self.prob_test_bgnbmix = (predictedBNB)* (predictedGNB)
        

In [6]:
class SFOVisualisation(object):
    
    def __init__(self, trainFile):
        self.train = pd.read_csv(trainFile,parse_dates=['Dates'])
        self.train["Year"] = self.train.Dates.dt.year
        self.train["Month"] = self.train.Dates.dt.month
        self.train["Date"] = self.train.Dates.dt.date
        self.train["Hour"] = self.train.Dates.dt.hour
        self.train.drop(columns=["Resolution"],axis=1,inplace=True)
        self.dataplot = self.train.copy()

    def categoryDistribution(self):
        f1 = self.train.groupby("Category")["Category"].count().sort_values(ascending=False)
        f1 = pd.DataFrame(f1)
        print(f1.head(10))
    
    def geoSpatialScatterPlot(self):
        label=LabelEncoder()
        self.dataplot.loc[:,"Category"]=label.fit_transform(self.dataplot.loc[:,"Category"])
        self.dataplot.loc[:,"DayOfWeek"]=label.fit_transform(self.dataplot.loc[:,"DayOfWeek"])
        self.dataplot = self.dataplot[self.dataplot.Y!=90]
        Catlabel = pd.DataFrame(list(set(self.train["Category"])))
        ax = self.dataplot.plot(kind='scatter', x='X', y='Y', alpha = 0.5, s = self.dataplot["DayOfWeek"], 
                                label= "Crime distribution across area", figsize=(13,9),c='Category', 
                                cmap=mt.get_cmap('jet'), colorbar=True,)
        ax.set_ylabel("Latitude", fontsize=14)
        ax.set_xlabel("Longitude", fontsize=14)
        mt.show()
    
    def pdDistrictVsCrime(self):
        PdD = list(set(self.dataplot.PdDistrict))
        Cat = pd.get_dummies(self.train.Category)
        dpd=pd.DataFrame(np.random.normal(0,1,(10,36)),columns=Cat.columns,index=PdD)
        for p in PdD:
               for ca in Cat.columns:
                s = "PdDistrict == '"+p+"' & Category == '"+ca+"'"
                d = self.train.query(s)  
                dpd.at[p,ca]=d.shape[0]
        ax=dpd.plot(kind='bar',rot =0, figsize=(20,13),alpha=0.9,stacked=True)
        ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.1),ncol=6, fancybox=True, shadow=True)
        mt.ylabel("Crime Frequency", fontsize=14)
        mt.xlabel("PdDistrict", fontsize=14)
        mt.show()
        
    def hourVsCrime(self):
        Hr = list(set(self.dataplot.Hour))
        Cat = pd.get_dummies(self.train.Category)
        dhr = pd.DataFrame(np.random.normal(0,1,(24,36)),columns=Cat.columns,index=Hr)
        for h in Hr:
               for ca in Cat.columns:
                        s = "Hour == "+str(h)+" & Category == '"+ca+"'"
                        d = self.train.query(s)  
                        dhr.at[h,ca]=d.shape[0]
        ax = dhr.plot( figsize=(23,10),legend=False,alpha=0.9,colormap=cm.cubehelix)
        ax.set_ylabel("Crime Frequency")
        ax.set_xlabel("Hour")
        tick_spacing = 1
        ax.xaxis.set_major_locator(ticker.MultipleLocator(tick_spacing))
        ax.legend(loc='upper center', bbox_to_anchor=(0.35, 1.05),
                  ncol=4, fancybox=True, shadow=True)
        mt.show()
        
    def modifiedHourVsCrime(self):
        mh = np.array([28,810,1016,1621,212])
        Cat = pd.get_dummies(self.train.Category)
        dmh=pd.DataFrame(np.random.normal(0,1,(5,36)),columns=Cat.columns,index=mh)
        for h in mh:
                for ca in Cat.columns:
                        if(h == 28 ): s="Hour >=2 & Hour<=8 & Category == '"+ca+"'"
                        if(h == 810): s="Hour >8 & Hour<=10 & Category == '"+ca+"'"
                        if(h == 1016 ): s="Hour>10 & Hour<=16 & Category == '"+ca+"'"
                        if(h == 1621): s="Hour>16 & Hour<=21 & Category == '"+ca+"'"
                        if(h== 212): s="(Hour>21 or Hour<2) & Category == '"+ca+"'"
                        d = self.train.query(s)  
                        dmh.at[h,ca]=d.shape[0]
                        
        ax=dmh.plot(kind='bar',rot =0, figsize=(20,13),alpha=0.9,stacked=True)
        ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.1),ncol=6, fancybox=True, shadow=True)
        mt.ylabel("Crime Frequency", fontsize=14)
        mt.xlabel("Modified Hour", fontsize=14)
        mt.show() 
        
    def dayofWeekVsCrime(self):
        Week = list(set(self.train.DayOfWeek))
        Cat = pd.get_dummies(self.train.Category)
        dw=pd.DataFrame(np.random.normal(0,1,(7,36)),columns=Cat.columns,index=Week)
        for w in Week:
               for ca in Cat.columns:
                        s = "DayOfWeek == '"+w+"' & Category == '"+ca+"'"
                        d = self.train.query(s)  
                        dw.at[w,ca]=d.shape[0] 
        ax=dw.plot(kind='bar',rot =0, figsize=(20,13),alpha=0.9,stacked=True)
        ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.1),ncol=6, fancybox=True, shadow=True)
        mt.ylabel("Crime Frequency", fontsize=14)
        mt.xlabel("DayOfWeek", fontsize=14)
        mt.show()
    
    def monthVsCrime(self):
        lmon = list(set(self.train.Dates.dt.month))
        Cat  = pd.get_dummies(self.train.Category)
        dmon = pd.DataFrame(np.random.normal(0,1,(12,36)),columns=Cat.columns,index=lmon)
        for p in lmon:
               for ca in Cat.columns:
                        s = "Month == "+str(p)+" & Category == '"+ca+"'"
                        d = self.train.query(s)  
                        dmon.at[p,ca]=d.shape[0]
        ax = dmon.plot(kind='bar',rot =0, figsize=(20,13),alpha=0.9,stacked=True)
        ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.1),ncol=6, fancybox=True, shadow=True)
        mt.ylabel("Crime Frequency", fontsize=14)
        mt.xlabel("Month", fontsize=14)
        mt.show() 
    
    def corelationMatrix(self):
        tr_hour  = self.train.Dates.dt.hour
        tr_year  = self.train.Dates.dt.year
        tr_month = self.train.Dates.dt.month
        tr_dates = self.train.Dates.dt.day
        tr_add           = self.train["Address"]
        tr_add_ser       = tr_add.str.contains('.?of.?')
        #Label Encoding of Crime Category
        cat_le            =  preprocessing.LabelEncoder()
        crimecat          =  cat_le.fit_transform(self.train.Category)
        
        def getseason(mon):
            if(mon >= 12 or mon <= 2)   : return 1
            elif(mon >= 3 or mon <= 5)  : return 2
            elif(mon >= 6 or mon <= 8)  : return 3
            elif(mon >= 9 or mon <= 1)  : return 4
        tr_month_mapped   = pd.Series(preprocessing.scale(list(map(getseason, tr_month))))
        crimecat_ser = pd.Series(crimecat)

        pdist_lbe     =  preprocessing.LabelEncoder()
        pdist = pdist_lbe.fit_transform(self.train.PdDistrict)
        pdist = pd.Series(pdist);
        
        days_lbe     =  preprocessing.LabelEncoder()
        days = days_lbe.fit_transform(self.train.DayOfWeek)
        days = pd.Series(days)
        
        #Applying PCA on X and Y cordinates from training
        tr_dataset = self.train.copy()
        xy_scaler_tr = preprocessing.StandardScaler()
        xy_scaler_tr.fit(tr_dataset[["X","Y"]])
        tr_dataset[["X","Y"]] = xy_scaler_tr.transform(tr_dataset[["X","Y"]])
        pca_tr = PCA(2)
        tr_dataset.loc[:,["X","Y"]]=pca_tr.fit_transform(pd.DataFrame(tr_dataset.loc[:,["X","Y"]]))
        tr_dataset["rot30_X"]  = (1.732/2)* tr_dataset["X"] + (1./2)* tr_dataset["Y"] 
        tr_dataset["rot30_Y"]  = (1.732/2)* tr_dataset["Y"] - (1./2)* tr_dataset["X"]
         
        tr_dataset["rot45_X"]  = .707* tr_dataset["Y"] + .707* tr_dataset["X"] 
        tr_dataset["rot45_Y"]  = .707* tr_dataset["Y"] - .707* tr_dataset["X"]
         
        tr_dataset["rot60_X"]  = (1./2)* tr_dataset["X"] + (1.732/2)* tr_dataset["Y"] 
        tr_dataset["rot60_Y"]  = (1./2)* tr_dataset["Y"] - (1.732/2)* tr_dataset["X"]
        
        tr_dataset["radial_r"] = np.sqrt( np.power(tr_dataset["Y"],2) + np.power(tr_dataset["X"],2) )
        
        abc = pd.concat([crimecat_ser,tr_hour,
                         tr_year,tr_month,
                          tr_dates,
                          tr_add_ser,
                          tr_dataset.X,tr_dataset.Y,
                          tr_dataset.rot30_X,tr_dataset.rot30_Y,
                          tr_dataset.rot45_X,tr_dataset.rot45_Y,
                          tr_dataset.rot60_X,tr_dataset.rot60_Y,
                          tr_dataset.radial_r,pdist,days,
                          tr_month_mapped],axis=1)
        lbe     =  preprocessing.LabelEncoder()
        corelationmatrix = abc.apply(lbe.fit_transform).corr()
        fig, ax = mt.subplots(figsize=(20, 10))
        colormap = sns.diverging_palette(220, 10, as_cmap=True)
        #Generate Heat Map, allow annotations and place floats in map
        sns.heatmap(corelationmatrix, cmap=colormap, annot=True, fmt=".2f")
        mt.show()
        
    def mostCrimeLoc(self):
        top_addresses = self.train.Address.value_counts()[:20]
        mt.figure(figsize=(12, 8))

        pos = np.arange(len(top_addresses))
        mt.barh(pos, top_addresses.values)
        mt.yticks(pos, top_addresses.index);
    
    def crimeAndLocation(self):
        top_crimes = self.train.Category.value_counts()[:10]
        top_addresses = self.train.Address.value_counts()[:10]
        subset = self.train[self.train.Address.isin(top_addresses.index) & self.train.Category.isin(top_crimes.index)]
        addr_cross_cat = pd.crosstab(subset.Address, subset.Category)
        mt.figure(figsize=(10, 10))
        sns.heatmap(addr_cross_cat, linewidths=.5);

In [8]:
if __name__ == "__main__":
    
    train_data_name = 'train.csv'
    test_data_name  = 'test.csv'
#=====================Visualisation=================================#    
#     visual = SFOVisualisation(train_data_name)
#     visual.categoryDistribution()
#     visual.geoSpatialScatterPlot()
#     visual.pdDistrictVsCrime()
#     visual.hourVsCrime()
#     visual.modifiedHourVsCrime()
#     visual.dayofWeekVsCrime()
#     visual.monthVsCrime()
#     visual.corelationMatrix()
#     visual.mostCrimeLoc()
#     visual.crimeAndLocation()
#=====================Different Models=================================#    
#     model = SFOCrimeClass1(train_data_name,test_data_name)
#     model.data()
#     model.trainLogisticRegression()
#     model.testLogisticRegression()

#     model.trainRandomForest()
#     model.testRandomForest()
    
#     model.trainLGBM()
#     model.testLGBM()

#     model.trainXGB()
#     model.testXGB()
    model2 = SFOCrimeClass2(train_data_name,test_data_name)
    model2.data()
    model2.trainKNN()
    model2.testKNN()
    model2.trainNaiveBayes()
    model2.testNaiveBayes()

KNN logloss 26.874481131175887
Gaussian Naive Bayes logloss 34.455385273325696
Bernoulli Naive Bayes logloss 2.4912385307931935
Bernoulli and Gaussian mix Naive Bayes logloss 3.0829764960021624


In [5]:
def testUsingPickle(testfile):
        te_dataset = pd.read_csv(testFile,parse_dates=['Dates'])
     
        #Datetime parsering from test data
        te_hour  = te_dataset.Dates.dt.hour
        te_year  = te_dataset.Dates.dt.year
        
        #Applying PCA on X and Y cordinates from test
        xy_scaler_te = preprocessing.StandardScaler()
        xy_scaler_te.fit(te_dataset[["X","Y"]])
        te_dataset[["X","Y"]] = xy_scaler_te.transform(te_dataset[["X","Y"]])
        pca_te =PCA(2)
        te_dataset.loc[:,["X","Y"]] = pca_te.fit_transform(pd.DataFrame(te_dataset.loc[:,["X","Y"]]))
        te_dataset["rot30_X"]  = (1.732/2)* te_dataset["X"] + (1./2)* te_dataset["Y"] 
        te_dataset["rot30_Y"]  = (1.732/2)* te_dataset["Y"] - (1./2)* te_dataset["X"]
        te_dataset["rot45_X"]  = .707* te_dataset["Y"] + .707* te_dataset["X"] 
        te_dataset["rot45_Y"]  = .707* te_dataset["Y"] - .707* te_dataset["X"]
        te_dataset["rot60_X"]  = (1./2)* te_dataset["X"] + (1.732/2)* te_dataset["Y"] 
        te_dataset["rot60_Y"]  = (1./2)* te_dataset["Y"] - (1.732/2)* te_dataset["X"]
        te_dataset["radial_r"] = np.sqrt( np.power(te_dataset["Y"],2) + np.power(te_dataset["X"],2) )
        #Extracting whther crime happening at juntion or in block
        te_add           = te_dataset["Address"]
        te_add_ser       = te_add.str.contains('.?of.?')
        #Onehot Encoding categorical features
        te_hour_ser      = pd.get_dummies(te_hour,prefix = "H")
        te_year_ser      = pd.get_dummies(te_year)
        te_days_ser      = pd.get_dummies(te_dataset.DayOfWeek)
        te_pdistrict_ser = pd.get_dummies(te_dataset.PdDistrict)
        
        testset          = pd.concat([te_hour_ser,
                                      te_year_ser,
                                      te_days_ser,
                                      te_pdistrict_ser,
                                      te_add_ser,
                                      te_dataset.X,te_dataset.Y,
                                      te_dataset.rot30_X,te_dataset.rot30_Y,
                                      te_dataset.rot45_X,te_dataset.rot45_Y,
                                      te_dataset.rot60_X,te_dataset.rot60_Y,
                                      te_dataset.radial_r],axis=1)
        
        testset_ID = te_dataset['Id']
        filename = 'Modelfile.sav'
        loaded_model = pickle.load(open(filename, 'rb'))
        cat_probabilities = loaded_model.predict_proba(testset)
        print(cat_probabilities)