In [1]:
#importing and loading all dependencies- functions, packages 

import pandas as pd 
import numpy as np 
import io 
import gensim, logging 

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
    level=logging.INFO) 

#Credits: getLOS function from co-intern Abhishek Patil

def getLOS( df, const_cols=None, only_cols=None, skip_cols=None ):
    # if there are columns to skip
    if skip_cols is not None:
        if set(skip_cols)<set(df.columns.tolist()):
            df = df.drop(skip_cols, axis=1)
            
    # if only specific columns are needed
    if only_cols is not None:
        df = df[only_cols]
    
    # if no constant columns specified then use one row as a sentence
    if const_cols is None:
        # define list of sentences
        listOfSentences = []
        
        # get all the rows of data frame as generator object
        rowsGen = df.iterrows()
        
        # keep adding to this list a sentence(i.e. a row) for every loop
        while True:
            nextRow = next(rowsGen, None)
            if nextRow == None:
                break
            listOfSentences.append([ str(elem) for elem in nextRow[1].values ])
            
    # else use the constant columns for each sentence
    else:
        # define list of sentences
        listOfSentences = [[]]
        
        # get column names from data frame
        dfColNames = list(df.columns)
        numDFCols = len(dfColNames)
        
        # number of constant columns
        numCCols = len(const_cols)
        
        # don't progress if const_cols doesn't
        #    1. have column names as in the df's column names
        #  & 2. have atleast 1 less column names than df's column names
        # condition 1 check
        if ( len( [elem for elem in const_cols if elem not in dfColNames] ) != 0 ):
            print( "Column names in const_cols parameter not found in provided data frame!" )
            return           
        # and condition 2 check
        if ( ( numDFCols-numCCols ) <= 1 ):
            print( "To make sentences, have atleast 1 column not in the const_cols parameter!" )
            return
        
        # get the column indices which are in df's column names but not in const_cols
        # need these indices to make the sentences
        colsLeft = np.setdiff1d(dfColNames, const_cols)        
        cols_left_ind = sorted([ dfColNames.index(elem) for elem in colsLeft ])
        
        # also get indices for const_cols
        const_cols_ind = sorted([ dfColNames.index(elem) for elem in const_cols ])
        
        # sort the data frame according to the const_cols
        # by default ascending is true
        df = df.sort_values(const_cols)
        
        # get all the rows of data frame as generator object
        rowsGen = df.iterrows()
        
        # iterator to go through the loop for indexing
        itr = 0
        
        # define the previous values of constant columns in a list using the first values
        prevColsVal = [ df[colName].values[0] for colName in const_cols ]
        
        # get a sentence for each different month of an year
        while True:
            nextRow = next(rowsGen, None)
            # break when end of data frame's rows
            if nextRow == None:
                break
            
            # get the row values
            rowValues = list( nextRow[1].values )
            
            # get the current values for columns in const_cols
            # basically index the values from the row we are currently in
            curColsVal = [rowValues[ind] for ind in const_cols_ind]
            
            # if previous and current list values are same add to existing sentence
            if (prevColsVal == curColsVal):
                listOfSentences[itr].extend( [ str(rowValues[ind]) for ind in cols_left_ind ] )
            # else add a new sentence            
            else:
                listOfSentences.append( [ str(rowValues[ind]) for ind in cols_left_ind ] )            
                # increment iterator
                itr = itr + 1
        
            # store current values as previous ones for the next iteration
            prevColsVal = curColsVal
            
    # return a dictionary object of the data frame and the list of sentences
    retDObj = { 'DF': df,
                'LOS': listOfSentences }
        
    return retDObj

In [2]:
#Function to split date/time into date and time in separate columns 

def TimeGenerator(Stamp):
    NewTime=''
    i=8
    if (Stamp[8]!=' '):
        while (i<len(Stamp)):
            NewTime=NewTime+Stamp[i]
            i+=1 
    
    if (Stamp[8]==' ' and Stamp[9]!=' '):
        i+=1
        while (i<len(Stamp)):
            NewTime=NewTime+Stamp[i]
            i+=1
    return NewTime 

print (TimeGenerator('12/22/10 9:30')) 
                
#Our time generator function successfully separates the time from the date/time stamp 

9:30


In [3]:
#Setting up the dataframe for Crime2Vec 

df=pd.read_csv("Desktop/SFtrain.csv", header=0, delimiter=",")  
df.drop(df.columns[[2, 5, 6,7, 8]], axis=1, inplace=True) 
#df['Category'].replace(['WARRANTS', 'OTHER OFFENSES', 'LARCENY/THEFT', 'VEHICLE THEFT', 'VANDALISM',
#'NON-CRIMINAL', 'ROBBERY', 'ASSAULT', 'WEAPON LAWS', 'BURGLARY',
#'SUSPICIOUS OCC', 'DRUNKENNESS', 'FORGERY/COUNTERFEITING', 'DRUG/NARCOTIC',
#'STOLEN PROPERTY', 'SECONDARY CODES', 'TRESPASS', 'MISSING PERSON', 'FRAUD',
#'KIDNAPPING', 'RUNAWAY', 'DRIVING UNDER THE INFLUENCE',
#'SEX OFFENSES FORCIBLE' ,'PROSTITUTION', 'DISORDERLY CONDUCT', 'ARSON',
#'FAMILY OFFENSES', 'LIQUOR LAWS' ,'BRIBERY', 'EMBEZZLEMENT', 'SUICIDE',
#'LOITERING' ,'SEX OFFENSES NON FORCIBLE', 'EXTORTION', 'GAMBLING',
#'BAD CHECKS', 'TREA', 'RECOVERED VEHICLE', 'PORNOGRAPHY/OBSCENE MAT'],
                                              
#['VIOLENT', 'NON-VIOLENT', 'NON-VIOLENT', 'NON-VIOLENT', 'NON-VIOLENT', 'NON-VIOLENT', 'VIOLENT', 'VIOLENT', 'VIOLENT', 'VIOLENT', 'NON-VIOLENT', 'NON-VIOLENT', 'NON-VIOLENT', 'NON-VIOLENT', 'NON-VIOLENT', 'NON-VIOLENT', 'NON-VIOLENT', 'VIOLENT', 'NON-VIOLENT', 'VIOLENT', 'NON-VIOLENT', 'NON-VIOLENT', 'VIOLENT', 'NON-VIOLENT', 'NON-VIOLENT', 'VIOLENT', 'NON-VIOLENT', 'NON-VIOLENT', 'NON-VIOLENT', 'NON-VIOLENT', 'VIOLENT', 'NON-VIOLENT', 'NON-VIOLENT', 'VIOLENT', 'NON-VIOLENT', 'NON-VIOLENT', 'NON-VIOLENT', 'NON-VIOLENT', 'NON-VIOLENT'], inplace= True)

Time=[] 

for i in range(len(df['Dates'])):
    Vals=TimeGenerator(df['Dates'][i]) 
    Time.append(Vals) 
    
newDF=pd.DataFrame({'Time': Time}) #List has been converted to a dataframe 
    
df['Time']=newDF.values
df.drop(df.columns[[0,2]], axis=1, inplace= True)
df.tail() 

Unnamed: 0,Category,PdDistrict,Time
878044,ROBBERY,TARAVAL,:15
878045,LARCENY/THEFT,INGLESIDE,:01
878046,LARCENY/THEFT,SOUTHERN,:01
878047,VANDALISM,SOUTHERN,:01
878048,FORGERY/COUNTERFEITING,BAYVIEW,:01


In [4]:
#Applying Crime2Vec to the San Francisco Kaggle data set 

train=df.sample(frac=0.8,random_state=200)
test=df.drop(train.index)

F=getLOS(train,const_cols=None, only_cols=None, skip_cols=None)  #This gives us the list of sentences to be fed into the word2vec model.   
TrainingSentences= F['LOS']   #storing the list of sentences value portion of the dictionary object returned by getLOS 
print (TrainingSentences[1]) 
print (test.head()) 

#TrainingSentences are now ready to be fed into word2vec model for training 

['VANDALISM', 'BAYVIEW', '8:30']
          Category PdDistrict   Time
0         WARRANTS   NORTHERN  23:53
2   OTHER OFFENSES   NORTHERN  23:33
8    LARCENY/THEFT   RICHMOND  23:00
9    LARCENY/THEFT    CENTRAL  23:00
13   LARCENY/THEFT   NORTHERN  22:06


In [5]:
#Train Crime2Vec 

from gensim.models import Word2Vec

model = gensim.models.Word2Vec(min_count=5, window=500, size=300, workers=3, sg=1, negative=5)       # an empty model, no training
model.build_vocab(TrainingSentences)  #1-pass step to build vocab
print ("Vocabulary successfully built!")  
model.train(TrainingSentences, total_examples=model.corpus_count,epochs=model.iter) #1 pass step to train model

#The model's vocabulary has been built and the model has now been trained on the corpus as well 

2017-09-12 13:07:10,206 : INFO : collecting all words and their counts
2017-09-12 13:07:10,208 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-09-12 13:07:10,226 : INFO : PROGRESS: at sentence #10000, processed 30000 words, keeping 1373 word types
2017-09-12 13:07:10,239 : INFO : PROGRESS: at sentence #20000, processed 60000 words, keeping 1512 word types
2017-09-12 13:07:10,258 : INFO : PROGRESS: at sentence #30000, processed 90000 words, keeping 1537 word types
2017-09-12 13:07:10,281 : INFO : PROGRESS: at sentence #40000, processed 120000 words, keeping 1546 word types
2017-09-12 13:07:10,317 : INFO : PROGRESS: at sentence #50000, processed 150000 words, keeping 1546 word types
2017-09-12 13:07:10,352 : INFO : PROGRESS: at sentence #60000, processed 180000 words, keeping 1546 word types
2017-09-12 13:07:10,389 : INFO : PROGRESS: at sentence #70000, processed 210000 words, keeping 1547 word types
2017-09-12 13:07:10,423 : INFO : PROGRESS: at sentence #

Vocabulary successfully built!


2017-09-12 13:07:13,007 : INFO : PROGRESS: at 8.73% examples, 427799 words/s, in_qsize 4, out_qsize 1
2017-09-12 13:07:14,011 : INFO : PROGRESS: at 17.46% examples, 428800 words/s, in_qsize 4, out_qsize 2
2017-09-12 13:07:15,014 : INFO : PROGRESS: at 26.19% examples, 429147 words/s, in_qsize 5, out_qsize 0
2017-09-12 13:07:16,021 : INFO : PROGRESS: at 33.97% examples, 417405 words/s, in_qsize 6, out_qsize 0
2017-09-12 13:07:17,056 : INFO : PROGRESS: at 41.76% examples, 407833 words/s, in_qsize 6, out_qsize 0
2017-09-12 13:07:18,061 : INFO : PROGRESS: at 48.49% examples, 395153 words/s, in_qsize 4, out_qsize 1
2017-09-12 13:07:19,068 : INFO : PROGRESS: at 57.03% examples, 398568 words/s, in_qsize 6, out_qsize 0
2017-09-12 13:07:20,081 : INFO : PROGRESS: at 65.67% examples, 401442 words/s, in_qsize 4, out_qsize 1
2017-09-12 13:07:21,097 : INFO : PROGRESS: at 74.02% examples, 402043 words/s, in_qsize 4, out_qsize 2
2017-09-12 13:07:22,105 : INFO : PROGRESS: at 82.28% examples, 402260 word

4940792

In [6]:
tester=test.copy() 
tester.drop(tester.columns[[0]], axis=1, inplace=True) 

G=getLOS(tester,const_cols=None, only_cols=None, skip_cols=None)  #This gives us the list of sentences to be fed into the word2vec model.   
TestSentences=G['LOS'] 
print (TestSentences[1])  

#TestingSentences are now ready to be fed into the accuracy tester for testing 

['NORTHERN', '23:33']


In [7]:
#Accuracy testing using predict_output_word- Gensim 

Counter=0 
Score=0
for i in range(0,len(TestSentences)):
    prediction=model.predict_output_word(TestSentences[i])
    Counter=Counter+1  
    if i==1:
        print (prediction) 
    if i==1000:
        print (prediction) 
    if i==20000: 
        print (prediction)  
    for j in range(len(prediction)):
        P=prediction[j]
        if (P[0]==(test['Category'].iloc[i])): 
            Score=Score+1
            if i==1: 
                print (test['Category'].iloc[i])   
print (Score) 
print (Counter) 
SampleSize=len(test['Category'])
Accuracy= (Score/SampleSize)*100 #Accuracy score is generated by dividing the number of matches by the number of cases 
Accuracy= "{0:.5f}".format(Accuracy)
Output = "The accuracy is " + Accuracy + "%." 
print (Output)            

[('PROSTITUTION', 0.0079189623), ('STOLEN PROPERTY', 0.0072257756), ('DRUNKENNESS', 0.0068500256), ('TRESPASS', 0.0068101063), ('DISORDERLY CONDUCT', 0.00622604), ('DRIVING UNDER THE INFLUENCE', 0.0055827266), ('WEAPON LAWS', 0.0052899709), ('LIQUOR LAWS', 0.0050747921), ('LOITERING', 0.0047253072), ('RECOVERED VEHICLE', 0.0045913667)]
[('RUNAWAY', 0.018371724), ('SEX OFFENSES FORCIBLE', 0.016582772), ('VEHICLE THEFT', 0.014349722), ('FRAUD', 0.013928068), ('MISSING PERSON', 0.013924303), ('BURGLARY', 0.01377563), ('SECONDARY CODES', 0.013284528), ('VANDALISM', 0.012490584), ('FORGERY/COUNTERFEITING', 0.012105169), ('KIDNAPPING', 0.01149694)]
[('FRAUD', 0.020855563), ('FORGERY/COUNTERFEITING', 0.019319382), ('EMBEZZLEMENT', 0.018094499), ('SEX OFFENSES FORCIBLE', 0.015223335), ('TRESPASS', 0.013438686), ('BAD CHECKS', 0.01330398), ('BURGLARY', 0.012498548), ('SECONDARY CODES', 0.011144218), ('STOLEN PROPERTY', 0.011113664), ('KIDNAPPING', 0.010372542)]
34904
175610
The accuracy is 19.8

In [None]:
#Accuracy testing using model.score()  

model.score(['VIOLENT NORTHERN 23:53'.split()]) 


In [58]:
model.score(['VIOLENT CENTRAL 23:00'.split()]) 

2017-09-10 13:03:56,374 : INFO : scoring sentences with 3 workers on 1512 vocabulary and 300 features, using sg=1 hs=1 sample=0.001 and negative=5
2017-09-10 13:03:56,376 : INFO : reached end of input; waiting to finish 1 outstanding jobs
2017-09-10 13:03:56,378 : INFO : scoring 1 sentences took 0.0s, 386 sentences/s


array([-23.42593193], dtype=float32)

In [59]:
model.score(['CRIME IS BAD'.split()]) 

2017-09-10 13:42:49,010 : INFO : scoring sentences with 3 workers on 1512 vocabulary and 300 features, using sg=1 hs=1 sample=0.001 and negative=5
2017-09-10 13:42:49,013 : INFO : reached end of input; waiting to finish 1 outstanding jobs
2017-09-10 13:42:49,014 : INFO : scoring 1 sentences took 0.0s, 471 sentences/s


array([ 0.], dtype=float32)

In [9]:
#Cosine similarity checks

print (model.similarity('LARCENY/THEFT', 'KIDNAPPING')) 
print (model.similarity('FRAUD', 'FRAUD')) 
print (model.similarity('ARSON', 'FRAUD')) 
print (model.similarity('EMBEZZLEMENT', 'FRAUD')) 

0.7964422806
1.0
0.42551629363
0.94444549444
