# Crime Attribute Extraction App 

## Part 1

Loading Dataset

In [10]:
from CoNLL2Spacy import *

In [18]:
# Test file: one "token tag" pair per line.
# The context manager closes the handle once reading is done, and rstrip("\n")
# removes only the line terminator, so a final line without a trailing newline
# keeps its last character (line[:-1] would have chopped it off).
with open("data/Crimetest.txt", "r", encoding="utf-8") as file:
    valList = [line.rstrip("\n") for line in file]
valList[:15]

['French B-gpe',
 'government O',
 'has O',
 'condemned O',
 'remarks O',
 'by O',
 'a O',
 'top O',
 'Vatican B-geo',
 'official O',
 'linking O',
 'the O',
 'pedophile O',
 'scandal O',
 'in O']

In [12]:
# Convert the "token tag" lines into the (text, {"entities": [...]}) pairs that
# train_spacy and evaluate iterate over.
# NOTE(review): conll2spacy presumably comes from the star-import of CoNLL2Spacy — confirm.
TEST_DATA = conll2spacy(valList)

In [14]:
# Train file: one "token tag" pair per line.
# The context manager closes the handle once reading is done, and rstrip("\n")
# removes only the line terminator, so a final line without a trailing newline
# keeps its last character (line[:-1] would have chopped it off).
with open("data/Crimetrain.txt", "r", encoding="utf-8") as file:
    trainList = [line.rstrip("\n") for line in file]
trainList[:15]

['Thousands O',
 'of O',
 'demonstrators O',
 'have O',
 'marched O',
 'through O',
 'London B-geo',
 'to O',
 'protest O',
 'the O',
 'war O',
 'in O',
 'Iraq B-geo',
 'and O',
 'demand O']

In [15]:
# Convert the "token tag" lines into the (text, {"entities": [...]}) pairs that
# train_spacy iterates over.
# NOTE(review): conll2spacy presumably comes from the star-import of CoNLL2Spacy — confirm.
TRAIN_DATA = conll2spacy(trainList)

In [17]:
import warnings
# NOTE: this silences every warning category for the rest of the session, which
# also hides deprecation notices; narrow the filter (category=..., module=...)
# when debugging.
warnings.filterwarnings('ignore')

In [5]:
import spacy
import numpy
import pickle


In [19]:
# Build a blank English pipeline and fill its vocabulary with the pretrained
# word vectors stored in a text-format .vec file.
lang = "en"
vectors_loc = "wiki-news-300d-1M.vec"
nlp = spacy.blank(lang)
with open(vectors_loc, "rb") as vec_file:
    # The header line holds two whitespace-separated fields: row count and vector width.
    nr_row, nr_dim = vec_file.readline().split()
    nlp.vocab.reset_vectors(width=int(nr_dim))
    for raw_line in vec_file:
        # Split from the right so that only the trailing fields are treated as
        # numbers; whatever remains on the left is the word itself.
        word, *values = raw_line.rstrip().decode("utf8").rsplit(" ", int(nr_dim))
        # add the vector to the vocab
        nlp.vocab.set_vector(word, numpy.asarray(list(map(float, values)), dtype="f"))

In [20]:
# Sanity check for the loaded vectors: compare the first and third tokens of a
# sample sentence.
text = "I lost my wallet in metro yesterday"
doc = nlp(text)
first_token, third_token = doc[0], doc[2]
token_similarity = first_token.similarity(third_token)
print("similarity btw", first_token, "and", third_token, " :-", token_similarity)

similarity btw I and my  :- 0.5550758


## Part 2

Steps involved:
1. Loading the required libraries
2. Setting up the parameters for training
3. Training and saving the best model

In [39]:
import spacy
import random # random function for to remove bais in Traning Data

# for batch parsing 
from spacy.util import minibatch, compounding


# For evaluating the model on the test set
from spacy.gold import GoldParse
from spacy.scorer import Scorer


In [40]:
def _blank_model_with_vectors(lang, vectors_loc):
    """Return a blank spaCy pipeline whose vocab holds the vectors read from `vectors_loc`.

    The file is read as text-format vectors: a header line with two
    whitespace-separated fields (row count, vector width) followed by one
    `word v1 ... vN` line per entry.
    """
    model = spacy.blank(lang)
    with open(vectors_loc, "rb") as file_:
        nr_row, nr_dim = file_.readline().split()
        width = int(nr_dim)  # parsed once instead of once per line
        model.vocab.reset_vectors(width=width)
        for line in file_:
            line = line.rstrip().decode("utf8")
            # Split from the right: the trailing `width` fields are the numbers,
            # everything before them is the word.
            pieces = line.rsplit(" ", width)
            vector = numpy.asarray([float(v) for v in pieces[1:]], dtype="f")
            model.vocab.set_vector(pieces[0], vector)  # add the vector to the vocab
    return model


def _previous_best_f1(modelName, TEST_DATA):
    """Return the `ents_f` score of the model already saved as `modelName`, or 0.0.

    Stays best-effort like the inline code it replaces (a missing or unusable
    previous model never aborts training), but no longer uses a bare `except`,
    so KeyboardInterrupt / SystemExit are not swallowed.
    """
    try:
        previous_model = spacy.load(modelName)
        return evaluate(previous_model, TEST_DATA)["ents_f"]
    except OSError:
        # spacy.load reports a model it cannot find with an OSError.
        print("Previous Model not found")
    except Exception as err:
        print("Previous Model could not be evaluated:", err)
    return 0.0


def train_spacy(TRAIN_DATA,TEST_DATA,iterations,droprate = 0.5,modelName = "modelTrained"):
    """Train a blank English NER pipeline and keep the best-scoring version on disk.

    Parameters
    ----------
    TRAIN_DATA : iterable of (text, annotations) pairs, where
        `annotations['entities']` is a sequence of entries whose third item is
        the entity label. The caller's object is not modified; a private copy
        is shuffled on every iteration.
    TEST_DATA : pairs of the same shape, passed to `evaluate` after every
        iteration.
    iterations : number of passes over the training data.
    droprate : dropout rate handed to `Language.update`.
    modelName : path the best model is written to with `to_disk`. If a model
        can already be loaded from this path, its F1 score on TEST_DATA is the
        initial score to beat.

    Returns
    -------
    None. Progress is printed and the model is written to `modelName` whenever
    the entity F1 score on TEST_DATA improves.

    Note: TEST_DATA drives model selection here, so the scores printed for the
    saved model are not an unbiased estimate of performance on unseen data.
    """
    lang = "en"
    vectors_loc = "wiki-news-300d-1M.vec"
    modiner = _blank_model_with_vectors(lang, vectors_loc)

    if 'ner' not in modiner.pipe_names:
        ner = modiner.create_pipe('ner')
        modiner.add_pipe(ner, last=True)
    else:
        # Keeps `ner` bound even if the pipeline already ships the component.
        ner = modiner.get_pipe('ner')

    # Private copy: shuffling below must not reorder the caller's data.
    train_examples = list(TRAIN_DATA)

    # add labels that will be involved in training
    for _, annotations in train_examples:
        for ent in annotations.get('entities') or ():
            ner.add_label(ent[2])

    # Score to beat: the previously saved model if there is one, else 0.0.
    # Looked up once, before training, instead of on every iteration in which
    # the best score is still 0.0.
    f1score = _previous_best_f1(modelName, TEST_DATA)

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in modiner.pipe_names if pipe != 'ner']
    with modiner.disable_pipes(*other_pipes):  # only train NER
        optimizer = modiner.begin_training()

        for itn in range(iterations):
            print("Starting iteration " + str(itn))
            # Reshuffle every pass so batches differ between iterations.
            random.shuffle(train_examples)
            losses = {}

            # Mini-batches whose size compounds from 2 towards 16.
            batches = minibatch(train_examples, size=compounding(2.0, 16.0, 1.01))
            for batch in batches:
                texts, annotations = zip(*batch)
                modiner.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    sgd=optimizer,  # the optimizer created by begin_training
                    drop=droprate,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print(losses)

            # Evaluate the current model on the test data.
            results = evaluate(modiner, TEST_DATA)
            print("Current Score :-",results["ents_f"], "Precision  :-",results["ents_p"], "Recall  :-",results["ents_r"])

            print("Best Sccore :- ",f1score)
            print("------------------------------------")
            # Save only when this iteration beats the best score so far;
            # otherwise the model on disk is left untouched.
            if f1score < results["ents_f"]:
                f1score = results["ents_f"]
                modiner.to_disk(modelName)

    print("-----Best Model is Saved-----")
     


In [41]:
def evaluate(ner_model, examples):
    """Score `ner_model` against annotated `examples` and return the scorer's scores.

    `examples` yields (text, annotations) pairs; `annotations['entities']`
    holds the gold entities for that text. For each pair the gold parse is
    built on the model's own tokenization of the text and compared with the
    model's prediction for the same text.
    """
    scorer = Scorer()

    for text, annotation in examples:
        # Gold side: tokenize only, then attach a copy of the annotated entities.
        reference_doc = ner_model.make_doc(text)
        gold_entities = list(annotation.get('entities'))
        gold = GoldParse(reference_doc, entities=gold_entities)

        # Predicted side: run the full pipeline and accumulate the comparison.
        scorer.score(ner_model(text), gold)

    return scorer.scores


In [2]:
def loadNERModel(modelName = "modelTrained"):
    """Load the spaCy pipeline stored under `modelName` and return it."""
    return spacy.load(modelName)

In [3]:
def score(model,TEST_DATA):
    """Evaluate `model` on `TEST_DATA` and print its entity F1, precision and recall.

    Returns None; the three values are only printed.
    """
    metrics = evaluate(model, TEST_DATA)  # calling evaluate function
    # All three values are read before anything is printed.
    f1score, precision, recall = (metrics[key] for key in ("ents_f", "ents_p", "ents_r"))
    print("F1 score of Model is :-", f1score)
    print("Precision of Model is :-", precision)
    print("Recall of Model is :-", recall)

In [None]:
# Train the model for 50 iterations with a dropout rate of 0.55; train_spacy
# writes the model to disk as "CrimeNER" whenever the F1 score on TEST_DATA improves.


train_spacy(TRAIN_DATA,TEST_DATA, 50,droprate = 0.55, modelName = "CrimeNER")

In [6]:
# Load the model that train_spacy saved to disk under the name "CrimeNER"
pnlp = loadNERModel("CrimeNER")

In [46]:
# Print the F1 score, precision and recall of the loaded model on TEST_DATA
score(pnlp,TEST_DATA)

F1 score of Model is :- 86.47551444580847
Precision of Model is :- 87.0393931979447
Recall of Model is :- 85.91889476607976


In [8]:
from spacy import displacy


In [9]:
# Sample complaints used to eyeball the entities predicted by the trained model.
# One list plus a loop replaces the copy-pasted cells; the whitespace-only input
# is kept once (it was repeated in several otherwise identical cells).
SAMPLE_COMPLAINTS = [
    'he was killed due to harassment',
    'A woman was assault by two men near Shahdara',
    'My name is Pooja I was near Domino-pizza in Pinjore a man on bike snatched my purse',
    'My name is Amit I saw a woman killing a dog on 15th july near Kalujhanda',
    'A car hit me at Mansarovar Park at 5pm wraped',
    'Yesterday there was incident of theft near my house',
    'A man slapped a girl in the Sadar market',
    '  ',
]

for complaint in SAMPLE_COMPLAINTS:
    testcase = pnlp(complaint)
    # With jupyter=True displaCy displays the markup itself, so calling it
    # inside a loop still renders every sample.
    displacy.render(testcase, style='ent', jupyter=True)