In [1]:
import pandas as pd
import numpy as np
%matplotlib inline

In [2]:
import nltk

In [3]:
# Load the V-code table from a CSV in the working directory; later cells read
# its 'description' column.
vcodes_df = pd.read_csv("vcodes.csv")

In [4]:
# Preview the first rows of the freshly loaded frame.
vcodes_df.head()

Unnamed: 0,code,description
0,V010,Pedestrian injured in collision with pedal cyc...
1,V011,Pedestrian injured in collision with pedal cyc...
2,V019,Pedestrian injured in collision with pedal cyc...
3,V020,Pedestrian injured in collision with two- or t...
4,V021,Pedestrian injured in collision with two- or t...


In [5]:
# Sanity-check the tokenizer on the first description before applying it to
# the whole column. `.ix` was deprecated in pandas 0.20 and removed in 1.0, so
# the cell is addressed explicitly by row label and column name instead.
nltk.word_tokenize(vcodes_df.loc[0, 'description'])

['Pedestrian',
 'injured',
 'in',
 'collision',
 'with',
 'pedal',
 'cycle',
 ':',
 'Nontraffic',
 'accident']

In [6]:
# Tokenize every description; each row of 'tokens' holds that row's token list.
descriptions = vcodes_df['description']
vcodes_df['tokens'] = descriptions.apply(nltk.word_tokenize)

In [7]:
# Preview the frame again to inspect the new 'tokens' column.
vcodes_df.head()

Unnamed: 0,code,description,tokens
0,V010,Pedestrian injured in collision with pedal cyc...,"[Pedestrian, injured, in, collision, with, ped..."
1,V011,Pedestrian injured in collision with pedal cyc...,"[Pedestrian, injured, in, collision, with, ped..."
2,V019,Pedestrian injured in collision with pedal cyc...,"[Pedestrian, injured, in, collision, with, ped..."
3,V020,Pedestrian injured in collision with two- or t...,"[Pedestrian, injured, in, collision, with, two..."
4,V021,Pedestrian injured in collision with two- or t...,"[Pedestrian, injured, in, collision, with, two..."


In [8]:
# Part-of-speech tag each token list; every row of 'pos_tags' becomes a list
# of (token, tag) pairs.
token_lists = vcodes_df['tokens']
vcodes_df['pos_tags'] = token_lists.apply(nltk.tag.pos_tag)

In [9]:
# Preview the frame again to inspect the new 'pos_tags' column.
vcodes_df.head()

Unnamed: 0,code,description,tokens,pos_tags
0,V010,Pedestrian injured in collision with pedal cyc...,"[Pedestrian, injured, in, collision, with, ped...","[(Pedestrian, JJ), (injured, VBN), (in, IN), (..."
1,V011,Pedestrian injured in collision with pedal cyc...,"[Pedestrian, injured, in, collision, with, ped...","[(Pedestrian, JJ), (injured, VBN), (in, IN), (..."
2,V019,Pedestrian injured in collision with pedal cyc...,"[Pedestrian, injured, in, collision, with, ped...","[(Pedestrian, JJ), (injured, VBN), (in, IN), (..."
3,V020,Pedestrian injured in collision with two- or t...,"[Pedestrian, injured, in, collision, with, two...","[(Pedestrian, JJ), (injured, VBN), (in, IN), (..."
4,V021,Pedestrian injured in collision with two- or t...,"[Pedestrian, injured, in, collision, with, two...","[(Pedestrian, JJ), (injured, VBN), (in, IN), (..."


In [10]:
# Flatten the per-description (token, tag) lists into one long table with a
# row per token occurrence. Tokens are lower-cased so that later counts do not
# distinguish capitalisation.
tagged_pairs = [
    (token.lower(), tag)
    for tagged_description in vcodes_df['pos_tags']
    for token, tag in tagged_description
]
pos_tag_dic = {
    'word': [token for token, _ in tagged_pairs],
    'pos_tag': [tag for _, tag in tagged_pairs],
}
words_df = pd.DataFrame(pos_tag_dic)

In [11]:
# Preview the flattened word / tag table.
words_df.head()

Unnamed: 0,pos_tag,word
0,JJ,pedestrian
1,VBN,injured
2,IN,in
3,NN,collision
4,IN,with


In [12]:
# Count how often each (word, tag) pair occurs, most frequent first.
# `Series.order` was removed from pandas (0.20); `sort_values` is its
# replacement and takes the same `ascending` flag.
words_df.groupby(['word', 'pos_tag']).size().sort_values(ascending=False)

word                 pos_tag
in                   IN         1217
injured              VBD         870
accident             NN          676
vehicle              NN          621
or                   CC          600
:                    :           599
collision            NN          568
with                 IN          548
of                   IN          453
injured              VBN         373
nontraffic           JJ          292
traffic              JJ          292
occupant             NNP         271
transport            NN          258
motor                NN          235
person               NNP         194
unspecified          JJ          185
other                JJ          179
car                  NN          177
three-wheeled        JJ          171
heavy                JJ          171
van                  NN          169
pick-up              JJ          169
truck                NN          169
occupant             VBD         168
railway              NN          156
passenger

In [13]:
# Keyword candidates: tokens tagged JJ, NN or NNP, counted per (word, tag)
# pair and sorted by descending frequency.
# `Series.order` no longer exists in pandas (removed in 0.20), so
# `sort_values` is used.
keywords = (
    words_df
    .query('pos_tag in ("JJ", "NN", "NNP")')
    .groupby(['word', 'pos_tag'])
    .size()
    .sort_values(ascending=False)
)

In [14]:
# Everything that is NOT tagged JJ, NN or NNP, counted per (word, tag) pair
# and sorted by descending frequency.
# `Series.order` no longer exists in pandas (removed in 0.20), so
# `sort_values` is used.
# NOTE(review): the tagger is not consistent for these short descriptions
# (e.g. "occupant" shows up as both NNP and VBD in the counts), so some
# content words end up here rather than in the JJ/NN/NNP set.
nonkeywords = (
    words_df
    .query('pos_tag not in ("JJ", "NN", "NNP")')
    .groupby(['word', 'pos_tag'])
    .size()
    .sort_values(ascending=False)
)

In [15]:
# Write the keyword counts to keywords.csv in the working directory, with a
# header row.
keywords.to_csv("keywords.csv", header=True)

In [16]:
# Write the non-keyword counts to nonkeywords.csv in the working directory,
# with a header row.
nonkeywords.to_csv("nonkeywords.csv", header=True)