In [1]:
import random
import string
import pandas as pd
import numpy as np

In [85]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

In [74]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

## Loading the data

In [2]:
# Load the annotated incident data from a CSV in the working directory.
df = pd.read_csv('nlp_annotation.csv')

In [3]:
# Preview the first rows of the raw frame (Incident, Summary, label columns).
df.head()

Unnamed: 0,Incident,Summary,label
0,ALCOHOL Alcohol/Drug Overdose,A student was transported to a local hospital ...,3
1,DISTURBANCE Disturbing The Peace,A staff member reported that a security guard ...,1
2,PROPERTY Damaged Property,A non-USC male reported that the roof of his t...,1
3,THEFT-PETTY Theft Petty-Plain,A suspect removed the wheels from a bicycle.,2
4,TRAFFIC Traffic Collision Without Injuries,A non-USC male drove a cart into a light post ...,2


In [5]:
# Drop the 'Incident' category column; only 'Summary' and 'label' are used below.
# errors='ignore' keeps this cell idempotent: re-running it after the column is
# already gone is a no-op instead of raising KeyError.
df = df.drop(['Incident'], axis=1, errors='ignore')

In [6]:
# Preview the frame again to confirm that only Summary and label remain.
df.head()

Unnamed: 0,Summary,label
0,A student was transported to a local hospital ...,3
1,A staff member reported that a security guard ...,1
2,A non-USC male reported that the roof of his t...,1
3,A suspect removed the wheels from a bicycle.,2
4,A non-USC male drove a cart into a light post ...,2


In [7]:
# Working subset: the first 1000 rows (by position) of the frame, all columns.
df_1K = df.head(1000)

In [8]:
# Compare the dimensions of the full frame and the 1K subset.
# print() is called as a function so the cell runs on Python 3 as well as
# Python 2 (a single parenthesised argument prints identically on both).
print(np.shape(df))
print(np.shape(df_1K))

(7316, 2)
(1000, 2)


In [10]:
# Class distribution of the labels in the full frame and in the 1K subset.
# print() is called as a function so the cell runs on Python 3 as well as
# Python 2 (a single parenthesised argument prints identically on both).
print(df['label'].value_counts())
print(df_1K['label'].value_counts())

2    4670
3    1296
1     916
4     339
5      95
Name: label, dtype: int64
2    652
3    154
1    141
4     45
5      8
Name: label, dtype: int64


In [20]:
# Row counts before any missing-value handling.
# print() is called as a function so the cell runs on Python 3 as well as
# Python 2 (a single parenthesised argument prints identically on both).
print(len(df))
print(len(df_1K))

7316

In [21]:
# Remove every row of the full frame that contains a missing value,
# then display how many rows are left.
df = df.dropna()
len(df)

1000

In [22]:
# Same clean-up for the 1K subset: drop rows with missing values and
# display the remaining row count.
df_1K = df_1K.dropna()
len(df_1K)

### BOW - Count Vectorization

In [38]:
# Bag-of-words vectorizer, constructed with its default settings.
count_vect = CountVectorizer()

In [39]:
# Fit the vectorizer on the 1K subset's Summary text and turn it into a
# document-term count matrix in one call.
X_train_counts = count_vect.fit_transform(df_1K['Summary'])

In [40]:
# (number of documents, vocabulary size) of the count matrix.
X_train_counts.shape

(997, 1428)

In [41]:
# Look up the entry the fitted vocabulary holds for the token 'danger';
# .get() yields None if the token was never seen.
count_vect.vocabulary_.get(u'danger')

336

### TF-IDF

In [42]:
# Transformer with IDF weighting switched off (use_idf=False), fitted on the
# raw count matrix.
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)

In [43]:
# Apply the fitted transformer to the counts to get the term-frequency features.
X_train_tf = tf_transformer.transform(X_train_counts)

In [44]:
# Dimensions of the transformed matrix, for comparison with the count matrix.
X_train_tf.shape

(997, 1428)

In [45]:
# Inspect the container type of the transformed features.
type(X_train_tf)

scipy.sparse.csr.csr_matrix

In [46]:
# Show the first document's feature row.
X_train_tf[0]

<1x1428 sparse matrix of type '<type 'numpy.float64'>'
	with 13 stored elements in Compressed Sparse Row format>

In [None]:
# Fit_Transform

In [47]:
# TF-IDF with default settings: fit on the count matrix and transform it in a
# single fit_transform call, then display the resulting matrix dimensions.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(997, 1428)

### Classification

In [49]:
# Train a multinomial Naive Bayes classifier on the TF-IDF features of the
# 1K subset, using the 'label' column as the target.
clf = MultinomialNB().fit(X_train_tfidf, df_1K['label'])

In [66]:
# Unseen incident summary used to sanity-check the fitted classifier.
docs_new = [
    'A suspect smashed the rear window to gain entry and ransacked the interior, but did not remove any property.',
]
# Alternative sentences tried during exploration; swap one into docs_new to test it:
#   'A student reported her purse missing.'
#   'He was found dead in a bomb and the police found him near the tunnel.'
#   'Sithara stole a pencil from a friend.'
#   'A suspect removed a department issued uniform, a pair of shoes and a knife.'

In [67]:
# Push the new documents through the already-fitted steps: transform only
# (no refit), so they are encoded with the training vocabulary and weights.
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
# Predict a label for each new document.
predicted = clf.predict(X_new_tfidf)

In [68]:
# Show each new document next to the label predicted for it.
# zip yields (document, label) tuples, which feed the two placeholders directly.
for pair in zip(docs_new, predicted):
    print('%r => %s' % pair)

'A suspect smashed the rear window to gain entry and ransacked the interior, but did not remove any property.' => 2


### Pipeline

In [70]:
# Chain vectorizer -> TF-IDF -> Naive Bayes into a single estimator so raw text
# can be passed straight to fit/predict. The step names ('vect', 'tfidf', 'clf')
# are the prefixes used later when addressing step parameters.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [72]:
# Train the whole pipeline on the raw Summary text and labels of the 1K subset.
text_clf.fit(df_1K['Summary'], df_1K['label'])

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        st...False,
         use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [73]:
# Test
# Evaluation slice: positional rows 1000-1999 of `df` (which an earlier cell
# already passed through dropna), cleaned again defensively.
df_1K_test = df.iloc[1000:2000,:]
df_1K_test = df_1K_test.dropna()
# print() is called as a function so the cell runs on Python 3 as well as
# Python 2 (a single parenthesised argument prints identically on both).
print(df_1K_test['label'].value_counts())

2    662
3    145
1    137
4     50
5      6
Name: label, dtype: int64


In [None]:
# Multinomial NB

In [75]:
# Score the current pipeline on the test slice.
docs_test = df_1K_test['Summary']
predicted = text_clf.predict(docs_test)
# Accuracy: fraction of test rows whose predicted label equals the true label.
np.mean(predicted == df_1K_test['label']) 

0.74

In [None]:
# SVM

In [76]:
# Rebind text_clf to a pipeline with the same 'vect' and 'tfidf' steps but an
# SGD-trained classifier with hinge loss in the 'clf' slot. This replaces the
# Naive Bayes pipeline previously held in text_clf.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        random_state=42,  # fixed seed for repeatable runs
        max_iter=5,
        tol=None,
    )),
])

In [77]:
# Train the SGD pipeline on the 1K subset, then score it on the same test
# documents (docs_test) used for the Naive Bayes pipeline.
text_clf.fit(df_1K['Summary'], df_1K['label'])
print('Prediction')
predicted = text_clf.predict(docs_test)
# Accuracy: fraction of test rows whose predicted label equals the true label.
np.mean(predicted == df_1K_test['label'])

Prediction


0.794

In [None]:
# Results

In [83]:
# Per-class precision / recall / F1 for the most recent predictions on the test slice.
# print() is called as a function so the cell runs on Python 3 as well as
# Python 2 (a single parenthesised argument prints identically on both).
print(metrics.classification_report(df_1K_test['label'], predicted))

             precision    recall  f1-score   support

          1       0.76      0.66      0.71       137
          2       0.81      0.94      0.87       662
          3       0.74      0.48      0.58       145
          4       0.36      0.10      0.16        50
          5       1.00      0.67      0.80         6

avg / total       0.77      0.79      0.77      1000



In [84]:
# Confusion matrix for the same predictions; true labels are passed first,
# predictions second.
metrics.confusion_matrix(df_1K_test['label'], predicted)

array([[ 91,  39,   5,   2,   0],
       [ 21, 625,  12,   4,   0],
       [  5,  68,  69,   3,   0],
       [  2,  37,   6,   5,   0],
       [  0,   1,   1,   0,   4]], dtype=int64)

### Parameter Tuning

In [86]:
# Search grid for the pipeline. Each key is '<step name>__<parameter>':
#   vect  -> unigrams only vs. unigrams + bigrams
#   tfidf -> with / without IDF weighting
#   clf   -> two candidate alpha values
parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2)],
    tfidf__use_idf=(True, False),
    clf__alpha=(1e-2, 1e-3),
)

In [87]:
# Grid search over `parameters` for the current text_clf pipeline with cv=5;
# n_jobs=-1 requests parallel execution.
# NOTE(review): `iid` appears to be deprecated and later removed in newer
# scikit-learn releases -- confirm against the installed version before upgrading.
gs_clf = GridSearchCV(text_clf, parameters, cv=5, iid=False, n_jobs=-1)

In [88]:
# Run the search on the full frame `df`, not just the 1K subset.
# NOTE(review): `df` also contains the rows sliced out as df_1K_test, so that
# slice is not held-out data for the model produced by this search.
gs_clf = gs_clf.fit(df['Summary'], df['label'])

In [89]:
# Predict the label of a single example sentence with the fitted search object.
gs_clf.predict(['A suspect smashed the rear window to gain entry and ransacked the interior, but did not remove any property.'])

array([2], dtype=int64)

In [90]:
# Best score found by the grid search.
gs_clf.best_score_

0.8138215534208525

In [91]:
# Report the winning value for every tuned parameter, in alphabetical order.
# Iterating a dict yields its keys, so sorting the dict sorts the parameter names.
for param_name in sorted(parameters):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

clf__alpha: 0.001
tfidf__use_idf: True
vect__ngram_range: (1, 1)


In [93]:
#gs_clf.cv_results_