In [4]:
import pickle
import pandas as pd
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt
from nltk.tokenize import RegexpTokenizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

tokenizer = RegexpTokenizer(r'[a-zA-Z0-9]+').tokenize
model_cols = ['food', 'shelter', 'water']

In [5]:
train = pd.read_csv("./preprocessed/training.csv")
test = pd.read_csv("./preprocessed/test.csv")
validation = pd.read_csv("./preprocessed/validation.csv")

In [6]:
# Using TFIDF multi stemmed model
vectorizer = TfidfVectorizer(lowercase=True, strip_accents='unicode', ngram_range = (1,2), tokenizer = tokenizer, max_features=100000)
features = vectorizer.fit_transform(train['message_stem'])
print(features.shape)

(21042, 100000)


Use only features where request = 1

In [7]:
indexes = np.where(train.request == 1)[0].tolist()
stripped = features[indexes, :]

In [8]:
print(stripped.shape)

(3560, 100000)


Request classifier

In [9]:
request_classifier = SVC(C=3.0, kernel='linear')
clf = request_classifier.fit(features, train.request)

val_x = validation.message_stem
val_y = validation.request

val_input = vectorizer.transform(val_x)
prediction = clf.predict(val_input)

score = metrics.accuracy_score(prediction, val_y)
report = metrics.classification_report(prediction, val_y)

print(score)
print(report)

0.9004665629860031
              precision    recall  f1-score   support

           0       0.95      0.93      0.94      2179
           1       0.66      0.71      0.69       393

    accuracy                           0.90      2572
   macro avg       0.80      0.82      0.81      2572
weighted avg       0.90      0.90      0.90      2572



Use only rows with request = 1

In [10]:
validation_stripped = validation[validation.request == 1]
train_stripped = train[train.request == 1]

print(train_stripped.shape)
print(validation_stripped.shape)

(3560, 8)
(421, 8)


Food classifier

In [11]:
food_classifier = SVC(C=3.0, kernel='linear')
clf = food_classifier.fit(stripped, train_stripped.food)

val_x = validation_stripped.message_stem
val_y = validation_stripped.food

val_input = vectorizer.transform(val_x)
prediction = clf.predict(val_input)

score = metrics.accuracy_score(prediction, val_y)
report = metrics.classification_report(prediction, val_y)

print(score)
print(report)
print(metrics.confusion_matrix(prediction, val_y))

0.9406175771971497
              precision    recall  f1-score   support

           0       0.97      0.93      0.95       274
           1       0.89      0.95      0.92       147

    accuracy                           0.94       421
   macro avg       0.93      0.94      0.94       421
weighted avg       0.94      0.94      0.94       421

[[256  18]
 [  7 140]]


Shelter classifier

In [12]:
shelter_classifier = SVC(C=3.0, kernel='linear')
clf = shelter_classifier.fit(stripped, train_stripped.shelter)

val_x = validation_stripped.message_stem
val_y = validation_stripped.shelter

val_input = vectorizer.transform(val_x)
prediction = clf.predict(val_input)

score = metrics.accuracy_score(prediction, val_y)
report = metrics.classification_report(prediction, val_y)

print(score)
print(report)
print(metrics.confusion_matrix(prediction, val_y))

0.8954869358669834
              precision    recall  f1-score   support

           0       0.94      0.93      0.93       321
           1       0.77      0.80      0.78       100

    accuracy                           0.90       421
   macro avg       0.85      0.86      0.86       421
weighted avg       0.90      0.90      0.90       421

[[297  24]
 [ 20  80]]


Water classifier

In [13]:
water_classifier = SVC(C=3.0, kernel='linear')
clf = water_classifier.fit(stripped, train_stripped.water)

val_x = validation_stripped.message_stem
val_y = validation_stripped.water

val_input = vectorizer.transform(val_x)
prediction = clf.predict(val_input)

score = metrics.accuracy_score(prediction, val_y)
report = metrics.classification_report(prediction, val_y)

print(score)
print(report)
print(metrics.confusion_matrix(prediction, val_y))

0.9619952494061758
              precision    recall  f1-score   support

           0       0.99      0.96      0.98       355
           1       0.83      0.95      0.89        66

    accuracy                           0.96       421
   macro avg       0.91      0.96      0.93       421
weighted avg       0.97      0.96      0.96       421

[[342  13]
 [  3  63]]


In [14]:
final_obj = {
    "classifiers": {
        "request": request_classifier,
        "food": food_classifier,
        "shelter": shelter_classifier,
        "water": water_classifier
    },
    "vectorizer": vectorizer
}

In [15]:
with open('./pickles/model.pickle', 'wb') as file:
    pickle.dump(final_obj, file)

In [16]:
test_input = vectorizer.transform(["i love eating food and earthquakes"]) 
prediction = request_classifier.predict(test_input)
print(prediction)

[0]
