In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

tokenizer = RegexpTokenizer(r'[a-zA-Z0-9]+').tokenize
model_cols = ['request', 'food', 'shelter', 'water', 'medical_help', 'clothing', 'search_and_rescue']

In [2]:
train = pd.read_csv("./preprocessed/training.csv")
test = pd.read_csv("./preprocessed/test.csv")
validation = pd.read_csv("./preprocessed/validation.csv")

In [3]:
# Using TFIDF multi stemmed model
vectorizer = TfidfVectorizer(lowercase=True, strip_accents='unicode', ngram_range = (1,2), tokenizer = tokenizer, max_features=100000)
features = vectorizer.fit_transform(train['message_stem'])

In [4]:
logis_clf = LogisticRegression(C=2, solver='lbfgs', max_iter=1500)

def training(x):
    # x is the feature matrix of training data (count or tfidf)
    
    classifiers = {}
    roc_auc_scores = {}

    for col in model_cols:
        y = train[col]
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=79)

        clf = logis_clf.fit(x_train, y_train)
        predicted = clf.predict_proba(x_test)[:, 1]
        score = metrics.roc_auc_score(y_test, predicted)

        classifiers[col] = clf
        roc_auc_scores[col] = score
    
    avg = 0
    for key in roc_auc_scores:
        print("Score for {}: ".format(key), roc_auc_scores[key])
        avg += roc_auc_scores[key]

    avg /= len(model_cols)
    print("Final Score: ", avg)

    roc_auc_scores['avg'] = avg
    return classifiers, roc_auc_scores

In [5]:
classifiers, scores = training(features)

Score for request:  0.8968167428834706
Score for food:  0.9555958817669951
Score for shelter:  0.9328439939204562
Score for water:  0.9454543477315239
Score for medical_help:  0.8602161306016926
Score for clothing:  0.9158690389531241
Score for search_and_rescue:  0.8011991065256759
Final Score:  0.901142177483277


In [16]:
for col in model_cols:
    request_classifier = classifiers[col]
    test_x = test['message_stem']
    test_y = test[col]

    test_input = vectorizer.transform(test_x)
    prediction = request_classifier.predict(test_input)
    print(col, metrics.accuracy_score(prediction, test_y))

request 0.808599695585997
food 0.8744292237442922
shelter 0.910958904109589
water 0.923896499238965
medical_help 0.9155251141552512
clothing 0.9843987823439878
search_and_rescue 0.9904870624048706
