In [77]:
import numpy as np
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from sklearn import metrics
from sklearn.base import clone
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Tokenizing callable handed to the vectorizers below: every maximal run of
# ASCII letters/digits becomes one token; everything else acts as a separator.
tokenizer = RegexpTokenizer(pattern=r'[a-zA-Z0-9]+').tokenize

In [78]:
# Label columns of `train`; one classifier is fitted per entry, in this order.
model_cols = [
    'request',
    'food',
    'shelter',
    'water',
    'medical_help',
    'medical_products',
    'clothing',
    'search_and_rescue',
]

In [79]:
# Load the three preprocessed splits (read in the order train, test, validation).
train, test, validation = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./preprocessed/training.csv",
        "./preprocessed/test.csv",
        "./preprocessed/validation.csv",
    )
)

In [80]:
# Logistic-regression estimator used by `training` below; the iteration cap is
# set explicitly to 1500.
logis_clf = LogisticRegression(max_iter=1500)

In [81]:
# Text vectorizers to compare, keyed by a short name. The "_multi" variants use
# unigrams + bigrams and cap the vocabulary size via `max_features`.
vectorizers = dict(
    count=CountVectorizer(
        tokenizer=tokenizer,
        ngram_range=(1, 1),
        stop_words='english',
        lowercase=False,
    ),
    count_multi=CountVectorizer(
        tokenizer=tokenizer,
        ngram_range=(1, 2),
        stop_words='english',
        lowercase=False,
        max_features=80000,
    ),
    tfidf=TfidfVectorizer(
        tokenizer=tokenizer,
        ngram_range=(1, 1),
        strip_accents='unicode',
        lowercase=True,
    ),
    tfidf_multi=TfidfVectorizer(
        tokenizer=tokenizer,
        ngram_range=(1, 2),
        strip_accents='unicode',
        lowercase=True,
        max_features=100000,
    ),
)

# Text columns of `train` that each vectorizer is fitted on.
cols = ['message', 'message_stem', 'message_lemma']

# Filled by the next cell: "<vectorizer>_<column>" -> training feature matrix.
training_features = {}

In [82]:
# Build one training feature matrix per (vectorizer, text column) pair and
# report the size of each resulting vocabulary.
#
# NOTE(review): each vectorizer instance is re-fitted for every column, so after
# this cell `vectorizers[name]` only holds the vocabulary of the LAST column in
# `cols`. Transforming other data for an earlier key would need a separately
# fitted vectorizer -- confirm how later cells transform test/validation data.
print("Training data features:")
for name, curr_vectorizer in vectorizers.items():
    for col in cols:
        key = name + '_' + col
        training_features[key] = curr_vectorizer.fit_transform(train[col])
        # `vocabulary_` maps each term to its feature index, so its length is the
        # feature count. Unlike `get_feature_names()` (removed in scikit-learn
        # 1.2) it is available on both old and new releases.
        print(key + ":", len(curr_vectorizer.vocabulary_))

Training data features:
count_message: 37044
count_message_stem: 21213
count_message_lemma: 24538
count_multi_message: 80000
count_multi_message_stem: 80000
count_multi_message_lemma: 80000
tfidf_message: 31208
tfidf_message_stem: 21451
tfidf_message_lemma: 24824
tfidf_multi_message: 100000
tfidf_multi_message_stem: 100000
tfidf_multi_message_lemma: 100000


In [83]:
def training(x):
    """Fit one logistic-regression classifier per label column and report ROC AUC.

    For every column in `model_cols`, `x` and `train[col]` are split 70/30
    (fixed `random_state=79`), a classifier is fitted on the 70% part and
    scored with ROC AUC on the 30% part. Per-label scores and their mean are
    printed.

    Parameters
    ----------
    x : feature matrix of the training data (count or tfidf). It is split
        together with `train[col]`, so it must have one row per row of `train`.

    Returns
    -------
    classifiers : dict
        Label column -> fitted classifier. Each value is an independent
        estimator cloned from the module-level `logis_clf` template.
    roc_auc_scores : dict
        Label column -> ROC AUC on the hold-out split.
    """
    classifiers = {}
    roc_auc_scores = {}

    for col in model_cols:
        y = train[col]
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=79)

        # Fit a fresh clone rather than `logis_clf` itself: `fit` returns the
        # estimator it was called on, so fitting the shared instance would make
        # every dict entry alias one model that only remembers the last fit.
        clf = clone(logis_clf).fit(x_train, y_train)
        predicted = clf.predict_proba(x_test)[:, 1]  # probability of the positive class
        score = metrics.roc_auc_score(y_test, predicted)

        classifiers[col] = clf
        roc_auc_scores[col] = score

    avg = 0
    for key in roc_auc_scores:
        print("Score for {}: ".format(key), roc_auc_scores[key])
        avg += roc_auc_scores[key]

    avg /= len(model_cols)
    print("Final Score: ", avg)

    return classifiers, roc_auc_scores

In [84]:
# Filled by the next cell: feature-set key -> [classifiers dict, ROC AUC scores dict].
result_classifiers = dict()

In [85]:
# Train the per-label classifiers once for every vectorizer/column feature
# matrix. The loop iterates `training_features` (keys such as "count_message");
# the previously used name `result` is not defined in the visible cells, so the
# cell only ran when that variable was left over in the kernel.
for type_model, features in training_features.items():
    print('\n' + type_model)
    classifier, scores = training(features)
    result_classifiers[type_model] = [classifier, scores]


count_message
Score for request:  0.8633460977149833
Score for food:  0.93789992064899
Score for shelter:  0.915268919808698
Score for water:  0.9275831919613727
Score for medical_help:  0.81327075986633
Score for medical_products:  0.8298822903009726
Score for clothing:  0.8743173469215012
Score for search_and_rescue:  0.7544430924266055
Final Score:  0.8645014524561816

count_message_stem
Score for request:  0.8716080925211678
Score for food:  0.9420640020766328
Score for shelter:  0.9231987725249444
Score for water:  0.9212453782243681
Score for medical_help:  0.8216741421007212
Score for medical_products:  0.8249720897479205
Score for clothing:  0.8727006483722978
Score for search_and_rescue:  0.7682593817660399
Final Score:  0.8682153134167616

count_message_lemma
Score for request:  0.8675082151598075
Score for food:  0.9400074180531989
Score for shelter:  0.9223471123071001
Score for water:  0.9170327112967072
Score for medical_help:  0.8165230175894653
Score for medical_produc