In [1]:
import numpy as np
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from sklearn import metrics
from sklearn.base import clone
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Callable handed to the vectorizers as their `tokenizer`: the bound `tokenize`
# method of a RegexpTokenizer built with the pattern `[a-zA-Z0-9]+`
# (runs of ASCII letters and digits).
tokenizer = RegexpTokenizer(r'[a-zA-Z0-9]+').tokenize

In [2]:
# Label columns of `train`; `training` fits and scores one classifier per entry.
model_cols = [
    'request',
    'food',
    'shelter',
    'water',
    'medical_help',
    'clothing',
    'search_and_rescue',
]

In [3]:
# Read the three preprocessed splits, in the order training / test / validation.
train, test, validation = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./preprocessed/training.csv",
        "./preprocessed/test.csv",
        "./preprocessed/validation.csv",
    )
)

In [4]:
# Logistic-regression configuration used by `training` for every label column:
# C=2, the lbfgs solver, and at most 1500 solver iterations.
logis_clf = LogisticRegression(C=2, solver='lbfgs', max_iter=1500)

In [5]:
# Text vectorizers to compare, keyed by a short name. The name is combined with
# a column name from `cols` to label each feature matrix in `training_features`.
#
# Count-based variants, currently disabled:
#   'count' : CountVectorizer(lowercase=False, stop_words='english', ngram_range = (1,1), tokenizer = tokenizer),
#   'count_multi' : CountVectorizer(lowercase=False, stop_words='english', ngram_range = (1,2), tokenizer = tokenizer, max_features=80000),
vectorizers = {
    # Unigram TF-IDF.
    'tfidf': TfidfVectorizer(
        lowercase=True,
        strip_accents='unicode',
        ngram_range=(1, 1),
        tokenizer=tokenizer,
    ),
    # Unigram + bigram TF-IDF, capped at 100000 features.
    'tfidf_multi': TfidfVectorizer(
        lowercase=True,
        strip_accents='unicode',
        ngram_range=(1, 2),
        tokenizer=tokenizer,
        max_features=100000,
    ),
}

# Text columns of `train` that each vectorizer is fitted on.
cols = [
    'message',
    'message_stem',
    'message_lemma',
]

# Filled by the next cell: "<vectorizer name>_<column>" -> fitted feature matrix.
training_features = dict()

In [6]:
# Fit every vectorizer on every text column of the training frame and store the
# resulting feature matrix under the key "<vectorizer name>_<column>".
#
# NOTE(review): the same vectorizer object is re-fitted for each column, so once
# this cell finishes it only holds the vocabulary learned from the last entry
# of `cols`. Transforming other data for an earlier column needs a re-fit.
print("Training data features:")
for v, curr_vectorizer in vectorizers.items():
    for col in cols:
        key = v + '_' + col
        training_features[key] = curr_vectorizer.fit_transform(train[col])
        # `vocabulary_` has exactly one entry per feature. It replaces
        # get_feature_names(), which was removed in scikit-learn 1.2.
        print(key + ":", len(curr_vectorizer.vocabulary_))

Training data features:
tfidf_message: 31208
tfidf_message_stem: 21451
tfidf_message_lemma: 24824
tfidf_multi_message: 100000
tfidf_multi_message_stem: 100000
tfidf_multi_message_lemma: 100000


In [7]:
def training(x):
    """Fit and score one logistic-regression classifier per label column.

    Reads the module-level `train` frame, `model_cols` label list and
    `logis_clf` estimator configuration.

    Parameters
    ----------
    x : feature matrix of the training data (count or tfidf); its rows must
        line up with the rows of `train`, since both are split together.

    Returns
    -------
    classifiers : dict mapping each label column to its own fitted classifier.
    roc_auc_scores : dict mapping each label column to the ROC AUC obtained on
        the 30% hold-out part of the split.

    Also prints the per-label scores and their mean ("Final Score").
    """
    # The partition depends only on the number of rows and `random_state`, so a
    # single split that carries all label columns yields the same train/test
    # rows as splitting once per label, without re-slicing `x` every time.
    x_train, x_test, labels_train, labels_test = train_test_split(
        x, train[model_cols], test_size=0.3, random_state=79
    )

    classifiers = {}
    roc_auc_scores = {}

    for col in model_cols:
        # `fit` returns the estimator it was called on. Fitting `logis_clf`
        # directly would therefore store the *same* object under every label
        # (holding only the most recent fit). clone() gives each label an
        # independent, unfitted copy with identical hyper-parameters.
        clf = clone(logis_clf).fit(x_train, labels_train[col])

        # Probability of the positive class (second column of predict_proba).
        predicted = clf.predict_proba(x_test)[:, 1]
        score = metrics.roc_auc_score(labels_test[col], predicted)

        classifiers[col] = clf
        roc_auc_scores[col] = score

    for key, value in roc_auc_scores.items():
        print("Score for {}: ".format(key), value)

    avg = sum(roc_auc_scores.values()) / len(model_cols)
    print("Final Score: ", avg)

    return classifiers, roc_auc_scores

In [8]:
# Filled by the next cell: feature-set key -> [classifiers, scores] from `training`.
result_classifiers = dict()

In [10]:
# Train and score on every prepared feature matrix, keeping the per-label
# classifiers and scores under the feature-set key.
for type_model, feature_matrix in training_features.items():
    print(f'\n{type_model}')
    classifier, scores = training(feature_matrix)
    result_classifiers[type_model] = [classifier, scores]


tfidf_message
Score for request:  0.890395124485479
Score for food:  0.952514171330649
Score for shelter:  0.9322075454535048
Score for water:  0.9424076732350254
Score for medical_help:  0.8504895432705111
Score for clothing:  0.9179631291158098
Score for search_and_rescue:  0.7895612797324909
Final Score:  0.8965054952319242

tfidf_message_stem
Score for request:  0.8929661799958425
Score for food:  0.9533928887988452
Score for shelter:  0.9352379799191924
Score for water:  0.941698203488625
Score for medical_help:  0.8573648118356896
Score for clothing:  0.9155084559259197
Score for search_and_rescue:  0.8037308075355
Final Score:  0.8999856182142307

tfidf_message_lemma
Score for request:  0.8926734833871213
Score for food:  0.9531425426933757
Score for shelter:  0.9344066511123842
Score for water:  0.9417577972073601
Score for medical_help:  0.8508517733292139
Score for clothing:  0.912916659613008
Score for search_and_rescue:  0.7972525474110763
Final Score:  0.8975716363933627
