<a href="https://colab.research.google.com/github/shuwang127/NLP-Disaster-Tweets/blob/master/disaster_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preliminary

Set the root path for the current program file, as well as the data path and temporary file path.

In [0]:
# Root folder of the project; it already ends with a path separator.
rootPath = './drive/My Drive/Colab Notebooks/'
# Sub-folders are appended without a leading '/' so the resulting paths do
# not contain a doubled separator. Both keep their trailing '/' because other
# cells build file names with plain string concatenation.
dataPath = rootPath + 'data/'
tempPath = rootPath + 'temp/'

Import Python libraries.

In [110]:
import os
import re
import sys
import csv
import random
import nltk
import pandas as pd
import numpy as np
from random import choice
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from nltk.util import ngrams
from nltk import word_tokenize
from itertools import chain
from collections import defaultdict
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as torchdata
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Load Data

Load data from .csv files.

In [0]:
def ReadCsvData():
    """Load the training and testing tweets from the CSV files in ``dataPath``.

    The temporary folder ``tempPath`` is created first if it does not exist.

    Returns:
        A ``(dataTrain, dataTest)`` tuple of pandas DataFrames read from
        ``train.csv`` and ``test_labeled.csv`` respectively.
    """
    # validate temp path: makedirs also creates missing parent folders and,
    # with exist_ok, does not fail if the folder already exists.
    os.makedirs(tempPath, exist_ok=True)
    # read data from train.csv.
    dataTrain = pd.read_csv(dataPath + 'train.csv')
    print('[Info] Load %d training samples from %s/train.csv.' % (len(dataTrain), dataPath))
    # read data from test.csv.
    dataTest = pd.read_csv(dataPath + 'test_labeled.csv')
    print('[Info] Load %d testing samples from %s/test_labeled.csv.' % (len(dataTest), dataPath))
    # return
    return dataTrain, dataTest

Derive the keyword, location, and URL features and build the (stemmed and unstemmed) token lists for the training and testing data.

In [0]:
def CreateVocabulary(dataTrain, dataTest):
    """Add simple meta features to both data sets and build their token lists.

    Three columns are added in place to ``dataTrain`` and ``dataTest``:
    ``keywd`` (integer index of the ``keyword`` value), ``loc`` (1 if
    ``location`` is present, otherwise 0) and ``url`` (number of URLs found
    in ``text``). Each tweet text is then cleaned, stripped of stop words,
    tokenized and stemmed. The token lists are cached in
    ``tempPath + 'list.npz'``; if that file already exists it is loaded
    instead of being rebuilt.

    Args:
        dataTrain: training DataFrame with ``keyword``, ``location`` and
            ``text`` columns.
        dataTest: testing DataFrame with the same columns.

    Returns:
        The object returned by ``np.load`` for the cache file, holding the
        arrays ``listTrain``, ``listTrainStem``, ``listTest`` and
        ``listTestStem`` (one token list per tweet).
    """
    # one URL pattern shared by the URL counter and the text clean-up.
    urlPattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

    # missing value?
    def is_nan(x):
        return (x is np.nan or x != x)

    # keywords.
    # The known keywords are sorted so that every run assigns the same index
    # to the same keyword (plain set ordering changes between interpreter
    # runs). Missing keywords, and keywords that never occur in the training
    # data, share one extra index placed after the known ones.
    keywdList = sorted({kw for kw in dataTrain['keyword'] if not is_nan(kw)})
    keywdDict = {kw: idx for idx, kw in enumerate(keywdList)}
    keywdOther = len(keywdList)

    def KeywordIndex(kw):
        if is_nan(kw):
            return keywdOther
        return keywdDict.get(kw, keywdOther)

    dataTrain['keywd'] = dataTrain['keyword'].apply(KeywordIndex)
    dataTest['keywd'] = dataTest['keyword'].apply(KeywordIndex)
    # exist location info?
    dataTrain['loc'] = dataTrain['location'].apply(lambda x: (0 if is_nan(x) else 1))
    dataTest['loc'] = dataTest['location'].apply(lambda x: (0 if is_nan(x) else 1))
    # find url number.
    dataTrain['url'] = dataTrain['text'].apply(lambda x: len(re.findall(urlPattern, x)))
    dataTest['url'] = dataTest['text'].apply(lambda x: len(re.findall(urlPattern, x)))

    # if exist list.npz, load it.
    if os.path.exists(tempPath + 'list.npz'):
        print('[Info] Load text list (noStem/Stem) of train/test set from %s/list.npz.' % (tempPath))
        # NOTE: allow_pickle is required for the object arrays stored below;
        # only load cache files that were written by this program.
        return np.load(tempPath + 'list.npz', allow_pickle = True)

    # shared, loop-invariant helpers: built once instead of once per tweet.
    stopSet = set(stopwords.words('english'))
    tknzr = TweetTokenizer()
    porter = PorterStemmer()

    # pre-process the data.
    def Preprocess(data):
        # remove url
        data = re.sub(urlPattern, '', data)
        # remove html special characters (&amp; &gt; &lt;). An alternation is
        # used; a character class would match any mix of those letters.
        data = re.sub(r'&(?:amp|gt|lt);', '', data)
        # remove independent numbers.
        data = re.sub(r' \d+ ', ' ', data)
        # lower case capitalized words.
        data = re.sub(r'([A-Z][a-z]+)', lambda matched: matched.group(1).lower(), data)
        # remove hashtags and mentions.
        data = re.sub(r'[@#]([A-Za-z]+)', '', data)
        return data

    # remove stop words.
    def RemoveStop(data):
        # build a new list rather than removing from the list being iterated,
        # which would skip the word following every removed stop word.
        return " ".join(item for item in data.split() if item.lower() not in stopSet)

    # get tokens.
    def GetTokens(data):
        tokensNew = []
        # use tweet tokenizer, then split contractions such as "don't"
        # further with word_tokenize.
        pattern = r'[A-Za-z]+\'[A-Za-z]+'
        for tk in tknzr.tokenize(data):
            if re.match(pattern, tk):
                tokensNew.extend(word_tokenize(tk))
            else:
                tokensNew.append(tk)
        return tokensNew

    # process tokens with stemming.
    def WithStem(tokens):
        return [porter.stem(tk) for tk in tokens]

    # build the plain and stemmed token lists for a column of texts.
    def BuildLists(texts):
        listPlain = []
        listStem = []
        for text in texts:
            tokens = GetTokens(RemoveStop(Preprocess(text)))
            listPlain.append(tokens)
            listStem.append(WithStem(tokens))
        return listPlain, listStem

    # pack token lists into a 1-D object array (one list per element).
    def ToObjectArray(rows):
        # The token lists have different lengths; recent NumPy versions
        # refuse to convert such ragged nested lists implicitly, so the
        # object array is filled element by element.
        packed = np.empty(len(rows), dtype=object)
        for idx, row in enumerate(rows):
            packed[idx] = row
        return packed

    # process train list.
    listTrain, listTrainStem = BuildLists(dataTrain['text'])
    # process test list.
    listTest, listTestStem = BuildLists(dataTest['text'])
    np.savez(tempPath + 'list.npz',
             listTrain=ToObjectArray(listTrain),
             listTrainStem=ToObjectArray(listTrainStem),
             listTest=ToObjectArray(listTest),
             listTestStem=ToObjectArray(listTestStem))
    print('[Info] Load text list (noStem/Stem) of train/test set from %s/list.npz.' % (tempPath))
    return np.load(tempPath + 'list.npz', allow_pickle = True)

# Feature Extraction

Extract features from the data.

In [0]:
def ExtractFeatures():
    """Placeholder for the feature-extraction step; it does nothing yet."""
    return None

# Classifiers


### Naive Bayes Classifier

Naive Bayes classifier training process.

In [0]:
def NaiveBayesTrain():
    """Placeholder for Naive Bayes training; it does nothing yet."""
    return None

Naive Bayes classifier testing process.

In [0]:
def NaiveBayesTest():
    """Placeholder for Naive Bayes testing; it does nothing yet."""
    return None

### Logistic Regression Classifier

Logistic regression classifier training process.

In [0]:
def LogisticTrain():
    """Placeholder for logistic regression training; it does nothing yet."""
    return None

Logistic regression classifier testing process.

In [0]:
def LogisticTest():
    """Placeholder for logistic regression testing; it does nothing yet."""
    return None

# Evaluation

Evaluate and output the experimental results.

In [0]:
def OutputEval():
    """Placeholder for evaluating and reporting results; it does nothing yet."""
    return None

# Main Entrance.

The main function and the entrance.

In [119]:
def main():
    """Print the project banner, load the CSV data and build the token lists.

    The feature extraction, training, testing and evaluation steps are not
    enabled yet; they are kept below as commented-out calls.
    """
    # info.
    print("-- AIT726 Project from Julia Jeng, Shu Wang, and Arman Anwar --")
    # load both data sets, then derive keywords and vocabulary from them.
    trainFrame, testFrame = ReadCsvData()
    tokenLists = CreateVocabulary(trainFrame, testFrame)
    # planned pipeline (not enabled yet):
    #   featTrain, labelTrain = ExtractFeatures(trainFrame, 'train')
    #   model = NaiveBayesTrain(featTrain, labelTrain)
    #   featTest, labelTest = ExtractFeatures(testFrame, 'test')
    #   predTest = NaiveBayesTest(model, featTest)
    #   accuracy, confusion = OutputEval(predTest, labelTest)
    return None


# run the pipeline only when executed as a script / notebook cell.
if __name__ == "__main__":
    main()

-- AIT726 Project from Julia Jeng, Shu Wang, and Arman Anwar --
[Info] Load 7613 training samples from ./drive/My Drive/Colab Notebooks//data//train.csv.
[Info] Load 3263 testing samples from ./drive/My Drive/Colab Notebooks//data//test_labeled.csv.
[Info] Load text list (noStem/Stem) of train/test set from ./drive/My Drive/Colab Notebooks//temp//list.npz.
