### **LIBRARIES**

In [10]:
import os           # os.path.join
import csv          # csv.reader
import numpy as np  # np.concatenate
import re           # re.sub
import pandas as pd  # pd.DataFrame 

#pd.get_dummies

from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer  # fit_transform

In [11]:
import nltk

# NOTE(review): word_tokenize is imported here but no cell visible in this notebook calls it.
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

# Shared stemmer instance; clean_doc_text() calls porter.stem() on each token.
porter = PorterStemmer()

# Download the stop-word corpus (the recorded output reports it was already up to date),
# then import the corpus reader that STOP_WORDS is built from.
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /home/ronkow/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### **GLOBAL CONSTANTS, DATA**

In [12]:
# Characters deleted from texts and keywords before tokenising.
# '#', '@' and the apostrophe are left out on purpose: '#'-prefixed tokens such as
# '#earthquak' stay in the vocabulary, and clean_doc_text() handles the apostrophe itself.
# Raw string: the backslash in front of '|' is a literal character to strip, and writing
# it in a non-raw literal is an invalid escape sequence that newer Python versions warn about.
PUNC = r'''!?.,;:$&*^~_"`(){}[]/\|<>=%+-'''  # exclude # and @ and '

# Extra stop words added on top of NLTK's English list.
# NOTE(review): clean_doc_text() replaces apostrophes with spaces before it filters
# stop words, so "i'm" may never reach the filter as a single token - confirm it is needed.
ADD_WORDS = {"i'm"}

# NLTK's English stop words as a set, merged with the extras above.
STOP_WORDS = set(stopwords.words('english')) | ADD_WORDS

In [13]:
DATA_DIR = "data/"

# train_text9.csv / train_keyword9.csv are a SMALL DATASET used to test the code.
DATA_FILE_TEXT, DATA_FILE_KEYWORD = (
    os.path.join(DATA_DIR, name)
    for name in ("train_text9.csv", "train_keyword9.csv")
)

### **HELPER FUNCTIONS USED ONLY IN THE CHECK CELLS (NOT IN THE ENCODING PIPELINE)**

In [14]:
def file_to_string(filepath):
    """
    ARGUMENT: path to a text file
    RETURN: the whole file content as one string
    """
    with open(filepath) as handle:
        return handle.read()

In [15]:
def csv_to_list_of_lists(filepath):
    """
    ARGUMENT: path to a CSV file whose first row is the column label
    RETURN: list of rows (each row is a list of strings), column label row excluded
    """
    # newline='' hands line endings to the csv module untouched, so a line break
    # inside a quoted field is kept exactly as it appears in the file.
    with open(filepath, newline='') as f:
        reader = csv.reader(f)
        next(reader, None)          # skip the column label row (no-op on an empty file)
        return list(reader)

In [116]:
# CHECK
# Print the raw sample file, then the parsed rows, so the two can be compared by eye.
s = file_to_string(DATA_FILE_TEXT)
print(s)

# csv_to_list_of_lists() has already dropped the column label row.
text_list1 = csv_to_list_of_lists(DATA_FILE_TEXT)
print(text_list1)

"text"
"Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all"
"Forest fire near La Ronge Sask. Canada"
"All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected"
"13,000 people receive #wildfires evacuation orders in California "
"Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school"
"#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires"
"#flood #disaster Heavy rain causes flash flooding of streets in Manitou, Colorado Springs areas"
"I'm on top of the hill and I can see a fire in the woods..."
"There's an emergency evacuation happening now in the building across the street"

[['Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'], ['Forest fire near La Ronge Sask. Canada'], ["All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in 

### **ONE HOT ENCODE TEXT** (USING SMALL DATASET)

In [16]:
def csv_to_list(filepath):
    """
    ARGUMENT: path to a CSV file whose first row is the column label
    RETURN: list holding the first field of every data row (strings)

    Blank lines are skipped instead of raising IndexError on row[0].
    """
    # newline='' hands line endings to the csv module untouched, so a line break
    # inside a quoted field is kept exactly as it appears in the file.
    with open(filepath, newline='') as f:
        reader = csv.reader(f)
        next(reader, None)                          # exclude column label
        return [row[0] for row in reader if row]    # csv.reader yields [] for a blank line

In [17]:
# USE THIS TO LOAD THE INPUT FOR list_of_token_lists_text() AND list_of_token_lists_keyword()

def csv_to_list_of_strings(filepath):
    """
    ARGUMENT: path to a one-column CSV file whose first row is the column label
    RETURN: list of strings, one per data row (first field only)

    Blank lines are skipped instead of raising IndexError on row[0].
    """
    # newline='' is what the csv module expects: without it, a line break inside a
    # quoted field is rewritten before the reader sees it.
    with open(filepath, newline='') as f:
        rows = csv.reader(f)
        next(rows, None)                            # drop the column label row
        return [row[0] for row in rows if row]      # csv.reader yields [] for a blank line

In [18]:
# SAME FUNCTION AS IN 1_select_bagofwords.ipynb

def clean_doc_text(doc):
    """
    ARGUMENT: text (string)
    RETURN: list of tokens

    Steps, in order: pad '...' with spaces, turn apostrophes into spaces, delete
    every PUNC character, split on whitespace, lower-case, drop stop words and
    all-digit tokens, stem, and keep only stems of length >= 2.
    """
    spaced = doc.replace('...', ' ... ').replace("'", ' ')
    stripped = spaced.translate(str.maketrans('', '', PUNC))   # delete each PUNC character

    tokens = []
    for raw in stripped.split():
        word = raw.lower()
        if word in STOP_WORDS or word.isdigit():               # stop word or pure number
            continue
        stem = porter.stem(word)                               # stemming
        if len(stem) >= 2:                                     # length check applies to the stem
            tokens.append(stem)
    return tokens

In [19]:
def clean_doc_text_bow(tokens, bow=None):
    """
    ARGUMENTS:
        tokens: list of tokens
        bow:    optional iterable of allowed tokens (the selected bag of words);
                when omitted, the notebook-level ``token_list`` is used, as before
    RETURN: list of the tokens that are in the bag of words, with their order and
            duplicates preserved
    """
    if bow is None:
        bow = token_list                # notebook-level vocabulary set by the RUN cells
    # Set membership avoids scanning the whole vocabulary list for every token.
    allowed = bow if isinstance(bow, (set, frozenset)) else set(bow)
    return [w for w in tokens if w in allowed]

In [20]:
def list_of_token_lists_text(text_list):
    """
    ARGUMENT: list of texts (strings)
    RETURN: list of token lists, one per text, each cleaned with clean_doc_text()
            and then restricted to the bag of words by clean_doc_text_bow()
    """
    return [clean_doc_text_bow(clean_doc_text(text)) for text in text_list]

### **RUN**

In [99]:
# Bag-of-words vocabulary, resolved through DATA_DIR like the other input files.
# token_list is also read as a global by clean_doc_text_bow().
token_list = csv_to_list(os.path.join(DATA_DIR, "tokens/train_text_token100.csv"))
print(token_list[0:20])
print('')

# Raw texts of the small sample file.
text_list = csv_to_list_of_strings(DATA_FILE_TEXT)
print(text_list)
print('')

# One token list per text, restricted to the vocabulary loaded above.
text_token_list = list_of_token_lists_text(text_list)
print(text_token_list)

['us', 'fire', 'evacu', 'peopl', 'california', 'got', 'caus', 'flood', 'see', 'emerg', 'build', 'come', 'get', 'live', 'day', 'car', 'crash', 'look', 'new', 'polic']

['Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all', 'Forest fire near La Ronge Sask. Canada', "All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected", '13,000 people receive #wildfires evacuation orders in California ', 'Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school', '#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires', '#flood #disaster Heavy rain causes flash flooding of streets in Manitou, Colorado Springs areas', "I'm on top of the hill and I can see a fire in the woods...", "There's an emergency evacuation happening now in the building across the street"]

[['us'], ['fire'], ['evacu'], ['peopl', 'evacu', 'california

In [100]:
# Fit the binarizer on the token lists: one row per text, one 0/1 column per class.
# The recorded output is (9, 11), i.e. only tokens that occur in these texts get a column.
onehot_multi_text = MultiLabelBinarizer()
onehot_text_nolabel = onehot_multi_text.fit_transform(text_token_list)
num_row, num_col = onehot_text_nolabel.shape

print(onehot_text_nolabel, onehot_text_nolabel.shape, type(onehot_text_nolabel), '', sep='\n')

# Class names as a single (1, num_col) row so they can serve as a header row later.
text_labels = onehot_multi_text.classes_.reshape(1, num_col)

print(text_labels, text_labels.shape, type(text_labels), sep='\n')

[[0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0]
 [0 1 0 0 1 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 1 0 0 0]
 [0 1 0 0 0 1 0 0 0 0 0]
 [0 0 1 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 1 0]
 [1 0 0 1 1 0 0 0 0 0 0]]
(9, 11)
<class 'numpy.ndarray'>

[['build' 'california' 'caus' 'emerg' 'evacu' 'fire' 'flood' 'got'
  'peopl' 'see' 'us']]
(1, 11)
<class 'numpy.ndarray'>


In [101]:
# Put the label row on top of the indicator rows (axis 0 is np.concatenate's default).
onehot_text = np.concatenate([text_labels, onehot_text_nolabel])
print(onehot_text)

[['build' 'california' 'caus' 'emerg' 'evacu' 'fire' 'flood' 'got'
  'peopl' 'see' 'us']
 [0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0]
 [0 1 0 0 1 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 1 0 0 0]
 [0 1 0 0 0 1 0 0 0 0 0]
 [0 0 1 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 1 0]
 [1 0 0 1 1 0 0 0 0 0 0]]


### **ONE HOT ENCODE KEYWORDS** (USING SMALL DATASET)

In [21]:
def clean_doc_keyword(doc):
    """
    ARGUMENT: text (string)
    RETURN: list of tokens

    Every PUNC character is deleted first; each whitespace-separated word is then
    lower-cased, every run of digits in it is replaced by '_', and 'kw_' is prepended.
    """
    # NOTE(review): '%' is in PUNC, so this is presumably how a '%20'-encoded space ends
    # up as '_' (e.g. 'kw_airplane_accident') - confirm against the raw keyword file.
    stripped = doc.translate(str.maketrans('', '', PUNC))
    return ['kw_' + re.sub(r'\d+', '_', word.lower()) for word in stripped.split()]

In [103]:
# CHECK

# Raw keyword file as one string.
s = file_to_string(DATA_FILE_KEYWORD)
print(s)
print('')

# The whole file content, column label included, is cleaned as a single document here,
# which is why 'kw_keyword' appears in the recorded output.
keyword_list1 = clean_doc_keyword(s)
print(keyword_list1)

"keyword"
"earthquake"
"forestfire"
"evacuation"
"wildfires"
"wildfires"
"wildfires"
"flood"
"fire"
"evacuation"


['kw_keyword', 'kw_earthquake', 'kw_forestfire', 'kw_evacuation', 'kw_wildfires', 'kw_wildfires', 'kw_wildfires', 'kw_flood', 'kw_fire', 'kw_evacuation']


In [22]:
def list_of_token_lists_keyword(keyword_list):
    """
    ARGUMENT: list of keyword strings
    RETURN: list of token lists, one per keyword string, each from clean_doc_keyword()
    """
    return [clean_doc_keyword(keyword) for keyword in keyword_list]

### **RUN**

In [105]:
# Keywords of the small sample file, followed by an empty line in the output.
keyword_list = csv_to_list_of_strings(DATA_FILE_KEYWORD)
print(keyword_list, end='\n\n')

# One 'kw_'-prefixed token list per keyword.
keyword_token_list = list_of_token_lists_keyword(keyword_list)
print(keyword_token_list)

['earthquake', 'forestfire', 'evacuation', 'wildfires', 'wildfires', 'wildfires', 'flood', 'fire', 'evacuation']

[['kw_earthquake'], ['kw_forestfire'], ['kw_evacuation'], ['kw_wildfires'], ['kw_wildfires'], ['kw_wildfires'], ['kw_flood'], ['kw_fire'], ['kw_evacuation']]


In [106]:
# Fit the binarizer on the keyword token lists: one row per record, one 0/1 column per class.
onehot_multi_keyword = MultiLabelBinarizer()
onehot_keyword_nolabel = onehot_multi_keyword.fit_transform(keyword_token_list)
num_row, num_col = onehot_keyword_nolabel.shape

print(onehot_keyword_nolabel, onehot_keyword_nolabel.shape, type(onehot_keyword_nolabel), '', sep='\n')

# Class names as a single (1, num_col) row so they can serve as a header row later.
keyword_labels = onehot_multi_keyword.classes_.reshape(1, num_col)

print(keyword_labels, keyword_labels.shape, type(keyword_labels), sep='\n')

[[1 0 0 0 0 0]
 [0 0 0 0 1 0]
 [0 1 0 0 0 0]
 [0 0 0 0 0 1]
 [0 0 0 0 0 1]
 [0 0 0 0 0 1]
 [0 0 0 1 0 0]
 [0 0 1 0 0 0]
 [0 1 0 0 0 0]]
(9, 6)
<class 'numpy.ndarray'>

[['kw_earthquake' 'kw_evacuation' 'kw_fire' 'kw_flood' 'kw_forestfire'
  'kw_wildfires']]
(1, 6)
<class 'numpy.ndarray'>


In [107]:
# Put the label row on top of the indicator rows (axis 0 is np.concatenate's default).
onehot_keyword = np.concatenate([keyword_labels, onehot_keyword_nolabel])
print(onehot_keyword)

[['kw_earthquake' 'kw_evacuation' 'kw_fire' 'kw_flood' 'kw_forestfire'
  'kw_wildfires']
 [1 0 0 0 0 0]
 [0 0 0 0 1 0]
 [0 1 0 0 0 0]
 [0 0 0 0 0 1]
 [0 0 0 0 0 1]
 [0 0 0 0 0 1]
 [0 0 0 1 0 0]
 [0 0 1 0 0 0]
 [0 1 0 0 0 0]]


### **CONCATENATE KEYWORD AND TEXT ONE HOT DATA**

In [110]:
# Place keyword columns and text columns side by side; both arrays carry their label
# row as row 0, so the header rows line up as well.
onehot_train = np.concatenate([onehot_keyword, onehot_text], axis=1)
print(onehot_train)

[['kw_earthquake' 'kw_evacuation' 'kw_fire' 'kw_flood' 'kw_forestfire'
  'kw_wildfires' 'build' 'california' 'caus' 'emerg' 'evacu' 'fire'
  'flood' 'got' 'peopl' 'see' 'us']
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 1 0 0 1 0 0 0 1 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0]
 [0 1 0 0 0 0 1 0 0 1 1 0 0 0 0 0 0]]


### **CONVERT NUMPY ARRAY TO DATAFRAME**

In [111]:
# Row 0 of onehot_train holds the column names; every following row is data.
# NOTE(review): labels and indicators share one array, so the columns are probably not a
# numeric dtype - confirm before feeding train_df to a model directly.
train_df = pd.DataFrame(onehot_train[1:], columns=onehot_train[0])
train_df

Unnamed: 0,kw_earthquake,kw_evacuation,kw_fire,kw_flood,kw_forestfire,kw_wildfires,build,california,caus,emerg,evacu,fire,flood,got,peopl,see,us
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0
4,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0
5,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0
6,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0
7,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0
8,0,1,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0


In [23]:
def dataframe_to_csv(dataframe, filepath):
    """
    Write ``dataframe`` to ``filepath`` as UTF-8 CSV without the index column.

    ARGUMENTS:
        dataframe: pandas DataFrame to save
        filepath:  destination path; a missing parent directory (e.g. 'data/bow/')
                   is created first
    RETURN: None
    """
    # Only path-like targets have a parent directory to create; anything else is
    # handed to pandas unchanged.
    if isinstance(filepath, (str, os.PathLike)):
        parent = os.path.dirname(os.fspath(filepath))
        if parent:                                  # '' means the current directory
            os.makedirs(parent, exist_ok=True)
    dataframe.to_csv(filepath, encoding='utf-8', index=False)
    
def csv_to_dataframe(filepath):
    """
    ARGUMENT: path to a CSV file
    RETURN: DataFrame read by pd.read_csv with its default settings
    """
    return pd.read_csv(filepath)

In [113]:
# Output path for the feature table built from the small sample files.
DATA_FILE_BOW = os.path.join(DATA_DIR, "bow/train_bow9.csv")

# Written as CSV without the index column (see dataframe_to_csv).
dataframe_to_csv(train_df, DATA_FILE_BOW)

### **NOW WE CREATE COMPLETE TRAINING DATASET**

In [24]:
# From here on DATA_FILE_TEXT and DATA_FILE_KEYWORD point at the full training files;
# this rebinds the names that earlier held the small-sample paths, so re-running the
# small-dataset cells after this one reads the full files instead.
DATA_FILE_TEXT = os.path.join(DATA_DIR, "train_text.csv")           
DATA_FILE_KEYWORD = os.path.join(DATA_DIR, "train_keyword.csv")
DATA_FILE_TARGET = os.path.join(DATA_DIR, "train_target.csv")

In [25]:
# USE LARGEST SET OF BAG OF WORDS (2944 TERMS)

# Bag-of-words vocabulary, resolved through DATA_DIR like the other input files.
# token_list is also read as a global by clean_doc_text_bow().
token_list = csv_to_list(os.path.join(DATA_DIR, "tokens/train_text_token4.csv"))
print(token_list[0:10])
print('')

# Raw texts of the full training file (only the first three are shown).
text_list = csv_to_list_of_strings(DATA_FILE_TEXT)
print(text_list[0:3])
print('')

# One token list per text, restricted to the vocabulary loaded above.
text_token_list = list_of_token_lists_text(text_list)
print(text_token_list[0:3])


['reason', '#earthquak', 'may', 'allah', 'us', 'forest', 'fire', 'near', 'la', 'canada']

['Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all', 'Forest fire near La Ronge Sask. Canada', "All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected"]

[['reason', '#earthquak', 'may', 'allah', 'us'], ['forest', 'fire', 'near', 'la', 'canada'], ['resid', 'ask', 'shelter', 'place', 'offic', 'evacu', 'shelter', 'place', 'order', 'expect']]


In [26]:
# Fit the binarizer on the full set of token lists: one row per text, one 0/1 column per class.
onehot_multi_text = MultiLabelBinarizer()
onehot_text_nolabel = onehot_multi_text.fit_transform(text_token_list)
num_row, num_col = onehot_text_nolabel.shape

print(onehot_text_nolabel, onehot_text_nolabel.shape, '', sep='\n')

# Class names as a single (1, num_col) row so they can serve as a header row later.
text_labels = onehot_multi_text.classes_.reshape(1, num_col)

print(text_labels, text_labels.shape, type(text_labels), sep='\n')

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
(7597, 2944)

[['#1' '#360wisenew' '#7' ... '\x89û÷polit' 'åè' 'åê']]
(1, 2944)
<class 'numpy.ndarray'>


In [27]:
# Put the label row on top of the indicator rows (axis 0 is np.concatenate's default).
onehot_text = np.concatenate([text_labels, onehot_text_nolabel])
print(onehot_text)

[['#1' '#360wisenew' '#7' ... '\x89û÷polit' 'åè' 'åê']
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [28]:
# Keywords of the full training file; show the first ten, then an empty line.
keyword_list = csv_to_list_of_strings(DATA_FILE_KEYWORD)
print(keyword_list[0:10], end='\n\n')

# One 'kw_'-prefixed token list per keyword.
keyword_token_list = list_of_token_lists_keyword(keyword_list)
print(keyword_token_list[0:10])

['earthquake', 'forestfire', 'evacuation', 'wildfires', 'wildfires', 'wildfires', 'flood', 'fire', 'evacuation', 'tornado']

[['kw_earthquake'], ['kw_forestfire'], ['kw_evacuation'], ['kw_wildfires'], ['kw_wildfires'], ['kw_wildfires'], ['kw_flood'], ['kw_fire'], ['kw_evacuation'], ['kw_tornado']]


In [29]:
# Fit the binarizer on the full keyword token lists: one row per record, one 0/1 column per class.
onehot_multi_keyword = MultiLabelBinarizer()
onehot_keyword_nolabel = onehot_multi_keyword.fit_transform(keyword_token_list)
num_row, num_col = onehot_keyword_nolabel.shape

print(onehot_keyword_nolabel, onehot_keyword_nolabel.shape, '', sep='\n')

# Class names as a single (1, num_col) row so they can serve as a header row later.
keyword_labels = onehot_multi_keyword.classes_.reshape(1, num_col)

print(keyword_labels, keyword_labels.shape, type(keyword_labels), sep='\n')

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
(7597, 227)

[['kw_ablaze' 'kw_accident' 'kw_aftershock' 'kw_airplane_accident'
  'kw_ambulance' 'kw_annihilated' 'kw_annihilation' 'kw_apocalypse'
  'kw_armageddon' 'kw_army' 'kw_arson' 'kw_arsonist' 'kw_attack'
  'kw_attacked' 'kw_avalanche' 'kw_battle' 'kw_bioterror'
  'kw_bioterrorism' 'kw_blaze' 'kw_blazing' 'kw_bleeding' 'kw_blew_up'
  'kw_blight' 'kw_blizzard' 'kw_blood' 'kw_bloody' 'kw_blown_up'
  'kw_body_bag' 'kw_body_bagging' 'kw_body_bags' 'kw_bomb' 'kw_bombed'
  'kw_bombing' 'kw_bridge_collapse' 'kw_buildings_burning'
  'kw_buildings_on_fire' 'kw_burned' 'kw_burning' 'kw_burning_buildings'
  'kw_bush_fires' 'kw_casualties' 'kw_casualty' 'kw_catastrophe'
  'kw_catastrophic' 'kw_chemical_emergency' 'kw_cliff_fall' 'kw_collapse'
  'kw_collapsed' 'kw_collide' 'kw_collided' 'kw_collision' 'kw_crash'
  'kw_crashed' 'kw_crush' 'kw_crushed' 'kw_curfew' 'kw_cyclon

In [30]:
# Put the label row on top of the indicator rows (axis 0 is np.concatenate's default).
onehot_keyword = np.concatenate([keyword_labels, onehot_keyword_nolabel])
print(onehot_keyword)

[['kw_ablaze' 'kw_accident' 'kw_aftershock' ... 'kw_wreck' 'kw_wreckage'
  'kw_wrecked']
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [31]:
# Place keyword columns and text columns side by side; both arrays carry their label
# row as row 0, so the header rows line up as well.
onehot_train = np.concatenate([onehot_keyword, onehot_text], axis=1)
print(onehot_train)

[['kw_ablaze' 'kw_accident' 'kw_aftershock' ... '\x89û÷polit' 'åè' 'åê']
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [32]:
# Row 0 of onehot_train holds the column names; every following row is data.
# NOTE(review): labels and indicators share one array, so the columns are probably not a
# numeric dtype - confirm before feeding train_df to a model directly.
train_df = pd.DataFrame(onehot_train[1:], columns=onehot_train[0])
train_df

Unnamed: 0,kw_ablaze,kw_accident,kw_aftershock,kw_airplane_accident,kw_ambulance,kw_annihilated,kw_annihilation,kw_apocalypse,kw_armageddon,kw_army,...,ûïa,ûïthe,ûïwe,ûïwhen,ûò,ûó,û÷extrem,û÷polit,åè,åê
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7592,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7593,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7594,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7595,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### **APPEND TARGET COLUMN TO TRAINING SET**

In [45]:
# Load the target file into a DataFrame; the recorded output shows a single 'target' column.
train_target = csv_to_dataframe(DATA_FILE_TARGET)
train_target

Unnamed: 0,target
0,1
1,1
2,1
3,1
4,1
...,...
7592,1
7593,1
7594,1
7595,1


In [46]:
# pd.concat(axis=1) aligns rows on the index and fills the gaps with NaN when the two
# frames differ in length, so a row-count mismatch would otherwise go unnoticed.
assert len(train_df) == len(train_target), (
    "feature rows (%d) and target rows (%d) differ" % (len(train_df), len(train_target))
)

# Append the target as the last column of the feature table.
train_df_with_target = pd.concat([train_df, train_target], axis=1)
train_df_with_target

Unnamed: 0,kw_ablaze,kw_accident,kw_aftershock,kw_airplane_accident,kw_ambulance,kw_annihilated,kw_annihilation,kw_apocalypse,kw_armageddon,kw_army,...,ûïthe,ûïwe,ûïwhen,ûò,ûó,û÷extrem,û÷polit,åè,åê,target
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7592,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7593,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7594,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7595,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


### **CONVERT DATAFRAME TO CSV FILE**

In [47]:
# Output path for the complete training table (features plus target column).
# This rebinds DATA_FILE_BOW, which the small-dataset section set to its own output path.
DATA_FILE_BOW = os.path.join(DATA_DIR, "bow/train_bow.csv")

# Written as CSV without the index column (see dataframe_to_csv).
dataframe_to_csv(train_df_with_target, DATA_FILE_BOW)