In [1]:
# import necessary libraries
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import re
import nltk
import string 

from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from textblob import TextBlob

In [3]:
# The datasets are downloadable at:
# https://www.kaggle.com/landlord/multilingual-disaster-response-messages
# Read the training, test and validation splits of the disaster response
# messages, in that order.
train, test, val = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../datasets/disaster_response_messages_training.csv',
        '../datasets/disaster_response_messages_test.csv',
        '../datasets/disaster_response_messages_validation.csv',
    )
)

In [4]:
# Show the column names of the validation split.
val.columns

Index(['id', 'split', 'message', 'original', 'genre', 'related', 'PII',
       'request', 'offer', 'aid_related', 'medical_help', 'medical_products',
       'search_and_rescue', 'security', 'military', 'child_alone', 'water',
       'food', 'shelter', 'clothing', 'money', 'missing_people', 'refugees',
       'death', 'other_aid', 'infrastructure_related', 'transport',
       'buildings', 'electricity', 'tools', 'hospitals', 'shops',
       'aid_centers', 'other_infrastructure', 'weather_related', 'floods',
       'storm', 'fire', 'earthquake', 'cold', 'other_weather',
       'direct_report'],
      dtype='object')

In [5]:
# Stack the training and test splits into a single frame (the validation
# split is not added here).
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0. As with append, the original row labels are kept
# (no ignore_index), so labels coming from the two splits can repeat.
df = pd.concat([train, test])
df.shape

(23675, 42)

In [6]:
# Remove the 'id', 'split' and 'original' columns, then keep only the first
# row for each distinct 'message' value.
df = (
    df
    .drop(columns=['id', 'split', 'original'])
    .drop_duplicates(subset='message')
)

In [7]:
# Encode the genre labels as integers: news -> 0, direct -> 1, social -> 2.
df['genre'] = df['genre'].replace({'direct': 1, 'news': 0, 'social': 2})

# Size features of each message as it stands at this point: number of
# characters and number of whitespace-separated tokens.
df['content_length'] = df['message'].map(len)
df['content_word_count'] = df['message'].map(lambda msg: len(msg.split()))

In [8]:
def custom_preprocessor(text):
    """Normalise one message string for text modelling.

    Steps, in order:
      1. strip HTML-style tags (``<...>``),
      2. lowercase,
      3. remove punctuation (anything that is neither a word character nor
         whitespace),
      4. remove the ASCII digits 0-9,
      5. lemmatize every whitespace-delimited token with WordNetLemmatizer
         (using the lemmatizer's default part of speech).

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The cleaned text. Whitespace is left exactly as it was, so deleting
        a token such as a number can leave consecutive spaces behind.
    """
    # Tags have to be stripped before punctuation: once '<' and '>' have been
    # deleted the tag pattern can no longer match anything.
    text = re.sub('(<.*?>)', '', text)  # removes html
    text = text.lower()  # lowercases words
    text = re.sub(r'[^\w\s]', '', text)  # removes punctuation
    # The range uses a plain ASCII hyphen; with an en dash the class would
    # only match the three characters '0', '–' and '9'.
    text = re.sub(r'[0-9]', '', text)  # removes any numbers
    # copied from https://swatimeena989.medium.com/beginners-guide-for-preprocessing-text-data-f3156bec85ca

    # WordNetLemmatizer.lemmatize expects a single word, so it is applied to
    # each token in place rather than to the whole message at once.
    lemmatizer = WordNetLemmatizer()
    text = re.sub(r'\S+', lambda match: lemmatizer.lemmatize(match.group()), text)

    return text

In [9]:
# Replace every message with its cleaned form from custom_preprocessor.
df['message'] = df['message'].map(custom_preprocessor)

In [11]:
# Print the per-column dtypes and non-null counts of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23609 entries, 0 to 2628
Data columns (total 41 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   message                 23609 non-null  object
 1   genre                   23609 non-null  int64 
 2   related                 23609 non-null  int64 
 3   PII                     23609 non-null  int64 
 4   request                 23609 non-null  int64 
 5   offer                   23609 non-null  int64 
 6   aid_related             23609 non-null  int64 
 7   medical_help            23609 non-null  int64 
 8   medical_products        23609 non-null  int64 
 9   search_and_rescue       23609 non-null  int64 
 10  security                23609 non-null  int64 
 11  military                23609 non-null  int64 
 12  child_alone             23609 non-null  int64 
 13  water                   23609 non-null  int64 
 14  food                    23609 non-null  int64 
 15  she

In [12]:
# Show the (rows, columns) size of the frame at this point.
df.shape

(23609, 41)

In [None]:
# Write the processed frame to the same relative datasets directory the raw
# CSVs were read from, instead of an absolute path that only exists on one
# machine.
# NOTE(review): assumes '../datasets/' resolves to the previous export
# location when run from this notebook's directory — confirm.
path = '../datasets/'
# index=True also writes the row labels as the first CSV column.
df.to_csv(path + 'df_clean.csv', index=True)