# Aim

1. Clean and Featurize data

In [17]:
import os
from datetime import datetime

import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz
from nltk.corpus import stopwords
from pandarallel import pandarallel

# from gensim.corpora import Dictionary
# from gensim.models import TfidfModel
# from gensim.utils import simple_preprocess
# from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

In [2]:
DATA_ROOT = "../data"

# os.makedirs(f"{DATA_ROOT}/train/features")

In [3]:
df_train = pd.read_pickle(f"{DATA_ROOT}/train/raw/data.pkl")
df_test = pd.read_pickle(f"{DATA_ROOT}/test/raw/data.pkl")

In [4]:
df_train.head(2)

Unnamed: 0,ID,Source,TMC,Start_Time,Distance(mi),Description,Side,City,County,State,...,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight,Severity
0,A-2478859,Bing,0.0,2016-02-08 00:37:08,3.23,Between Sawmill Rd/Exit 20 and OH-315/Olentang...,R,Dublin,Franklin,OH,...,False,False,False,False,False,Night,Night,Night,Night,3
1,A-1,MapQuest,201.0,2016-02-08 05:46:00,0.01,Right lane blocked due to accident on I-70 Eas...,R,Dayton,Montgomery,OH,...,False,False,False,False,False,Night,Night,Night,Night,3


In [5]:
df_test.head(2)

Unnamed: 0,ID,Source,TMC,Start_Time,Distance(mi),Description,Side,City,County,State,...,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight,Severity
0,A-3017746,Bing,0.0,2020-01-01 00:01:00,0.0,At Hampshire Rd/Exit 41 - Accident.,R,Westlake Village,Ventura,CA,...,False,False,False,False,False,Night,Night,Night,Night,2
1,A-3017745,Bing,0.0,2020-01-01 00:02:00,0.0,At Sheep Creek Rd - Accident.,L,Phelan,San Bernardino,CA,...,False,False,False,True,False,Night,Night,Night,Night,2


In [6]:
df_train.columns

Index(['ID', 'Source', 'TMC', 'Start_Time', 'Distance(mi)', 'Description',
       'Side', 'City', 'County', 'State', 'Zipcode', 'Timezone',
       'Airport_Code', 'Temperature(F)', 'Humidity(%)', 'Pressure(in)',
       'Visibility(mi)', 'Wind_Direction', 'Wind_Speed(mph)',
       'Weather_Condition', 'Amenity', 'Bump', 'Crossing', 'Give_Way',
       'Junction', 'No_Exit', 'Railway', 'Roundabout', 'Station', 'Stop',
       'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop', 'Sunrise_Sunset',
       'Civil_Twilight', 'Nautical_Twilight', 'Astronomical_Twilight',
       'Severity'],
      dtype='object')

## Cleaning : `Wind_Direction`

In [7]:
df_train["Wind_Direction"].unique()

array(['SW', 'Calm', 'SSW', 'WSW', 'WNW', 'NW', 'West', 'NNW', 'NNE',
       'South', 'W', 'North', 'Variable', 'SSE', 'SE', 'ESE', 'none',
       'East', 'NE', 'ENE', 'E', 'CALM', 'S', 'VAR', 'N'], dtype=object)

1. convert to upper case
1. replace VAR with variable

In [8]:
def clean_wind_direction(df):
    df["Wind_Direction"] = df["Wind_Direction"].str.upper()
    df["Wind_Direction"] = df["Wind_Direction"].apply(
        lambda x: "variable" if x == "VAR" else x
    )

    return df


df_train = clean_wind_direction(df_train)
df_test = clean_wind_direction(df_test)

In [9]:
print(sorted(df_train["Wind_Direction"].unique()))

['CALM', 'E', 'EAST', 'ENE', 'ESE', 'N', 'NE', 'NNE', 'NNW', 'NONE', 'NORTH', 'NW', 'S', 'SE', 'SOUTH', 'SSE', 'SSW', 'SW', 'VARIABLE', 'W', 'WEST', 'WNW', 'WSW', 'variable']


In [10]:
print(sorted(df_test["Wind_Direction"].unique()))

['CALM', 'E', 'ENE', 'ESE', 'N', 'NE', 'NNE', 'NNW', 'NONE', 'NW', 'S', 'SE', 'SSE', 'SSW', 'SW', 'W', 'WNW', 'WSW', 'variable']


## Cleaning : `Description`

In [18]:
import html
import re
import string
import unicodedata

from bs4 import BeautifulSoup


def rm_numbers(x):
    x = re.sub(r"[0-9]+", "", x)
    return x


def rm_html(x):
    x = html.unescape(x)
    x = BeautifulSoup(x).get_text()
    return x


def rm_url(x):
    x = re.sub("http*\S+", " ", x)
    return x


def rm_multiple_dots(x):
    x = re.sub(r"\.+", ". ", x)
    x = re.sub("\।+", ". ", x)
    return x


def rm_unicode(x):
    x = unicodedata.normalize("NFKD", x)
    return x


def rm_punctuation(x):
    x = re.sub("[%s]" % re.escape(string.punctuation.replace(".", "")), " ", x)
    return x


def rm_spaces(x):
    x = re.sub(" +", " ", x)
    return x


def rm_word(x, word):
    x = x.replace(word, "")
    return x


def clean_text(input_string):
    ss = input_string
    ss = rm_html(ss)
    ss = rm_url(ss)
    ss = rm_punctuation(ss)
    ss = rm_multiple_dots(ss)
    ss = rm_unicode(ss)
    ss = rm_spaces(ss)
    # ss = rm_numbers(ss)
    ss = rm_word(ss, "\n")
    ss = rm_word(ss, "\t")
    ss = ss.strip()

    return ss

In [20]:
pandarallel.initialize(verbose=True,)
df_train["Description"] = df_train["Description"].parallel_apply(
    lambda x: clean_text(x)
)

In [26]:
df_train["Description"].sample(5).values

array(['Accident on US 101 Oregon Coast Hwy near Cedar St.',
       'Accident on Bingle Rd at Houston Rosslyn Rd.',
       'Accident on Six Forks Rd at Lead Mine Rd.',
       'Right hand shoulder blocked due to accident on I 210 Eastbound before Exit 19 CA 2.',
       'Between VA 619 Exit 150 and VA 234 Exit 152 Accident.'],
      dtype=object)

In [24]:
pandarallel.initialize(verbose=True,)
df_test["Description"] = df_test["Description"].parallel_apply(lambda x: clean_text(x))

In [27]:
df_test["Description"].sample(5).values

array(['At Old Hiway Accident.',
       'Lane blocked due to accident on I 385 Northbound near Exit 34 Butler Rd.',
       'Accident on MN 36 Westbound at CR 35 Hadley Ave.',
       'At Southwood Plantation Rd Accident.',
       'Right lane blocked due to accident on Sam Houston Tlwy Eastbound at I 45 Gulf Fwy Exit 32.'],
      dtype=object)

## Featurizing : `Description`

1. Get K keywords
1. create a binary vector of dimension K
1. if presence of word_x mark that dimension 1

Why this feature?

1. this will allow us to capture important words describing an accident
1. these words inturn might help in identifying severity

- [YAKE github](https://github.com/LIAAD/yake)
- [Reference - Key word extractor](https://towardsdatascience.com/keyword-extraction-process-in-python-with-natural-language-processing-nlp-d769a9069d5c)

YAKE is a lightweight, unsupervised automatic keyword extraction method that relies on statistical text features extracted from individual documents to identify the most relevant keywords in the text.

In [13]:
import yake

kw_extractor = yake.KeywordExtractor()
text = ".".join(
    df_train["Description"].sample(n=100_000).tolist()
)  # extracting from a sample as compute intensive process
language = "en"
max_ngram_size = 1
deduplication_threshold = 0.1
numOfKeywords = 1000
custom_kw_extractor = yake.KeywordExtractor(
    lan=language,
    n=max_ngram_size,
    dedupLim=deduplication_threshold,
    top=numOfKeywords,
    features=None,
    stopwords=stopwords.words("english"),
)
keywords = custom_kw_extractor.extract_keywords(text)

In [28]:
# The lower the score, the more relevant the keyword is.
keywords

[('Accident', 6.169566081677733e-09),
 ('Northbound', 1.3069501235051463e-07),
 ('Hwy', 2.738028950102289e-06),
 ('ramp', 4.8401705088886265e-06),
 ('slow', 8.777147725501163e-06),
 ('Trl', 0.00014915096836162105),
 ('Mopac', 0.001457774067552258),
 ('Okeechobee', 0.0015743939528227214),
 ('Brookshire', 0.0016599371751449834),
 ('Huntington', 0.002962632417734576),
 ('NYS', 0.005783750908517111),
 ('Fuqua', 0.009709526119156922),
 ('Middlefield', 0.055070995136181314),
 ('JFK', 0.07611004069260154),
 ('Cedarhurst', 0.1031640040545067),
 ('57-56', 0.12423963791402771),
 ('PGBT.', 0.3325493080742511),
 ('Rhinecliff', 0.6395068253572433),
 ('Chavaneaux', 0.6552093622826601),
 ('Gibsonburg', 0.677368534171853)]

In [30]:
pd.to_pickle(
    keywords, f"{DATA_ROOT}/train/keywords.pkl",
)

In [34]:
# using top 15 words
keywords_list = [i[0] for i in keywords[:15]]

In [35]:
keywords_list

['Accident',
 'Northbound',
 'Hwy',
 'ramp',
 'slow',
 'Trl',
 'Mopac',
 'Okeechobee',
 'Brookshire',
 'Huntington',
 'NYS',
 'Fuqua',
 'Middlefield',
 'JFK',
 'Cedarhurst']

In [45]:
fuzz.partial_ratio("hello world 2", "hello world")  # demo of partial_ratio

100

In [57]:
def get_kw_vec(x, kw_list):
    vec = [fuzz.partial_ratio(i.lower(), x.lower()) for i in kw_list]
    vec = np.array(vec)
    vec = np.where(vec > 60, 1, 0).tolist()
    return vec

In [58]:
pandarallel.initialize(verbose=True,)
df_train["kw_vec"] = df_train["Description"].parallel_apply(
    lambda x: get_kw_vec(x, keywords_list)
)

In [59]:
pandarallel.initialize(verbose=True,)
df_test["kw_vec"] = df_test["Description"].parallel_apply(
    lambda x: get_kw_vec(x, keywords_list)
)

In [62]:
df_train["kw_vec"].sample(5).head()

2428374    [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
209306     [1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0]
849060     [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
70600      [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
1845646    [1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Name: kw_vec, dtype: object

## Featurizing : `Zipcode`



[Reference encoding zipcode](https://stats.stackexchange.com/questions/94902/how-to-represent-geography-or-zip-code-in-machine-learning-model-or-recommender)
- As we are using decision trees we do not need to use one hot encoding
- Decision trees handle categorical variables

Why breakdown zip code?
- zip codes are hierarchical in nature
- using first N digits of zipcode will give us some understanding of the region 
- using next N digits will give understanding of next sub regions
- we also reduce the number of unique zipcodes this way
- this will enable model to learn better

In [64]:
df_train["zip_02"] = df_train["Zipcode"].str[:2]
df_test["zip_02"] = df_test["Zipcode"].str[:2]

In [65]:
df_train["zip_25"] = df_train["Zipcode"].str[2:5]
df_test["zip_25"] = df_test["Zipcode"].str[2:5]

Zip codes are of varying length that could be one of the features

`zip_len` could be one of the features


In [66]:
df_train["zip_len"] = df_train["Zipcode"].apply(len)
df_test["zip_len"] = df_test["Zipcode"].apply(len)

We can observe some compound zip codes like 

1. 43068-3402
1. 93401-8325
1. 60607-3612

`is_compound` could be one of the boolean variables

In [67]:
df_train["zip_is_compound"] = df_train["Zipcode"].apply(lambda x: "-" in x)
df_test["zip_is_compound"] = df_test["Zipcode"].apply(lambda x: "-" in x)

# Final features list

In [70]:
final_feature_list = [
    "ID",  # will be removing this before preparing modelling data
    "Source",
    "TMC",
    # "Start_Time", -> removing this as data is now sorted and split
    "Distance(mi)",
    # "Description", -> extracted features & removing this
    "Side",
    "City",
    "County",
    "State",
    # "Zipcode", -> extracted features & removing this
    "Timezone",
    "Airport_Code",
    "Temperature(F)",
    "Humidity(%)",
    "Pressure(in)",
    "Visibility(mi)",
    "Wind_Direction",
    "Wind_Speed(mph)",
    "Weather_Condition",
    "Amenity",
    "Bump",
    "Crossing",
    "Give_Way",
    "Junction",
    "No_Exit",
    "Railway",
    "Roundabout",
    "Station",
    "Stop",
    "Traffic_Calming",
    "Traffic_Signal",
    "Turning_Loop",
    "Sunrise_Sunset",
    "Civil_Twilight",
    "Nautical_Twilight",
    "Astronomical_Twilight",
    # engineered features
    "kw_vec",
    "zip_02",
    "zip_25",
    "zip_len",
    "zip_is_compound",
    # to predict
    "Severity",
]

In [71]:
os.makedirs(f"{DATA_ROOT}/train/featurized/", exist_ok=True)
os.makedirs(f"{DATA_ROOT}/test/featurized/", exist_ok=True)

In [73]:
df_train[final_feature_list].to_pickle(f"{DATA_ROOT}/train/featurized/data.pkl")
df_test[final_feature_list].to_pickle(f"{DATA_ROOT}/test/featurized/data.pkl")