In [1]:
# Imports
from bs4 import BeautifulSoup
from collections import Counter, defaultdict
import csv
import pickle
import re
import requests

In [9]:
# Raw input files and the column names assigned to them when reading.
train_path = 'train.csv'
train_fieldnames = [
    'pid', 'survived', 'pclass', 'name', 'sex', 'age',
    'sisp', 'pach', 'ticket', 'fare', 'cabin', 'port',
]
test_path = 'test.csv'
# The test file has the same columns as the training file minus 'survived'.
test_fieldnames = [field for field in train_fieldnames if field != 'survived']

# Cleaned output files: 'name' is replaced by the derived 'honor' and 'origin' columns.
clean_train_path = 'cleaned_train.csv'
clean_train_fieldnames = [
    'pid', 'survived', 'pclass', 'honor', 'origin', 'sex', 'age',
    'sisp', 'pach', 'ticket', 'fare', 'cabin', 'port',
]
clean_test_path = 'cleaned_test.csv'
clean_test_fieldnames = [field for field in clean_train_fieldnames if field != 'survived']

In [10]:
# Get all values in input data as dict {fieldname: set_of_values}
def get_values(train_path, train_fieldnames, test_path, test_fieldnames):
    """Collect every distinct raw value seen for each column across both input files.

    Args:
        train_path: Path of the training CSV file.
        train_fieldnames: Column names to assign to the training file's fields.
        test_path: Path of the test CSV file.
        test_fieldnames: Column names to assign to the test file's fields.

    Returns:
        A ``defaultdict(set)`` mapping each field name to the set of raw
        values found under it in either file. The first row of each file is
        treated as a header and skipped.

    Side effects:
        Registers the csv dialect ``'mixed'``.
    """
    input_values = defaultdict(set)
    csv.register_dialect('mixed', delimiter=',', escapechar=None, quoting=csv.QUOTE_MINIMAL)
    sources = ((train_path, train_fieldnames), (test_path, test_fieldnames))
    for path, fieldnames in sources:
        # newline='' is what the csv module expects: without it, line breaks
        # embedded in quoted fields are rewritten by universal-newline handling.
        with open(path, mode='rt', errors='ignore', newline='') as handle:
            reader = csv.DictReader(handle, fieldnames=fieldnames, dialect='mixed')
            next(reader, None)  # skip header
            for row in reader:
                for field, value in row.items():
                    input_values[field].add(value)
    return input_values

In [11]:
# Gather the distinct raw values per column from both files; the categorical
# converters defined further down are built from these sets.
input_values = get_values(train_path, train_fieldnames, test_path, test_fieldnames)

In [12]:
# Sanity check: show which column names were collected.
input_values.keys()

dict_keys(['pid', 'survived', 'pclass', 'name', 'sex', 'age', 'sisp', 'pach', 'ticket', 'fare', 'cabin', 'port'])

In [15]:
# Conversion functions for cleaning data
def identity(x):
    """Return the value unchanged (for columns that need no conversion)."""
    return x

def ints(values):
    """Build a converter that maps each distinct categorical value to an integer code.

    Args:
        values: Iterable of the raw values observed for a column; the empty
            string denotes a missing value.

    Returns:
        A one-argument callable returning the integer code of a raw value.
        Non-empty values are numbered from 0 in sorted order and the empty
        string always receives the highest code. The callable raises
        ``KeyError`` for a value that was not in ``values``.
    """
    # Work on a sorted copy: the caller's collection is left untouched, and the
    # codes no longer depend on set iteration order, which differs between
    # interpreter runs for strings. key=str keeps the sort safe if a
    # non-string (e.g. None) is present.
    known = sorted({v for v in values if v != ''}, key=str)
    conversion = {value: code for code, value in enumerate(known)}
    conversion[''] = len(conversion)
    return lambda v: conversion[v]

def bins(uppers):
    """Build a converter that assigns a numeric string to a bin index.

    Args:
        uppers: Sequence of exclusive upper bounds, one per bin.

    Returns:
        A one-argument callable. The raw value is parsed as a float and
        truncated to an int; the result is the index of the first bound the
        number is strictly below, or ``len(uppers)`` when it is below none of
        them. An empty string (missing value) maps to ``len(uppers) + 1``.
    """
    def conversion(v):
        overflow = len(uppers)
        if v == '':
            return overflow + 1
        number = int(float(v))
        return next(
            (index for index, bound in enumerate(uppers) if number < bound),
            overflow,
        )
    return conversion

honorifics = {
    'Mr' : 1,
    'Don': 2, 'Sir': 2, 'Rev': 2, 'Dr': 2, 'Major': 2, 'Col': 2, 'Capt': 2, 'Jonkheer': 2,
    'Master': 3,
    'Mlle': 4, 'Ms': 4, 'Miss': 4,
    'Mrs': 5, 'Mme': 5, 'Dona': 5, 'Lady': 5, 'the Countess': 5
}

# Load the cached name-origin lookups, stored as a two-item sequence of
# (first-name origins, surname origins).
# NOTE(review): pickle.load can execute arbitrary code; only load a file that
# was produced locally by this notebook.
with open('name_origins.pickle', 'rb') as pickle_file:
    first_name_origins, last_name_origins = pickle.load(pickle_file)

# Name-origin label -> integer group code. Labels in the same tuple share a
# code; 'unknown' has a code of its own.
origin_map = {'unknown': 5}
origin_map.update(dict.fromkeys(
    ('English', 'Irish', 'Cornish', 'Welsh', 'Scottish'), 0))
origin_map.update(dict.fromkeys(
    ('French', 'Romance', 'Italian', 'Portuguese', 'Spanish', 'Catalan', 'Roman'), 1))
origin_map.update(dict.fromkeys(
    ('Swedish', 'Finnish', 'Danish', 'Norwegian', 'Icelandic', 'Mythology'), 2))
origin_map.update(dict.fromkeys(
    ('German', 'Dutch', 'Polish', 'Hungarian', 'Czech', 'Slovak'), 3))
origin_map.update(dict.fromkeys(
    (
        'Hebrew', 'Biblical', 'Yiddish', 'Jewish',
        'Armenian', 'Georgian', 'Slovene', 'Turkish',
        'Russian', 'Greek', 'Serbian', 'Bulgarian',
        'Ukrainian', 'Bosnian', 'Lithuanian', 'Croatian', 'Estonian',
        'Punjabi', 'Culture', 'Chinese', 'Urdu', 'African', 'Arabic',
    ),
    4,
))

def split_name(name):
    """Derive the honorific code and name-origin code from a raw passenger name.

    The name is expected to look like ``'Surname, Title. Given ...'`` with an
    optional ``(`` directly before the given name.

    Args:
        name: Raw name string.

    Returns:
        ``(honor, origin)`` as integer codes, or ``None`` (after printing a
        message) when the name does not match the expected pattern.

    Raises:
        KeyError: If the title, surname, given name, or an origin label is
            missing from its lookup table.
    """
    match = re.search(r'(\w+), ([\w+ ]+)\. \(?(\w+)', name)
    if not match:
        print('Regex Error: ', name)
        return None
    surname, title, given = match.group(1), match.group(2), match.group(3)
    honor = honorifics[title]
    last = origin_map[last_name_origins[surname]]
    first = origin_map[first_name_origins[given]]
    # first/last are integer codes, so "unknown" must be tested against its
    # code; comparing to the string 'unknown' never matched, which meant an
    # unknown first-name origin was never replaced by the surname's origin.
    unknown = origin_map['unknown']
    # Prefer the first name's origin; fall back to the surname's only when the
    # first name's origin is unknown.
    origin = last if first == unknown else first
    return (honor, origin)

def split_ticket(ticket):
    """Reduce a raw ticket string to a length-based feature.

    Looks for a run of digits at the end of the string, or the literal
    ``LINE``, and returns the length of that match, with every length of 4 or
    less reported as 4.

    Args:
        ticket: Raw ticket string.

    Returns:
        The integer feature, or ``None`` (after printing a message) when the
        ticket contains neither pattern.
    """
    found = re.search(r'\d+$|LINE', ticket)
    if found is None:
        print('Ticket Error: ', ticket)
        return None
    return max(4, len(found.group(0)))

In [16]:
# Column name -> function that converts one raw string value into its cleaned
# form. 'name' is the only converter that yields two values (honor, origin).
converters = dict(
    pid=identity,
    survived=identity,
    pclass=ints(input_values['pclass']),
    name=split_name,
    sex=ints(input_values['sex']),
    age=bins([3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 35, 40, 50]),
    sisp=bins([1, 2, 3]),
    pach=bins([1]),
    ticket=split_ticket,
    fare=bins([10, 20, 30, 50, 80]),
    # 1 when a cabin value is present, 0 when it is empty.
    cabin=lambda s: int(bool(s)),
    port=ints(input_values['port']),
)

In [17]:
# Clean data and output to file for further processing
def clean_data(in_path, in_fieldnames, out_path, out_fieldnames):
    """Convert every row of a raw CSV file and write the result to a new CSV file.

    Each field is passed through the matching entry of the module-level
    ``converters`` mapping. The ``name`` column is replaced by the two derived
    columns ``honor`` and ``origin``.

    Args:
        in_path: Path of the raw input CSV; its first row is skipped as a header.
        in_fieldnames: Column names to assign to the input fields.
        out_path: Path of the output CSV; any existing contents are replaced.
        out_fieldnames: Column names (and order) of the output file.

    Raises:
        KeyError: If a row contains a field that has no converter.

    Side effects:
        Registers the csv dialects ``'mixed'`` and ``'ints'``.
    """
    # Register both dialects here so this function works on a fresh kernel
    # without relying on get_values() having registered 'mixed' beforehand.
    csv.register_dialect('mixed', delimiter=',', escapechar=None, quoting=csv.QUOTE_MINIMAL)
    csv.register_dialect('ints', delimiter=',', escapechar=None, quoting=csv.QUOTE_NONE)
    # mode 'w' replaces the previous 'a+' followed by truncate(0); newline='' is
    # what the csv module expects for both reading and writing.
    with open(in_path, mode='rt', errors='ignore', newline='') as in_file, \
            open(out_path, mode='wt', errors='ignore', newline='') as out_file:
        reader = csv.DictReader(in_file, fieldnames=in_fieldnames, dialect='mixed')
        next(reader, None)  # skip header
        writer = csv.DictWriter(out_file, fieldnames=out_fieldnames, dialect='ints')
        writer.writeheader()
        for row in reader:
            cleaned = {}
            for field, raw in row.items():
                converted = converters[field](raw)
                if field == 'name':
                    # The name converter returns None for a name it cannot
                    # parse; write blank honor/origin cells instead of
                    # crashing on the tuple unpack.
                    cleaned['honor'], cleaned['origin'] = converted or (None, None)
                else:
                    cleaned[field] = converted
            writer.writerow(cleaned)

In [18]:
# Write the cleaned training and test sets; each call replaces the contents of
# its output file.
clean_data(train_path, train_fieldnames, clean_train_path, clean_train_fieldnames)
clean_data(test_path, test_fieldnames, clean_test_path, clean_test_fieldnames)

In [None]:
# Pickle results and don't repeat unless necessary!
# Make conversion maps for name origins
def make_name_maps():
    """Look up an origin label for every first name and surname in the input data.

    Reads the raw names from the module-level ``input_values['name']``,
    extracts the surname and the first given name from each, and calls
    ``name_origin`` once per distinct name over a shared requests session.
    Names that do not match the expected pattern are reported with a printed
    message and skipped.

    Returns:
        ``(forigins, lorigins)``: two ``defaultdict(dict)`` objects mapping
        first names and surnames, respectively, to their origin label.
    """
    pattern = re.compile(r'(\w+),[\w+ ]+\. \(?(\w+)')
    first_names, surnames = set(), set()
    for full_name in input_values['name']:
        found = pattern.search(full_name)
        if not found:
            print('Regex Error: ', full_name)
            continue
        surnames.add(found.group(1))
        first_names.add(found.group(2))
    forigins = defaultdict(dict)
    lorigins = defaultdict(dict)
    with requests.session() as s:
        # All first names are resolved before any surname.
        forigins.update((given, name_origin(given, s, surname=0)) for given in first_names)
        lorigins.update((family, name_origin(family, s, surname=1)) for family in surnames)
    return (forigins, lorigins)

def name_origin(name, session, surname=0):
    """Fetch the origin label for a name from behindthename.com.

    Args:
        name: The name to look up; surrounding whitespace is stripped before
            it is appended to the URL.
        session: Object with a requests-style ``get(url)`` method.
        surname: 0 to query the first-name site, 1 to query the surname site.

    Returns:
        ``'unknown'`` when the response status is not 200 or the page has no
        ``<a class="usg">`` element. Otherwise the text of the first such
        element with any `` (...)`` suffix removed, reduced to its last word
        when it consists of several words.
    """
    base_urls = ('https://www.behindthename.com/name/', 'https://surnames.behindthename.com/name/')
    # No stream=True: the whole body is read anyway, and a streamed response
    # that is abandoned on the non-200 early return would keep its connection
    # from being released back to the session's pool.
    response = session.get(base_urls[surname] + name.strip())
    if response.status_code != 200:
        return 'unknown'
    html = BeautifulSoup(response.content, 'html.parser')
    usage_link = html.find('a', attrs={'class': 'usg'})
    if not usage_link:
        return 'unknown'
    # get_text() instead of .string: .string is None when the tag holds more
    # than a single text node, which would make re.sub raise TypeError.
    usage = re.sub(r' \(.*\)', '', usage_link.get_text())
    words = usage.split()
    return words[-1] if len(words) > 1 else usage

#forig, lorig = make_name_maps()
#to_save = [forig, lorig]
#with open('name_origins.pickle', 'wb') as dump:
#    pickle.dump(to_save, dump)