In [1]:
import collections
import numpy as np
import pandas as pd
import re

from argparse import Namespace

In [2]:
# Run configuration for the surname dataset split.
# NOTE: DATA_DIR is a machine-specific absolute Windows path. It is defined once here
# so that pointing the notebook at another checkout only needs a single edit.
DATA_DIR = "E:\\reading_books\\PyTorchNLPBook-master\\data\\surnames"

args = Namespace(
    # Input: raw CSV read by the loading cell.
    raw_dataset_csv=DATA_DIR + "\\surnames.csv",
    # Fraction of each nationality assigned to the train / val / test splits.
    train_proportion=0.7,
    val_proportion=0.15,
    test_proportion=0.15,
    # Output: CSV written by the final cell (records plus their split label).
    output_munged_csv=DATA_DIR + "\\surnames_with_splits.csv",
    # Seed for NumPy's global RNG so the per-class shuffle is reproducible.
    seed=1337,
)

In [3]:
# Read the raw data; header=0 takes the column names from the first line of the CSV.
surnames = pd.read_csv(args.raw_dataset_csv, header=0)

In [4]:
# Preview the first five rows of the raw table.
surnames.head()

Unnamed: 0,surname,nationality
0,Woodford,English
1,Coté,French
2,Kore,English
3,Koury,Arabic
4,Lebzak,Russian


In [5]:
# Unique classes: the distinct values found in the `nationality` column.
set(surnames.nationality)

{'Arabic',
 'Chinese',
 'Czech',
 'Dutch',
 'English',
 'French',
 'German',
 'Greek',
 'Irish',
 'Italian',
 'Japanese',
 'Korean',
 'Polish',
 'Portuguese',
 'Russian',
 'Scottish',
 'Spanish',
 'Vietnamese'}

In [6]:
# Group the records by nationality so that each class can be split on its own.
# `to_dict(orient="records")` produces one plain dict per row in a single pass,
# which avoids building a temporary Series for every row as `iterrows()` does.
# Records keep their original file order inside each nationality's list.
by_nationality = collections.defaultdict(list)
for record in surnames.to_dict(orient="records"):
    by_nationality[record["nationality"]].append(record)

## Experiment: the `DataFrame.iterrows()` method

In [7]:
# iterrows() returns an iterator of (index, row) pairs; nothing is consumed yet.
x = surnames.iterrows()

In [8]:
# Scratch state for the iterrows() experiment:
# i counts how many rows have been pulled from the iterator,
i = 0
# m collects the index value of each pulled row,
m = []
# n collects the row objects themselves.
n = []

In [9]:
# Pull the first six (index, row) pairs off the iterator, storing the index
# values in `m` and the row objects in `n`; stop once six have been recorded.
for index, a in x:
    m.append(index)
    n.append(a)
    i += 1
    if i > 5:
        break

In [10]:
# Index values of the sampled rows.
m

[0, 1, 2, 3, 4, 5]

In [11]:
# The sampled rows; each element is a pandas Series holding one row.
n

[surname        Woodford
 nationality     English
 Name: 0, dtype: object, surname          Coté
 nationality    French
 Name: 1, dtype: object, surname           Kore
 nationality    English
 Name: 2, dtype: object, surname         Koury
 nationality    Arabic
 Name: 3, dtype: object, surname         Lebzak
 nationality    Russian
 Name: 4, dtype: object, surname         Obinata
 nationality    Japanese
 Name: 5, dtype: object]

In [12]:
# First sampled row as a Series.
n[0]

surname        Woodford
nationality     English
Name: 0, dtype: object

In [13]:
# Series.to_dict() turns the row into a plain {column name: value} dict.
n[0].to_dict()

{'surname': 'Woodford', 'nationality': 'English'}

End of the `iterrows()` experiment.

In [7]:
# Inspect the grouping: nationality -> list of record dicts.
# NOTE(review): this echoes the whole mapping; consider showing only a small sample.
by_nationality

defaultdict(list,
            {'English': [{'surname': 'Woodford', 'nationality': 'English'},
              {'surname': 'Kore', 'nationality': 'English'},
              {'surname': 'Essop', 'nationality': 'English'},
              {'surname': 'Jefferson', 'nationality': 'English'},
              {'surname': 'Dorrington', 'nationality': 'English'},
              {'surname': 'Jeffries', 'nationality': 'English'},
              {'surname': 'Douthwaite', 'nationality': 'English'},
              {'surname': 'Readle', 'nationality': 'English'},
              {'surname': 'Jones', 'nationality': 'English'},
              {'surname': 'Topham', 'nationality': 'English'},
              {'surname': 'Bellamy', 'nationality': 'English'},
              {'surname': 'Leggett', 'nationality': 'English'},
              {'surname': 'Lilley', 'nationality': 'English'},
              {'surname': 'Ayliffe', 'nationality': 'English'},
              {'surname': 'Twiggs', 'nationality': 'English'},
            

In [12]:
# dict.items() returns a dict_items view over the mapping, not a list.
a = by_nationality.items() # a's type is dict_items

In [13]:
# Materialise the items view as a list of (nationality, records) tuples and
# order it by nationality; the result is a plain list.
b = list(by_nationality.items())
b.sort()
b

[('Arabic',
  [{'surname': 'Koury', 'nationality': 'Arabic'},
   {'surname': 'Rahal', 'nationality': 'Arabic'},
   {'surname': 'Mifsud', 'nationality': 'Arabic'},
   {'surname': 'Srour', 'nationality': 'Arabic'},
   {'surname': 'Boulos', 'nationality': 'Arabic'},
   {'surname': 'Hakimi', 'nationality': 'Arabic'},
   {'surname': 'Kalb', 'nationality': 'Arabic'},
   {'surname': 'Maloof', 'nationality': 'Arabic'},
   {'surname': 'Atiyeh', 'nationality': 'Arabic'},
   {'surname': 'Totah', 'nationality': 'Arabic'},
   {'surname': 'Isa', 'nationality': 'Arabic'},
   {'surname': 'Saliba', 'nationality': 'Arabic'},
   {'surname': 'Shalhoub', 'nationality': 'Arabic'},
   {'surname': 'Attia', 'nationality': 'Arabic'},
   {'surname': 'Tuma', 'nationality': 'Arabic'},
   {'surname': 'Totah', 'nationality': 'Arabic'},
   {'surname': 'Malouf', 'nationality': 'Arabic'},
   {'surname': 'Kalb', 'nationality': 'Arabic'},
   {'surname': 'Bazzi', 'nationality': 'Arabic'},
   {'surname': 'Mansour', 'nation

In [14]:
# Create split data
# Every nationality is shuffled and split independently, so each class keeps
# roughly the same train/val/test proportions.
proportion_total = args.train_proportion + args.val_proportion + args.test_proportion
assert abs(proportion_total - 1.0) < 1e-9, "split proportions must sum to 1"

final_list = []
np.random.seed(args.seed)
for _, item_list in sorted(by_nationality.items()):
    # Work on copies. Shuffling and tagging the lists/dicts owned by
    # `by_nationality` in place would make a second run of this cell start from
    # already-shuffled data and yield a different split despite the fixed seed.
    shuffled_items = [dict(item) for item in item_list]
    np.random.shuffle(shuffled_items)

    n_items = len(shuffled_items)
    n_train = int(args.train_proportion * n_items)
    n_val = int(args.val_proportion * n_items)
    # The test split takes everything that is left, so records that fall outside
    # the truncated int() counts above are not dropped.

    # Give each data point a split attribute
    for item in shuffled_items[:n_train]:
        item['split'] = 'train'
    for item in shuffled_items[n_train:n_train + n_val]:
        item['split'] = 'val'
    for item in shuffled_items[n_train + n_val:]:
        item['split'] = 'test'

    # Add to final list: extend() appends the individual records, whereas
    # append() would add the whole per-nationality list as a single element.
    final_list.extend(shuffled_items)

In [15]:
# Inspect the tagged records (each dict now carries a 'split' key).
# NOTE(review): this echoes every record; a slice such as final_list[:5] would keep the output short.
final_list

[{'surname': 'Totah', 'nationality': 'Arabic', 'split': 'train'},
 {'surname': 'Abboud', 'nationality': 'Arabic', 'split': 'train'},
 {'surname': 'Fakhoury', 'nationality': 'Arabic', 'split': 'train'},
 {'surname': 'Srour', 'nationality': 'Arabic', 'split': 'train'},
 {'surname': 'Sayegh', 'nationality': 'Arabic', 'split': 'train'},
 {'surname': 'Cham', 'nationality': 'Arabic', 'split': 'train'},
 {'surname': 'Haik', 'nationality': 'Arabic', 'split': 'train'},
 {'surname': 'Kattan', 'nationality': 'Arabic', 'split': 'train'},
 {'surname': 'Khouri', 'nationality': 'Arabic', 'split': 'train'},
 {'surname': 'Antoun', 'nationality': 'Arabic', 'split': 'train'},
 {'surname': 'Wasem', 'nationality': 'Arabic', 'split': 'train'},
 {'surname': 'Srour', 'nationality': 'Arabic', 'split': 'train'},
 {'surname': 'Seif', 'nationality': 'Arabic', 'split': 'train'},
 {'surname': 'Guirguis', 'nationality': 'Arabic', 'split': 'train'},
 {'surname': 'Sarkis', 'nationality': 'Arabic', 'split': 'train'},
 

In [17]:
# Build a DataFrame from the list of record dicts (one row per record).
# Nothing is written to disk here; the CSV is saved in the final cell.
final_surnames = pd.DataFrame(final_list)
final_surnames

Unnamed: 0,surname,nationality,split
0,Totah,Arabic,train
1,Abboud,Arabic,train
2,Fakhoury,Arabic,train
3,Srour,Arabic,train
4,Sayegh,Arabic,train
5,Cham,Arabic,train
6,Haik,Arabic,train
7,Kattan,Arabic,train
8,Khouri,Arabic,train
9,Antoun,Arabic,train


In [18]:
# Number of rows assigned to each split.
final_surnames.split.value_counts()

train    7680
test     1660
val      1640
Name: split, dtype: int64

In [19]:
# Preview the first five rows of the table that will be saved.
final_surnames.head()

Unnamed: 0,surname,nationality,split
0,Totah,Arabic,train
1,Abboud,Arabic,train
2,Fakhoury,Arabic,train
3,Srour,Arabic,train
4,Sayegh,Arabic,train


In [20]:
# Write munged data to CSV; index=False leaves the DataFrame's row index out of the file.
final_surnames.to_csv(args.output_munged_csv, index=False)