In [21]:
import string
from nltk.tokenize import word_tokenize
import pandas as pd

def labeling(label, string):
    """Tokenize ``string`` and pair every token with ``label``.

    Args:
        label: Tag attached to each token (callers in this file pass
            ``'BRAND'`` or ``'0'``).
        string: Text to split with ``word_tokenize``. Inside this function
            the parameter shadows the ``string`` module.

    Returns:
        A list of ``(token, label)`` tuples in token order.
    """
    tagged = []
    for token in word_tokenize(string):
        tagged.append((token, label))
    return tagged

def branding(df):
    """Label the tokens of one product title as brand / non-brand.

    Despite the parameter name, this is applied per row
    (``df.apply(branding, axis=1)``), so ``df`` is a single record exposing
    ``title`` and ``brand``.

    The brand is located with a case-insensitive substring search. Tokens
    inside the matched span are tagged ``'BRAND'``; everything else is tagged
    ``'0'``. If the brand is missing or does not occur in the title, every
    token is tagged ``'0'``.

    Args:
        df: Record with a string ``title`` and a ``brand`` attribute.

    Returns:
        A list of ``(token, label)`` tuples covering the whole title.
    """
    t, br = df.title, df.brand

    # A non-string brand (e.g. NaN for an empty CSV cell) cannot be searched
    # for; treat the title as containing no brand instead of crashing.
    if not isinstance(br, str):
        return labeling('0', t)

    start = t.lower().find(br.lower())
    if start == -1:
        # str.find reports "not found" as -1. Using -1 as a slice bound would
        # cut the title at the wrong places and duplicate or drop tokens.
        return labeling('0', t)

    end = start + len(br)
    return (
        labeling('0', t[:start])
        + labeling('BRAND', t[start:end])
        + labeling('0', t[end:])
    )

def word2features(row, i):
    """Build the feature dict for the ``i``-th token of a labeled title.

    Args:
        row: Record exposing ``title`` (a sequence of ``(token, label)``
            pairs) and ``root_cat``.
        i: Zero-based position of the token inside ``row.title``.

    Returns:
        A dict mapping feature names to values: features of the token
        itself, of its immediate neighbours (or ``'BOS'`` / ``'EOS'`` markers
        at the edges), and two-token context strings when two neighbours
        exist on that side.
    """
    tokens = row.title
    last = len(tokens) - 1

    def token_at(pos):
        # Only the token text is used here; the label is ignored.
        return str(tokens[pos][0])

    word = token_at(i)
    features = {
        'root_cat': row.root_cat,
        'bias': 1,
        'word_position': i,
        'word_lower': word.lower(),
        'word[-2:]': word[-2:],
        'word_isUpper': word.isupper(),
        'word_isTitle': word.istitle(),
        'word_isDigit': word.isdigit(),
    }

    if i > 0:
        prev_word = token_at(i - 1)
        features.update({
            'prev_word_lower': prev_word.lower(),
            'prev_word_isTitle': prev_word.istitle(),
            'prev_word_isUpper': prev_word.isupper(),
        })
    else:
        features['BOS'] = True

    if i < last:
        next_word = token_at(i + 1)
        features.update({
            'next_word_lower': next_word.lower(),
            'next_word_isTitle': next_word.istitle(),
            'next_word_isUpper': next_word.isupper(),
            'next_word_anyDigit': any(ch.isdigit() for ch in next_word),
            # Check every character: a plain `in string.punctuation` is a
            # substring test, which misses tokens such as '--' or '...'.
            # The key spelling is kept as-is because it is a feature name.
            'next_word_isPuctuation': bool(next_word) and all(
                ch in string.punctuation for ch in next_word
            ),
        })
    else:
        features['EOS'] = True

    if i > 1:
        # Nearest neighbour first (i-1, then i-2), i.e. reverse reading
        # order; kept so existing feature values stay unchanged.
        features['-2ngram'] = '{} {}'.format(token_at(i - 1), token_at(i - 2))

    if i < last - 1:
        features['+2ngram'] = '{} {}'.format(token_at(i + 1), token_at(i + 2))

    return features

class DataLoader:
    """
    Loader for the eBay product data.

    Expected columns:
    title (product title), brand, root_cat (root category of item)
    """
    def get_data(self, file_name='../data/train.csv'):
        """Read the CSV and attach token labels and per-token features.

        Args:
            file_name: Path of the CSV file to load.

        Returns:
            The loaded DataFrame with these columns added or replaced:
            ``origin_title`` (the title text as read), ``title`` (list of
            ``(token, label)`` pairs from ``branding``), ``features`` (one
            ``word2features`` dict per token) and ``labels`` (the label of
            each token, in order).
        """
        # Use the actual parameter name; read_csv has no `index` argument
        # (that option belongs to DataFrame.to_csv).
        df = pd.read_csv(file_name)

        # Preserve the raw text before 'title' is replaced by labeled tokens.
        df['origin_title'] = df['title'].values
        df['title'] = df.apply(branding, axis=1)
        df['features'] = df.apply(
            lambda row: [word2features(row, i) for i in range(len(row.title))],
            axis=1,
        )
        df['labels'] = df.apply(
            lambda row: [label for token, label in row.title],
            axis=1,
        )

        return df



In [8]:
# Load the raw training CSV into a DataFrame.
df = pd.read_csv('../data/train.csv')

In [10]:
# Keep a copy of the title text under a separate column name.
df['origin_title'] = df['title'].values

In [11]:
# Replace each title with the list of (token, label) pairs returned by branding().
# NOTE: not idempotent - branding() calls .lower() on the title, so re-running
# this cell after 'title' already holds token lists will fail.
df['title'] = df.apply(branding, axis=1)

In [13]:
# Spot-check the labeled tokens of row 1.
df['title'][1]

[('BRIO', 'BRAND'),
 ('Magnetic', '0'),
 ('Building', '0'),
 ('Wooden', '0'),
 ('Blocks', '0'),
 ('Toddler', '0'),
 ('Activity', '0'),
 ('Toy', '0'),
 ('Ages', '0'),
 ('1+', '0'),
 ('NEW', '0')]

In [15]:
# Title text of row 1 as stored in 'origin_title', for comparison.
df['origin_title'][1]

'BRIO Magnetic Building Wooden Blocks Toddler Activity Toy Ages 1+ NEW'

In [17]:
# Brand of row 1; branding() lower-cases both sides, so case need not match the title.
df['brand'][1]

'Brio'

In [22]:
# Build one feature dict per token for every row.
df['features'] = df.apply(
    lambda rec: [word2features(rec, pos) for pos in range(len(rec.title))],
    axis=1,
)

In [24]:
# Spot-check the per-token feature dicts of row 1.
df['features'][1]

[{'+2ngram': 'Magnetic Building',
  'BOS': True,
  'bias': 1,
  'next_word_anyDigit': False,
  'next_word_isPuctuation': False,
  'next_word_isTitle': True,
  'next_word_isUpper': False,
  'next_word_lower': 'magnetic',
  'root_cat': 2984,
  'word[-2:]': 'IO',
  'word_isDigit': False,
  'word_isTitle': False,
  'word_isUpper': True,
  'word_lower': 'brio',
  'word_position': 0},
 {'+2ngram': 'Building Wooden',
  'bias': 1,
  'next_word_anyDigit': False,
  'next_word_isPuctuation': False,
  'next_word_isTitle': True,
  'next_word_isUpper': False,
  'next_word_lower': 'building',
  'prev_word_isTitle': False,
  'prev_word_isUpper': True,
  'prev_word_lower': 'brio',
  'root_cat': 2984,
  'word[-2:]': 'ic',
  'word_isDigit': False,
  'word_isTitle': True,
  'word_isUpper': False,
  'word_lower': 'magnetic',
  'word_position': 1},
 {'+2ngram': 'Wooden Blocks',
  '-2ngram': 'Magnetic BRIO',
  'bias': 1,
  'next_word_anyDigit': False,
  'next_word_isPuctuation': False,
  'next_word_isTitle':

In [25]:
# Extract the label of every (token, label) pair, preserving token order.
df['labels'] = df.apply(lambda rec: [tag for _, tag in rec.title], axis=1)

In [26]:
# Display the per-row label sequences.
df['labels']

0                          [BRAND, 0, 0, 0, 0, 0, 0, 0, 0]
1                    [BRAND, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
2           [0, BRAND, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
3           [0, BRAND, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
4        [BRAND, BRAND, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
                               ...                        
29995                      [0, 0, 0, 0, 0, 0, 0, 0, BRAND]
29996    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, BRAND, BRAND...
29997                      [0, 0, 0, 0, 0, 0, 0, 0, BRAND]
29998                      [0, 0, 0, 0, 0, 0, 0, 0, BRAND]
29999                      [0, 0, 0, 0, 0, 0, 0, 0, BRAND]
Name: labels, Length: 30000, dtype: object

In [30]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state is fixed so the split
# is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    df['features'], df['labels'],
    test_size=0.2,
    random_state=123
)
# Index labels of the test rows, kept so they can be looked up in df later.
test_index = x_test.index

In [32]:
# Report the size of the train and test splits, one per line.
print(len(x_train), len(x_test), sep='\n')

24000
6000


In [31]:
# Original titles of the test rows, in test_index order, renumbered from 0.
df.loc[test_index, 'origin_title'].reset_index(drop=True)

0       CONTE A PARIS SKETCHING CRAYONS with Assorted ...
1       Brand New Replay Anbass Mens Jeans Slim Fit RR...
2       FOREVER DREAMING Ladies Womens Chunky Slipper ...
3       HP Compaq dc7100 Tower HDD Hard Disk Drive SAT...
4       ASOS LADIES BLUE WHITE STRIPED JERSEY PLAYSUIT...
                              ...                        
5995                  Cisco 72-4226-01 Cascade Cable 50cm
5996    Sea Gems Gift Boxed Genuine Crystal Set Rose &...
5997          Polar FT2 Heart Rate Monitor Blue Strap NEW
5998    Glorafilia Arts and Craft Floral Needlepoint T...
5999    Hidden Fashion Womens Mid Rise Animal Leopard ...
Name: origin_title, Length: 6000, dtype: object