In [1]:
import mltable

In [2]:
# Load the MLTable stored under ../data; the path is resolved against the
# kernel's working directory.
tbl = mltable.load('../data')

In [3]:
# Preview the table's contents before converting it to pandas.
tbl.show()

Unnamed: 0,age,location,orgsz,style,yoe,projects,accept
0,34.0,South America,751.0,spaces,19.0,26.0,yes
1,36.0,Oceania,620.0,spaces,17.0,13.0,no
2,27.0,Asia,533.0,spaces,16.0,11.0,no
3,36.0,North America,332.0,tabs,8.0,6.0,no
4,35.0,Oceania,402.0,spaces,14.0,10.0,no
5,36.0,South America,340.0,tabs,10.0,6.0,no
6,40.0,Oceania,508.0,spaces,15.0,8.0,no
7,44.0,North America,629.0,tabs,9.0,5.0,no
8,34.0,North America,688.0,tabs,7.0,4.0,yes
9,37.0,Oceania,531.0,spaces,14.0,9.0,no


In [4]:
# Materialise the table as a pandas DataFrame; the cells below derive the
# category vocabularies and numeric ranges from it.
df = tbl.to_pandas_dataframe()
# Leave the preview as the cell's last expression so the notebook renders it
# with its rich DataFrame display instead of the plain text print() forces.
df.head(20)

     age       location  orgsz   style   yoe  projects accept
0   34.0  South America  751.0  spaces  19.0      26.0    yes
1   36.0        Oceania  620.0  spaces  17.0      13.0     no
2   27.0           Asia  533.0  spaces  16.0      11.0     no
3   36.0  North America  332.0    tabs   8.0       6.0     no
4   35.0        Oceania  402.0  spaces  14.0      10.0     no
5   36.0  South America  340.0    tabs  10.0       6.0     no
6   40.0        Oceania  508.0  spaces  15.0       8.0     no
7   44.0  North America  629.0    tabs   9.0       5.0     no
8   34.0  North America  688.0    tabs   7.0       4.0    yes
9   37.0        Oceania  531.0  spaces  14.0       9.0     no
10  29.0        Oceania  718.0  spaces  16.0       8.0     no
11  40.0  South America  157.0    tabs  10.0       5.0     no
12  36.0  South America  112.0  spaces  17.0      10.0     no
13  42.0  North America  362.0    tabs   8.0       4.0     no
14  31.0  North America  427.0    tabs  10.0       5.0     no
15  37.0

In [33]:
# Give every distinct location an integer id, numbered in order of first
# appearance and keyed by its lower-cased name.
locations = df['location'].unique()
loc_to_ix = {name.lower(): ix for ix, name in enumerate(locations)}
loc_to_ix

{'south america': 0,
 'oceania': 1,
 'asia': 2,
 'north america': 3,
 'europe': 4,
 'africa': 5,
 'antartica': 6}

In [34]:
# Same encoding for the indentation style column: lower-cased name -> id,
# numbered in order of first appearance.
styles = df['style'].unique()
style_to_ix = {name.lower(): ix for ix, name in enumerate(styles)}
style_to_ix

{'spaces': 0, 'tabs': 1}

In [59]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

In [43]:
# One learnable vector per category id: 4 dimensions per location,
# 2 dimensions per style. Table sizes follow the vocabularies built above.
lemb = nn.Embedding(len(loc_to_ix), 4)
semb = nn.Embedding(len(style_to_ix), 2)

In [46]:
# Look up embeddings for a toy batch of two examples
# (oceania/tabs and europe/spaces).
lx = lemb(torch.tensor([loc_to_ix['oceania'], loc_to_ix['europe']]))
sx = semb(torch.tensor([style_to_ix['tabs'], style_to_ix['spaces']]))

In [47]:
# Show both lookups side by side: two rows each, 4 and 2 columns respectively.
lx, sx

(tensor([[-0.9111, -1.0998, -1.6127, -0.4499],
         [ 0.7109,  1.5743,  1.2630,  1.7974]], grad_fn=<EmbeddingBackward0>),
 tensor([[ 0.3654, -0.5149],
         [-0.7980,  0.7376]], grad_fn=<EmbeddingBackward0>))

In [48]:
# Join the two embeddings column-wise, giving one 6-value feature row per example.
torch.cat((lx, sx), dim=1)

tensor([[-0.9111, -1.0998, -1.6127, -0.4499,  0.3654, -0.5149],
        [ 0.7109,  1.5743,  1.2630,  1.7974, -0.7980,  0.7376]],
       grad_fn=<CatBackward0>)

In [57]:
# Inspect the raw (unscaled) numeric fields of the first record.
row = df.iloc[0]
row['age'], row['orgsz'], row['yoe'], row['projects']

(34.0, 751.0, 19.0, 26.0)

In [178]:
class ProgrammerDataset(Dataset):
    """Map-style dataset over the programmer table loaded from an MLTable.

    Indexing returns a 4-tuple of tensors:

    * location id (integer scalar),
    * style id (integer scalar),
    * min-max scaled numeric features ``[age, orgsz, yoe, projects]`` (float32),
    * label: 1 when ``accept`` equals ``'yes'``, otherwise 0 (integer scalar).

    Attributes:
        df: The pandas DataFrame obtained from the table at ``path``.
        transforms: Per-column encoding, keyed by column name. A ``dict`` maps a
            lower-cased category to its id, a ``[min, max]`` list drives min-max
            scaling, and a plain value is the positive label.
    """

    # Column roles. The tuple order is the order of the returned features.
    CATEGORICAL = ('location', 'style')
    NUMERIC = ('age', 'orgsz', 'yoe', 'projects')
    LABEL = 'accept'
    POSITIVE = 'yes'

    def __init__(self, path: str):
        """Load the table at ``path`` and derive the column transforms from it."""
        tbl = mltable.load(path)
        self.df = tbl.to_pandas_dataframe()
        # Use the frame loaded here, never a notebook-level ``df``: otherwise the
        # class fails on a fresh kernel and reports statistics of another table.
        self.transforms = self._build_transforms(self.df)

    @staticmethod
    def _vocabulary(values):
        """Map each lower-cased value to a dense id in order of first appearance."""
        vocab = {}
        for value in values:
            # setdefault keeps ids contiguous even when two raw values differ
            # only in case, so every id stays below len(vocab).
            vocab.setdefault(value.lower(), len(vocab))
        return vocab

    @classmethod
    def _build_transforms(cls, df):
        """Build the ``transforms`` mapping for ``df`` (categoricals, numerics, label)."""
        transforms = {}
        for col in cls.CATEGORICAL:
            transforms[col] = cls._vocabulary(df[col].unique())
        for col in cls.NUMERIC:
            transforms[col] = [df[col].min(), df[col].max()]
        transforms[cls.LABEL] = cls.POSITIVE
        return transforms

    def get(self, row, col: str):
        """Encode ``row[col]`` according to the transform registered for ``col``.

        Returns the category id for a dict transform, the min-max scaled value
        for a ``[min, max]`` transform, and 1/0 for the label transform.

        Raises:
            KeyError: If ``col`` has no transform, or a categorical value is not
                in the vocabulary.
        """
        xform = self.transforms[col]
        if isinstance(xform, dict):
            return xform[row[col].lower()]
        if isinstance(xform, list):
            low, high = xform
            span = high - low
            if span == 0:
                # Constant column: scaling is undefined, so emit 0.0 instead of
                # dividing by zero.
                return 0.0
            return (row[col] - low) / span
        return 1 if xform == row[col] else 0

    def __len__(self):
        """Number of records in the underlying frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(location_id, style_id, numeric_features, label)`` for record ``idx``."""
        row = self.df.iloc[idx]

        # Select columns by role rather than by position in ``transforms``.
        location_ix, style_ix = (self.get(row, col) for col in self.CATEGORICAL)
        numeric = [self.get(row, col) for col in self.NUMERIC]
        label = self.get(row, self.LABEL)
        return (
            torch.tensor(location_ix),
            torch.tensor(style_ix),
            torch.tensor(numeric, dtype=torch.float32),
            torch.tensor(label),
        )

In [179]:
# Build the dataset from the table under ../data and inspect the per-column
# transforms it derived (category ids, [min, max] ranges, positive label).
dataset = ProgrammerDataset('../data')
dataset.transforms

{'location': {'south america': 0,
  'oceania': 1,
  'asia': 2,
  'north america': 3,
  'europe': 4,
  'africa': 5,
  'antartica': 6},
 'style': {'spaces': 0, 'tabs': 1},
 'age': [11.0, 58.0],
 'orgsz': [-495.0, 1371.0],
 'yoe': [5.0, 24.0],
 'projects': [0.0, 40.0],
 'accept': 'yes'}

In [180]:
# Encoded form of record 1: (location id, style id, scaled numeric features, label).
dataset[1]

(tensor(1), tensor(0), tensor([0.5319, 0.5975, 0.6316, 0.3250]), tensor(0))

In [181]:
from torch import nn
import torch.nn.functional as F

In [187]:
class ProgrammerModel(nn.Module):
    """Logistic-regression style classifier over embedded categoricals and numerics.

    The location and style ids are embedded and concatenated with the numeric
    features. A single linear layer followed by a sigmoid then gives one
    probability per example.

    Args:
        n_locations: Number of distinct location ids (rows of ``embed1``).
        n_styles: Number of distinct style ids (rows of ``embed2``).
        loc_dim: Width of each location embedding.
        style_dim: Width of each style embedding.
        n_numeric: Number of numeric features passed alongside the ids.

    The defaults reproduce the original fixed architecture: 7 locations x 4,
    2 styles x 2, plus 4 numeric features, giving 10 inputs to ``fc1``.
    """

    def __init__(self, n_locations: int = 7, n_styles: int = 2,
                 loc_dim: int = 4, style_dim: int = 2, n_numeric: int = 4):
        super().__init__()
        self.embed1 = nn.Embedding(n_locations, loc_dim)
        self.embed2 = nn.Embedding(n_styles, style_dim)
        # Derive the input width from its parts so it cannot drift out of sync.
        self.fc1 = nn.Linear(loc_dim + style_dim + n_numeric, 1)

    def forward(self, x):
        """Return the sigmoid score for each example.

        Args:
            x: Indexable whose first three items are the location ids, the style
                ids and the numeric features. Any further items, such as the
                label a DataLoader batch carries, are ignored.

        Returns:
            Tensor whose last dimension has size 1, with values in (0, 1).
        """
        e1 = self.embed1(x[0])
        e2 = self.embed2(x[1])
        # Concatenate on the last axis: identical to dim=1 for batched input and
        # also valid for a single unbatched sample.
        features = torch.cat((e1, e2, x[2]), dim=-1)
        return torch.sigmoid(self.fc1(features))

In [188]:
# Untrained model with freshly initialised weights.
model = ProgrammerModel()

In [189]:
# Serve the dataset in shuffled mini-batches of 6 records.
training = DataLoader(dataset, batch_size=6, shuffle=True)

In [190]:
# Pull a single mini-batch from the loader to smoke-test the model below.
x = next(iter(training))

In [191]:
# Call the module itself rather than .forward(): nn.Module.__call__ runs any
# registered hooks and then dispatches to forward().
model(x)

tensor([[0.5298],
        [0.5928],
        [0.5225],
        [0.5910],
        [0.5386],
        [0.6814]], grad_fn=<SigmoidBackward0>)