In [1]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torchvision.transforms as transforms
import torchvision.datasets as dsets
from torchtext import data

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import numpy as np
import pandas as pd
import random
import ast

In [2]:
train = {}
test = {}
X_train = []
X_train_vect = []
Y_train = []
X_test = []
X_test_vect = []
Y_test = []

pos_tags = ['JJ','JJR','JJS','NNP','NNPS','RB','RBR','RBS']
torch.cuda.set_device(0)
torch.cuda.get_device_name(0)

device = torch.device("cuda:0")

random.seed(42)

In [3]:
with open(r'data/male_tagged_train.txt', encoding = 'charmap') as mtagged_train:
    for line in mtagged_train:
        line = str(line.strip())
        line = ast.literal_eval(line)
        train[tuple(line)] = 0

with open(r'data/male_tagged_test.txt', encoding = 'charmap') as mtagged_test:
    for line in mtagged_test:
        line = str(line.strip())
        line = ast.literal_eval(line)
        test[tuple(line)] = 0

with open(r'data/female_tagged_train.txt', encoding = 'charmap') as ftagged_train:
    for line in ftagged_train:
        line = str(line.strip())
        line = ast.literal_eval(line)
        train[tuple(line)] = 1
        
with open(r'data/female_tagged_test.txt', encoding = 'charmap') as ftagged_test:
    for line in ftagged_test:
        line = str(line.strip())
        line = ast.literal_eval(line)
        test[tuple(line)] = 1

In [4]:
train_keys = list(train.keys())
random.shuffle(train_keys)
train_set =  [(key, train[key]) for key in train_keys]


test_keys = list(test.keys())
random.shuffle(test_keys)
test_set= [(key, test[key]) for key in test_keys]

In [5]:
def str_to_vect(x):
    pos_dict = {key : 0 for key in pos_tags}
    tag_list = []
    for i in range(len(x)):
        temp = x[i][1]
        tag_list.append(temp)
    for j in range(len(tag_list)):
        if str(tag_list[j]) in list(pos_dict.keys()):
            pos_dict[tag_list[j]] += 1
    
    pos_vector = list(pos_dict.values())
    total = sum(pos_vector)
    pos_prop_vector = []
    if total != 0:
        for k in range(len(pos_vector)):
            pos_prop_vector.append(pos_vector[k] / total)

        out = np.array(pos_prop_vector)
    else:
        out = np.zeros((8,))

    return out

In [6]:
for i in range(len(train_set)):
    X_train.append(train_set[i][0])
    Y_train.append(train_set[i][1])

for j in range(len(test_set)):
    X_test.append(test_set[j][0])
    Y_test.append(test_set[j][1])

In [7]:
'''x_indices = []
for i in range(len(X_train)):
    if(len(X_train[i]) > 0):
        answer = True 
    else:
        x_indices.append(i)

for index in sorted(x_indices, reverse=True):
    del X_train[index]
    del Y_train[index]

y_indices = []
for i in range(len(X_test)):
    if(len(X_test[i]) > 0):
        answer = True
    else:
        y_indices.append(i)

for index in sorted(y_indices, reverse=True):
    del X_test[index]
    del Y_test[index]'''

'x_indices = []\nfor i in range(len(X_train)):\n    if(len(X_train[i]) > 0):\n        answer = True \n    else:\n        x_indices.append(i)\n\nfor index in sorted(x_indices, reverse=True):\n    del X_train[index]\n    del Y_train[index]\n\ny_indices = []\nfor i in range(len(X_test)):\n    if(len(X_test[i]) > 0):\n        answer = True\n    else:\n        y_indices.append(i)\n\nfor index in sorted(y_indices, reverse=True):\n    del X_test[index]\n    del Y_test[index]'

In [8]:
for i in range(len(X_train)):
    X_train_vect.append(str_to_vect(X_train[i]))

for j in range(len(X_test)):
    X_test_vect.append(str_to_vect(X_test[j]))

In [9]:
df = pd.DataFrame()

In [10]:
X = X_train_vect + X_test_vect
Y = Y_train + Y_test
df['pos'] = X_train_vect 
df['target'] = Y_train 

In [11]:
Y.count(1)

96112

In [12]:
def get_top_data(top_n = 30000):
    top_data_df_male = df[df['target'] == 0].head(top_n)
    top_data_df_female = df[df['target'] == 1].head(top_n)
    data_df_small = pd.concat([top_data_df_male, top_data_df_female])
    return data_df_small

In [13]:
top_data_df_small = get_top_data(top_n=114250)

In [14]:
from sklearn.model_selection import train_test_split
def split_train_test(top_data_df_small, test_size=0.2, shuffle_state=True):
    X_train_vect, X_test_vect, Y_train, Y_test = train_test_split(top_data_df_small[['pos']], 
                                                        top_data_df_small['target'], 
                                                        shuffle=shuffle_state,
                                                        test_size=test_size, 
                                                        random_state=15)
    print("Value counts for Train genders")
    print(Y_train.value_counts())
    print("Value counts for Test genders")
    print(Y_test.value_counts())
    print(type(X_train_vect))
    print(type(Y_train))
    X_train_vect = X_train_vect.reset_index()
    X_test_vect = X_test_vect.reset_index()
    Y_train = Y_train.to_frame()
    Y_train = Y_train.reset_index()
    Y_test = Y_test.to_frame()
    Y_test = Y_test.reset_index()
    print(X_train_vect.head())
    return X_train_vect, X_test_vect, Y_train, Y_test

# Call the train_test_split
X_train_vect, X_test_vect, Y_train, Y_test = split_train_test(top_data_df_small)

Value counts for Train genders
1    61045
0    59261
Name: target, dtype: int64
Value counts for Test genders
1    15166
0    14911
Name: target, dtype: int64
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
    index                                       pos
0   57701  [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]
1   85462  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
2  101770  [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]
3   53504  [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]
4  116594  [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]


In [15]:
test_df = pd.DataFrame()

In [16]:
print(Y_train)

         index  target
0        57701       1
1        85462       1
2       101770       0
3        53504       0
4       116594       1
...        ...     ...
120301   72059       0
120302   84269       1
120303   60468       1
120304   16270       0
120305   15394       0

[120306 rows x 2 columns]


In [17]:
test_df['pos'] = X_test_vect['pos']
test_df['target'] = Y_test['target']

In [18]:
test_df.head(10)

Unnamed: 0,pos,target
0,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",1
1,"[0.0, 0.0, 0.0, 0.5, 0.0, 0.5, 0.0, 0.0]",0
2,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]",0
3,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]",1
4,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",1
5,"[0.5, 0.0, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0]",0
6,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",1
7,"[0.0, 0.0, 0.0, 0.6666666666666666, 0.0, 0.333...",1
8,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",0
9,"[0.5, 0.0, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0]",1


In [19]:
train_df = pd.DataFrame()
train_df['pos'] = X_train_vect['pos']
train_df['target'] = Y_train['target']

In [20]:
train_df.head(50)

Unnamed: 0,pos,target
0,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]",1
1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",1
2,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",0
3,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",0
4,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",1
5,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",0
6,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",0
7,"[0.0, 0.0, 0.0, 0.25, 0.0, 0.75, 0.0, 0.0]",1
8,"[0.14285714285714285, 0.0, 0.0, 0.285714285714...",1
9,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]",0


In [21]:
SEED = 42

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [22]:
print(len(train_df))
print(len(test_df))

120306
30077


In [23]:
train_x = train_df['pos'].to_numpy()
train_x = np.stack(train_x)
train_x_shape = train_x.shape
#train_x = train_x.reshape((1, train_x_shape[0], train_x_shape[1]))
#train_x.reshape(train_x_shape[0], train_x_shape[1], 1)
train_x = train_x.astype('int64')
train_x = torch.Tensor(train_x)
train_x.type(torch.cuda.LongTensor)
train_x = train_x.unsqueeze(dim=2)
train_x = train_x.float()
train_x = train_x.cuda()

train_y = train_df['target'].to_numpy()
train_y = np.stack(train_y)
train_y_shape = train_y.shape
#train_y.reshape(1, train_y_shape[0], train_y_shape[1])
train_y = train_y.astype('int64')
train_y = torch.Tensor(train_y)
train_y.type(torch.cuda.LongTensor)
train_y = train_y.long()
train_y = train_y.cuda()

test_x = test_df['pos'].to_numpy()
test_x = np.stack(test_x)
test_x_shape = test_x.shape
#test_x = test_x.reshape((1, test_x_shape[0], test_x_shape[1]))
#test_x.reshape(test_x_shape[0], test_x_shape[1], 1)
test_x = test_x.astype('int64')
test_x = torch.Tensor(test_x)
test_x.type(torch.cuda.LongTensor)
test_x = test_x.unsqueeze(dim=2)
test_x = test_x.float()
test_x = test_x.cuda()

test_y = test_df['target'].to_numpy()
test_y = np.stack(test_y)
test_y_shape = test_y.shape
#test_y.reshape(1, test_y_shape[0], test_y_shape[1])
test_y = test_y.astype('int64')
test_y = torch.Tensor(test_y)
test_y = test_y.long()
test_y = test_y.cuda()

In [24]:
print(train_x.type())
print(train_y.type())

torch.cuda.FloatTensor
torch.cuda.LongTensor


In [25]:
print(train_x)

tensor([[[0.],
         [0.],
         [0.],
         ...,
         [1.],
         [0.],
         [0.]],

        [[0.],
         [0.],
         [0.],
         ...,
         [0.],
         [0.],
         [0.]],

        [[0.],
         [0.],
         [0.],
         ...,
         [0.],
         [0.],
         [0.]],

        ...,

        [[1.],
         [0.],
         [0.],
         ...,
         [0.],
         [0.],
         [0.]],

        [[0.],
         [0.],
         [0.],
         ...,
         [0.],
         [0.],
         [0.]],

        [[0.],
         [0.],
         [0.],
         ...,
         [0.],
         [0.],
         [0.]]], device='cuda:0')


In [26]:
print(train_x.shape)
print(train_y.shape)

torch.Size([120306, 8, 1])
torch.Size([120306])


In [27]:
print(test_x.shape)
print(test_y.shape)

torch.Size([30077, 8, 1])
torch.Size([30077])


In [28]:
train = torch.utils.data.TensorDataset(train_x, train_y)
train_dataset = torch.utils.data.DataLoader(train, batch_size = 200, shuffle = True)

test = torch.utils.data.TensorDataset(test_x, test_y)
test_dataset = torch.utils.data.DataLoader(test, batch_size = 200, shuffle = False)

In [29]:
print(train_dataset)
print(test_dataset)
print(len(train_dataset.dataset))

<torch.utils.data.dataloader.DataLoader object at 0x00000197C61E5F48>
<torch.utils.data.dataloader.DataLoader object at 0x00000197C61E5048>
120306


In [30]:
batch_size = 200
n_iters = 3000
num_epochs = n_iters / (len(train_dataset) / batch_size)
num_epochs = int(num_epochs)

In [31]:
print(num_epochs)

996


In [32]:
class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super(LSTMModel, self).__init__()
    
        self.hidden_dim = hidden_dim

        self.layer_dim = layer_dim

        self.lstm = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True).to(device)

        self.fc = nn.Linear(hidden_dim, output_dim).to(device)

    def forward(self, x):
        h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_().to(device)

        c0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_().to(device)

        out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))
 
        out = self.fc(out[:, -1, :]) 
        return out

In [33]:
input_dim = 1
hidden_dim = 10
layer_dim = 1
output_dim = 2

In [34]:
model = LSTMModel(input_dim, hidden_dim, layer_dim, output_dim)
model.cuda()

LSTMModel(
  (lstm): LSTM(1, 10, batch_first=True)
  (fc): Linear(in_features=10, out_features=2, bias=True)
)

In [35]:
criterion = nn.CrossEntropyLoss().cuda()

In [36]:
learning_rate = 0.01

optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [37]:
len(list(model.parameters()))

6

In [38]:
for i in range(len(list(model.parameters()))):
    print(list(model.parameters())[i].size())

torch.Size([40, 1])
torch.Size([40, 10])
torch.Size([40])
torch.Size([40])
torch.Size([2, 10])
torch.Size([2])


In [39]:
# Number of steps to unroll
seq_dim = 1

iter = 0
for epoch in range(num_epochs):
    for i, (vectors, labels) in enumerate(train_dataset):
        vectors = vectors.requires_grad_()
        vectors = vectors.cuda()
        # Clear gradients w.r.t. parameters
        optimizer.zero_grad()

        # Forward pass to get output/logits
        # outputs.size() --> 100, 10
        
        outputs = model(vectors)
        outputs = outputs.to(device = device)
        outputs = outputs.cuda()
        
        
        labels = labels.cuda()
        labels = labels.to(device = device)
        
        # Calculate Loss: softmax --> cross entropy loss
        loss = criterion(outputs, labels).cuda()

        # Getting gradients w.r.t. parameters
        loss.backward()

        # Updating parameters
        optimizer.step()

        iter += 1

        if iter % 500 == 0:
            # Calculate Accuracy         
            correct = 0
            total = 0
            # Iterate through test dataset
            for i, (vectors, labels) in enumerate(test_dataset):
                # Forward pass only to get logits/output
                outputs = model(vectors)
                outputs = outputs.to(device = device)
                outputs = outputs.cuda()

                # Get predictions from the maximum value
                _, predicted = torch.max(outputs.data, 1)

                # Total number of labels
                total += labels.size(0)

                # Total correct predictions
                correct += (predicted == labels).sum()

            accuracy = 100 * correct / total

            # Print Loss
            print('Iteration: {}. Loss: {}. Accuracy: {}'.format(iter, loss.item(), accuracy))



Iteration: 500. Loss: 0.6953268647193909. Accuracy: 50
Iteration: 1000. Loss: 0.6922966837882996. Accuracy: 50
Iteration: 1500. Loss: 0.6935392618179321. Accuracy: 50
Iteration: 2000. Loss: 0.6929185390472412. Accuracy: 50
Iteration: 2500. Loss: 0.6928783655166626. Accuracy: 50
Iteration: 3000. Loss: 0.6939647197723389. Accuracy: 50
Iteration: 3500. Loss: 0.6938615441322327. Accuracy: 50
Iteration: 4000. Loss: 0.6934925317764282. Accuracy: 50
Iteration: 4500. Loss: 0.6957478523254395. Accuracy: 50
Iteration: 5000. Loss: 0.6931054592132568. Accuracy: 50
Iteration: 5500. Loss: 0.6921339631080627. Accuracy: 50
Iteration: 6000. Loss: 0.6917015314102173. Accuracy: 50
Iteration: 6500. Loss: 0.6938381195068359. Accuracy: 50
Iteration: 7000. Loss: 0.6945538520812988. Accuracy: 50


KeyboardInterrupt: 