# Importing Libraries

In [None]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from torch import nn



# Reading Datasets

In [None]:
# Read every name (one per line) from the corpus. The context manager
# guarantees the file handle is closed, even if reading raises.
with open('names.txt', 'r') as names_file:
    words = names_file.read().splitlines()

# Quick sanity checks: corpus size, longest word, and a small sample.
print(len(words))
# default=0 keeps this cell from crashing on an empty file.
print(max((len(w) for w in words), default=0))
print(words[:8])

# Creating Data Tensors

My aim here is to create an embedding for each character and feed those embeddings to an RNN. The embeddings themselves should be trained along with the network.

## Creating dictionaries for Characters

In [None]:
# Vocabulary layout: 'start' first (so it gets index 0), then every distinct
# character that occurs in `words` in sorted order, then the '.' and '!' tokens.
charset = ['start', *sorted(set(''.join(words))), '.', '!']

print(charset)

# Lookup tables in both directions: index -> token and token -> index.
itoc = dict(enumerate(charset))
ctoi = {token: index for index, token in enumerate(charset)}

vocab_size = len(charset)

In [None]:
import matplotlib.pyplot as plt

# Length of every word; this distribution is what the context-window size
# (maxlen) is chosen from.
sth = [len(w) for w in words]

# Horizontal boxplot: the word lengths run along the x-axis.
plt.boxplot(sth, vert=False)

# Label what is actually plotted. With vert=False the values are on the
# x-axis, so the previous generic 'Variable'/'Value' labels were on the wrong
# axes; the single box needs no y tick.
plt.xlabel('Word length (characters)')
plt.yticks([])
plt.title('Distribution of word lengths')

# Show the plot
plt.show()


## Experimenting with different algorithms to build the dataset

In [None]:
maxlen = 8
def create_dataset(words, input_size = maxlen):
	"""Build left-padded next-character training pairs.

	For every position `ix` of every word, the features are the indices of the
	(at most `input_size`) characters that precede `ix`, left-padded with 0,
	and the target is the index of the character at `ix`.

	Args:
		words: iterable of strings; every character must be a key of `ctoi`.
		input_size: number of context slots per sample.

	Returns:
		(tx, ty): a float32 tensor with one row of `input_size` context indices
		per sample, and an int32 tensor with one target index per sample.

	Raises:
		KeyError: if a character is not present in `ctoi`.
	"""
	X = []
	Y = []
	for word in words:
		# Encode once so that contexts hold indices, not raw characters
		# (strings cannot be converted to a tensor).
		indices = [ctoi[ch] for ch in word]

		for ix, target in enumerate(indices):
			# The most recent `input_size` characters before position ix.
			context = indices[max(0, ix - input_size):ix]
			# A fresh list per sample: appending one shared, mutated list
			# would make every row of X identical.
			X.append([0] * (input_size - len(context)) + context)
			Y.append(target)

	# NOTE: float32 is kept from the original draft; cast to an integer dtype
	# before using these values to index an embedding matrix.
	tx = torch.tensor(X, dtype=torch.float32)
	ty = torch.tensor(Y, dtype=torch.int32)

	return tx, ty

### V1: this version of create_dataset left-pads every context with 0 so that each sequence has exactly maxlen entries; a tad inefficient

* Time complexity is O(n), where n is the total number of characters across all words (treating the window size as a constant)

In [None]:
word = 'christopher'
#word = 'lucy'
input_size = 8
setup = [0] * input_size

for ix in range(len(word)):
	# Context = the last `input_size` characters before position ix.
	context = list(word[max(0, ix - input_size):ix])
	# Rebuild the window each step so it is always exactly `input_size` long,
	# left-padded with 0. The previous slice assignment
	# `setup[-(ix-1):] = word[:ix]` shrank the list to one element at ix=0 and
	# then let it grow without bound.
	setup = [0] * (input_size - len(context)) + context
	print(f"For ix: {ix}, the setup is: {setup} and target is: {word[ix]}")

### V2: This does it better...no padding required

In [None]:
word = 'christopher'
#word = 'lucy'
input_size = maxlen
setup = [0] * input_size
X = []
Y = []

# First sample: an all-padding context predicts the first character.
X.append([0] * maxlen)
Y.append(word[0])
for ix in range(1, len(word)):
	# The context is the last `maxlen` characters before position ix.
	# The start is computed as ix - maxlen; the earlier `ix % maxlen` form
	# only matched this while ix < 2 * maxlen and widened the window beyond
	# maxlen for longer words.
	context = word[max(0, ix - maxlen):ix]
	# Overwrite the trailing len(context) slots; any leading padding stays.
	setup[-len(context):] = context

	# Copy, because `setup` is mutated in place on the next iteration.
	X.append(setup.copy())
	Y.append(word[ix])

for x, y in zip(X, Y):
	print(x, y)

In [None]:

def create_dataset(words, input_size = maxlen):
	"""Build (context, target) pairs by overwriting the tail of one window.

	Each word contributes one sample per character: the context is the window
	of the `input_size` preceding character indices (0 where no character is
	available yet) and the target is the index of the current character.

	Args:
		words: iterable of strings; every character must be a key of `ctoi`.
		input_size: number of context slots per sample.

	Returns:
		(X, Y): plain Python lists. X holds one list of `input_size` integer
		indices per sample, Y the matching integer targets. Both can be
		passed to torch.tensor directly.

	Raises:
		KeyError: if a character is not present in `ctoi`.
	"""
	X = []
	Y = []

	for word in words:
		print("Processing word: ", word)
		# Encode up front so contexts contain indices only; mixing raw
		# characters with the integer padding cannot be tensorised.
		indices = [ctoi[ch] for ch in word]
		# Honour the `input_size` argument (the global maxlen was used before).
		setup = [0] * input_size

		# Iterating over the encoded word also makes empty words a no-op
		# instead of an IndexError on word[0].
		for ix, target in enumerate(indices):
			if ix > 0:
				# Last `input_size` indices before ix; `ix - input_size`
				# replaces the `ix % maxlen` arithmetic, which widened the
				# window once ix reached 2 * maxlen.
				context = indices[max(0, ix - input_size):ix]
				setup[-len(context):] = context

			# Copy, because `setup` is mutated in place on the next step.
			X.append(setup.copy())
			Y.append(target)

	return X, Y

# Smoke test: build the dataset for the first five words only.
X, Y = create_dataset(words[:5])

# Uncomment to inspect every (context, target) pair:
#for x, y in zip(X, Y):
#	print(x, y)


### V3: Final version, based on a cycling (sliding-window) approach

In [None]:
word = 'christopher'
#word = 'lucy'
input_size = maxlen
X, Y = [], []

# Start from an all-zero window and slide it one character at a time.
setup = [0 for _ in range(input_size)]
for ix, target in enumerate(word):
	# Record the current window together with the character it must predict.
	X += [setup]
	Y += [target]

	# Drop the oldest slot and push the character just seen. This builds a new
	# list, so the rows already stored in X are never modified afterwards.
	setup = [*setup[1:], target]

# Uncomment to inspect every (context, target) pair:
#for x, y in zip(X, Y):
#	print(x, y)

In [None]:

def create_dataset(words, input_size = maxlen):
	"""Build next-character samples with a sliding window over each word.

	Args:
		words: iterable of strings; every character must be a key of `ctoi`.
		input_size: number of context slots per sample.

	Returns:
		(tx, ty): a float32 tensor with one row of `input_size` context indices
		per sample (0 where no character is available yet), and an int32
		tensor with the index of the character to predict for each sample.

	Raises:
		KeyError: if a character is not present in `ctoi`.
	"""
	X = []
	Y = []

	for word in words:
		print("Processing word: ", word)
		# Every word starts from an all-padding window. X and Y are NOT reset
		# here: resetting them per word discarded every word but the last.
		setup = [0] * input_size

		for ch in word:
			target = ctoi[ch]
			X.append(setup)
			Y.append(target)

			# Slide the window; this creates a new list, so the row just
			# appended to X is left untouched.
			setup = setup[1:] + [target]

	# NOTE: float32 is kept from the original cell; cast to an integer dtype
	# before using these values to index an embedding matrix.
	tx = torch.tensor(X, dtype=torch.float32)
	ty = torch.tensor(Y, dtype=torch.int32)

	return tx, ty


# Smoke test: build the dataset for the first five words only.
X, Y = create_dataset(words[:5])

# Print every context row next to the target it should predict.
for x, y in zip(X, Y):
	print(x, y)

## Dataset builder

In [None]:
# Scratch check: what a word looks like with the '.' token appended.
# The result is only displayed; `word` itself is not modified.
word + '.'

In [None]:
def create_dataset(words, input_size = maxlen):
	"""Turn words into fixed-width context windows and next-token targets.

	A '.' is appended to each word, so the last sample of every word has '.'
	as its target. Windows start as all zeros and slide one character at a
	time.

	Args:
		words: iterable of strings; every character (and '.') must be a key
			of `ctoi`.
		input_size: number of context slots per sample.

	Returns:
		(tx, ty): int32 tensors holding the context windows (one row per
		sample) and the target index of each sample.
	"""
	contexts, targets = [], []

	for name in words:
		name += '.'
		window = [0] * input_size

		for ch in name:
			contexts.append(window)
			token = ctoi[ch]
			targets.append(token)

			# Shift left by one and append the token just emitted as a target.
			# A new list is built, so rows already stored are not changed.
			window = [*window[1:], token]

	return (
		torch.tensor(contexts, dtype=torch.int32),
		torch.tensor(targets, dtype=torch.int32),
	)

# Build the full training set: dx holds the context windows (character
# indices), dy the index of the character each window should predict.
dx, dy = create_dataset(words)

## Setting up Embedding Layer and feature/target tensors

In [None]:
# Settings for the embedding table.
generator_seed, embedding_dim = 19234123, 100
maxlen = 8 # context width, chosen from the word-length boxplot

# A dedicated, seeded generator makes the initial embeddings reproducible.
gen = torch.Generator()
gen.manual_seed(generator_seed)

# One row of `embedding_dim` values per vocabulary entry; gradients are
# tracked so the rows can be trained.
embedding_matrix = torch.randn(vocab_size, embedding_dim, generator=gen, requires_grad=True)

In [None]:
# Look up one embedding row for every context index in dx. Indexing the
# grad-enabled embedding_matrix keeps the result attached to it for backprop.
feature_tensor = embedding_matrix[dx]
# Targets are the next-character indices returned by create_dataset.
target_tensor = dy

In [None]:
# Prefer Apple's Metal (MPS) backend when it is available and fall back to the
# CPU otherwise, so this cell does not crash on machines without MPS support.
device = 'mps' if torch.backends.mps.is_available() else 'cpu'

feature_tensor = feature_tensor.to(device)
target_tensor = target_tensor.to(device)

In [None]:
# Sanity check: shape, dtype and autograd flag of the embedded features.
print(f"Important check for feature_tensor, shape: {feature_tensor.shape}, dtype: {feature_tensor.dtype}, requires grad: {feature_tensor.requires_grad}")

In [None]:
# Sanity check: shape, dtype and autograd flag of the targets.
print(f"Important check for target_tensor, shape: {target_tensor.shape}, dtype: {target_tensor.dtype}, requires grad: {target_tensor.requires_grad}")

# Moving on to Model Building

In [None]:
# Display the dimensions of the feature tensor before building the model.
feature_tensor.size()

## Phase 1: Using nn.Module (keeping things a tad simpler)

In [None]:
# Width of the RNN hidden state; passed to rnn(...) and used to size the
# initial (all-zero) hidden state in the training and sampling cells.
hidden_dim = 256

In [None]:
class Tanh(nn.Module):
	"""Element-wise hyperbolic tangent activation."""

	def __init__(self):
		super(Tanh, self).__init__()

	def forward(self, x):
		"""Return tanh(x) for a tensor `x` of any shape."""
		# The textbook form (e^{2x} - 1) / (e^{2x} + 1) overflows to inf / inf
		# = nan once e^{2x} exceeds the float range (x above roughly 44 in
		# float32). torch.tanh saturates cleanly at +/-1 instead.
		return torch.tanh(x)



class Softmax(nn.Module):
	"""Softmax along dimension 1, computed with the max-subtraction trick."""

	def __init__(self):
		super().__init__()

	def forward(self, logits):
		"""Return probabilities that sum to 1 along dim 1 of `logits`."""
		# Subtracting the per-row maximum leaves the result unchanged but keeps
		# every exponent <= 0, so exp() cannot overflow.
		row_max = logits.max(dim=1, keepdim=True).values
		weights = torch.exp(logits - row_max)

		return weights / weights.sum(dim=1, keepdim=True)





class rnn(nn.Module):
	"""One step of a vanilla RNN with a softmax read-out.

	Computes
		h_t = tanh(x_t @ wxh + h_{t-1} @ whh + bh)
		y_t = softmax(h_t @ who + bo)
	The caller loops over time steps and threads the hidden state through.

	Args:
		embedding_dim: size of each input vector x_t.
		vocab_size: number of output classes.
		hidden_dim: size of the hidden state.
	"""

	def __init__(self, embedding_dim, vocab_size = vocab_size, hidden_dim = 256):
		super(rnn, self).__init__()

		self.embedding_dim = embedding_dim
		self.hidden_dim = hidden_dim
		self.vocab_size = vocab_size

		self.tanh = Tanh()
		self.softmax = Softmax()

		# Parameters are created in the same order as before (wxh, bh, whh,
		# who, bo) so the sequence of random draws is unchanged.
		self.wxh = nn.Parameter(self._init_weight(self.embedding_dim, self.hidden_dim))
		self.bh = nn.Parameter(torch.randn(1, self.hidden_dim))

		self.whh = nn.Parameter(self._init_weight(self.hidden_dim, self.hidden_dim))

		self.who = nn.Parameter(self._init_weight(self.hidden_dim, self.vocab_size))
		self.bo = nn.Parameter(torch.randn(1, self.vocab_size))

	@staticmethod
	def _init_weight(n_in, n_out):
		"""Kaiming-normal weight of shape (n_in, n_out), scaled by 1/sqrt(n_in)."""
		# kaiming_normal_ reads fans as for nn.Linear's (out, in) layout: its
		# 'fan_in' is tensor.size(1). These weights are stored as (in, out) and
		# applied as `x @ W`, so the true fan-in is size(0), which is what
		# mode='fan_out' selects. The default mode scaled wxh and who by the
		# wrong dimension.
		return torch.nn.init.kaiming_normal_(
			torch.empty((n_in, n_out)), mode='fan_out', nonlinearity='linear'
		)

	def forward(self, x, hidden_state_prev=None):
		"""Advance the RNN by one time step.

		Args:
			x: input for this step; multiplied by wxh, so its last dimension
				must equal embedding_dim.
			hidden_state_prev: previous hidden state, or None to start from
				zeros on the same device as `x`.

		Returns:
			(output, hidden_state): class probabilities for this step and the
			new hidden state.
		"""
		if hidden_state_prev is None:
			hidden_state_prev = torch.zeros(x.shape[0], self.hidden_dim, device=x.device)

		inp_hid = x @ self.wxh + hidden_state_prev @ self.whh + self.bh
		hidden_state = self.tanh(inp_hid)

		inp_out = hidden_state @ self.who + self.bo
		output = self.softmax(inp_out)

		return output, hidden_state



def categorical_cce(y_pred, y_true):
	"""Mean categorical cross-entropy computed from probabilities.

	Args:
		y_pred: floating-point tensor of shape (n_samples, n_classes) holding
			class probabilities (e.g. a softmax output).
		y_true: integer tensor of shape (n_samples,) with the correct class
			index of each sample.

	Returns:
		A scalar tensor: the mean negative log-probability assigned to the
		correct classes.
	"""
	# Build both index tensors on y_pred's device and as int64, so indexing
	# does not depend on how a backend handles mixed devices or dtypes.
	rows = torch.arange(len(y_pred), device=y_pred.device)
	cols = y_true.to(device=y_pred.device, dtype=torch.long)
	picked = y_pred[rows, cols]

	# log(0) = -inf would turn the loss and every gradient into inf/nan.
	# Clamping to the smallest positive normal number only affects values that
	# are effectively zero.
	picked = picked.clamp_min(torch.finfo(picked.dtype).tiny)

	return -torch.log(picked).mean()



## Training Phase 1 model

In [None]:
epochs = 50
learning_rate = 0.01

# Train on whatever device the targets already live on, so the model, inputs
# and targets always agree.
device = target_tensor.device

myrnn = rnn(embedding_dim, hidden_dim=hidden_dim).to(device)
myrnn.train()

myrnn.zero_grad()

def grad_hook(grad):
    """Debug helper: print the gradient of the tensor it is registered on."""
    print("Gradient at custom activation:", grad)


for epoch in range(epochs):

	# Re-embed every epoch so the lookup sees the current embedding values and
	# the graph reaches back to embedding_matrix.
	x = embedding_matrix[dx].to(device)

	# Every sample starts from an all-zero hidden state.
	hidden_state = torch.zeros(x.shape[0], hidden_dim, device=x.device)

	# Unroll over the context window. The hidden state is passed on WITHOUT
	# detach(): detaching at every step cut the graph, so only the last
	# character contributed gradient and nothing was learned through time.
	for t in range(x.shape[1]):
		output, hidden_state = myrnn(x[:, t, :], hidden_state)

		#output.register_hook(grad_hook)

	# Only the prediction after the last context character is scored.
	loss = categorical_cce(output, target_tensor)

	loss.backward()

	# Plain SGD step, done outside autograd.
	with torch.no_grad():
		for param in myrnn.parameters():
			param -= learning_rate * param.grad
		# The embeddings are meant to be trained too; previously their
		# gradient accumulated but was never applied.
		embedding_matrix -= learning_rate * embedding_matrix.grad

	myrnn.zero_grad()
	embedding_matrix.grad = None

	print(f"Epoch: {epoch}, loss: {loss:.3f}")
	
		

	

# Sampling from trained model

In [None]:
myrnn.eval()

# Sample on the device the model lives on instead of hard-coding one.
sample_device = next(myrnn.parameters()).device
# Index of the '.' token that the dataset builder appends to every word
# (replaces the magic number 27).
end_index = ctoi['.']

# No gradients are needed while sampling.
with torch.no_grad():
	for names in range(10):

		# Each name starts fresh: index 0 as input, a zero hidden state and an
		# empty string. Previously `pred` and the hidden state carried over,
		# so every printed line contained all earlier names.
		model_input = torch.tensor([0] * 1)
		hidden_state = torch.zeros(1, hidden_dim, device=sample_device)
		pred = ''

		for _ in range(10):
			# view(1, -1) keeps one row of whatever width the embeddings have
			# (was hard-coded to 100).
			embedding_input = embedding_matrix[model_input].view(1, -1).to(sample_device)

			output, hidden_state = myrnn(embedding_input, hidden_state)
			#argmax_output = torch.argmax(output, dim=1).to('cpu').item()
			# Sample from the predicted distribution rather than taking argmax.
			argmax_output = torch.multinomial(output, num_samples=1).to('cpu').item()

			pred += itoc[argmax_output]
			if argmax_output == end_index:
				break

			model_input = torch.tensor([argmax_output])

		print(pred)

In [2]:
# Scratch cell: counts the distinct values in a hard-coded list of integers
# in which every value is listed twice. It uses nothing from the cells above.
len(set([103290, 103290, 391373, 391373, 408119, 408119, 469466, 469466, 488411, 488411, 535362, 535362, 538556, 538556, 600205, 600205, 609986, 609986, 636717, 636717, 651283, 651283, 689223, 689223, 696804, 696804, 699343, 699343, 728975, 728975, 792127, 792127, 794462, 794462, 814484, 814484, 821085, 821085, 828007, 828007, 833408, 833408, 840495, 840495, 2674440, 2674440, 2674750, 2674750, 2679101, 2679101, 2679164, 2679164, 2690521, 2690521, 2696644, 2696644, 2700803, 2700803, 2707116, 2707116, 2710708, 2710708, 2757225, 2757225, 2935358, 2935358]))

33