In [80]:
## Authors: Raja Batra And Eli Rejto
## October 19, 2023

In [165]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
import numpy as np
import re

# Part 1: Basic System with fixed-length inputs

### Convert text to lowercase and remove all punctuation except “.” so the data only contains alphabet characters, whitespace, and periods

In [24]:
def englishsplitintosentences(text):
    sentences = re.split(r'=', text)
    return sentences
    
def spanishsplitintosentences(text):
    sentences = re.findall(r'\*(.*?)\#', text)
    return sentences

def preprocesssentence(sentence):
    sentence = sentence.lower()  # Convert to lowercase
    sentence = ''.join(char if char.isalpha() or char.isspace() or char == '.' else ' ' for char in sentence)  # Remove punctuation
    return sentence

In [30]:
# Apply preprocessing to all English sentences
english_file_path = 'dataforbuildingmodel/someenglishtext'
with open(english_file_path, 'r', encoding='latin-1') as file:
    english_text = file.read()

english_sentences = splitintosentences(english_text)


english_sentences = [preprocesssentence(sentence) for sentence in english_sentences]


In [31]:
print(english_sentences[0:3])

[' \n ', ' valkyria chronicles iii ', ' \n \n senjå  no valkyria      unk  chronicles   japanese   æ  å\xa0 ã  ã  ã  ã  ã  ã  ã ªã      lit . valkyria of the battlefield       commonly referred to as valkyria chronicles iii outside japan   is a tactical role     playing video game developed by sega and media.vision for the playstation portable . released in january      in japan   it is the third game in the valkyria series . employing the same fusion of tactical and real     time gameplay as its predecessors   the story runs parallel to the first game and follows the   nameless     a penal military unit serving the nation of gallia during the second europan war who perform secret black operations and are pitted against the imperial unit    unk  raven   . \n the game began development in        carrying over a large portion of the work done on valkyria chronicles ii . while it retained the standard features of the series   it also underwent multiple adjustments   such as making the gam

In [37]:
# Apply preprocessing to all Spanish sentences
spanish_file_path = 'dataforbuildingmodel/somespanishtext'
with open(spanish_file_path, 'r', encoding='latin-1') as file:
    spanish_text = file.read()
spanish_sentences = spanishsplitintosentences(spanish_text)


spanish_sentences = [preprocess_sentence(sentence) for sentence in spanish_sentences]

    

In [39]:
print(spanish_sentences[0:10])

['la enciclopedia libre jorge hess de wikipedia', 'la enciclopedia libre saltar a jorge hess de julio es un y cofundador de la liga argentina de esperanto hess escribiï   un manual para el aprendizaje de esperanto que fue editado por primera vez en y se titula sabe usted esperanto', 'es uno de los mï  s conocidos libros en espaï  ol que tratan sobre el tema junto con curso prï  ctico de esperanto ferenc szilï  gyi', 'el cual hess adaptï   para los en', 'jorge hess tambiï  n compilï   la obra papeles de wappers y fue algunas veces redactor de argentina esperantisto una antigua revista mensual', 'sobre su libro sabe usted esperanto estï   diseï  ado para el de dicho idioma a la manera tradicional', 'lecciones para repetir en voz alta capï  tulos breves con preguntas e ilustraciones de carlos wappers', 'pero sin demasiadas explicaciones gramaticales', 'es un curso bï  sico para principiantes', 'los ejercicios son atrayentes']


In [46]:
processedspanishtext = ' '.join(spanish_sentences)

processedenglishtext = ' '.join(english_sentences)

### Determine a set of unique characters and map all characters to integers

In [48]:
unique_chars = sorted(set(processedenglishtext + processedspanishtext))

In [49]:
print(unique_chars)

['\n', ' ', '.', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\xa0', 'ª', 'µ', 'á', 'â', 'ã', 'å', 'æ', 'î', 'ï']


### map characters to integers

In [110]:

chardict = {char: idx for idx, char in enumerate(unique_chars)}


def texttoint(text, chardict):
    mappedtext = []
    for char in text:
        mappedtext.append(chardict[char])
    return mappedtext

mappedspanishtext = texttoint(processedspanishtext, chardict)
mappedenglishtext = texttoint(processedenglishtext, chardict)
# Create labels for English and Spanish data
#english_labels = [0] * len(mappedenglishtext)
#spanish_labels = [1] * len(mappedspanishtext)

##combine all data
#mappedtext = mappedspanishtext + mappedenglishtext
#alllabels = english_labels + spanish_labels
#print(len(english_labels))
#print(len(mappedenglishtext))
  

361048
361048


### splitting into training and validation data and making chunks within data

In [None]:
chunk_length = 100  
trainenglishchunk = [mappedenglishtext for i in range(0, len(train_english), chunk_length)]
trainingenglishlabels = [0] * len(trainenglishchunk)
trainspanishchunk = [mappedenglishtext for i in range(0, len(train_spanish), chunk_length)]
trainingspanishlabels = [1] * len(trainspanishchunk)

valenglishchunk = [mappedenglishtext for i in range(0, len(val_english), chunk_length)]
valenglishlabel = [0] * len(valenglishchunk)
valspanishchunk = [mappedenglishtext for i in range(0, len(val_spanish), chunk_length)]
valspanishlabel = [1] * len(valspanishchunk)

In [190]:


train_data = trainenglishchunk + trainspanishchunk



train_labels = trainingenglishlabels + trainingspanishlabels





val_data = valenglishchunk+valspanishchunk
train_labels = valenglishlabel+valspanishlabel



In [201]:
train_loader = DataLoader((train_data, train_labels), batch_size=batch_size, shuffle=True)
valid_loader = DataLoader((val_data, train_data), batch_size=batch_size, shuffle=False)

## Train 1 Layer Model

In [195]:
#loosely based on https://www.kaggle.com/code/mehmetlaudatekman/lstm-text-classification-pytorch
class LanguageClassificationModel(nn.Module):
    def __init__(self, input_size, embeddingdim, hidden_size, num_classes):
        super(LanguageClassificationModel, self).__init__()
        self.embedding = nn.Embedding(input_size, embeddingdim)
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.linear = nn.Linear(hidden_size, num_classes)
        

  

    def forward(self, x):
        
        embedded = self.embedding(x)
        embedded_swapped = torch.swapaxes(embedded,0,1)
        lstm_out, state_out = self.lstm(embedded_swapped)
        output = self.linear(lstm_out[:, -1, :])
        output_swapped = torch.swapaxes(output,0,1)

        return output_swapped,state_out

In [196]:
# hyperparameters for parameter
input_size = len(chardict)  
hidden_size = 32
num_classes = 2  # English and Spanish
learning_rate = 0.001
dropout = 0.8
batch_size = 32
embedding_dim = 100
num_epochs = 10
#TensorDataset(torch.stack(train_sequences), train_labels)



In [154]:
model = LanguageClassificationModel(input_size, embedding_dim, hidden_size, num_classes)

In [155]:
print(model)

LanguageClassificationModel(
  (lstm): LSTM(39, 32, batch_first=True)
  (linear): Linear(in_features=32, out_features=2, bias=True)
)


In [197]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
print(train_loader)

<torch.utils.data.dataloader.DataLoader object at 0x7fb8cd4cead0>


In [206]:
for batch_x, batch_y in train_loader:
    # batch_x is a batch of data
    # batch_y is a batch of labels
    print("Batch of data:")
    print(batch_x)
    print("Batch of labels:")
    print(batch_y)
    print("Shape of data batch:", batch_x.shape)
    print("Shape of labels batch:", batch_y.shape)

RuntimeError: each element in list of batch should be of equal size

In [204]:
for epoch in range(num_epochs):
    model.train()
  
    running_loss = 0.0
    for batch_x, batch_y in train_loader:
        
        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Print the average loss for the epoch
    print(f'Epoch [{epoch + 1}/{num_epochs}] - Loss: {running_loss / len(train_data_loader)}')

print('Finished Training')

AttributeError: 'DataLoader' object has no attribute 'shape'