In [20]:
## Authors: Raja Batra And Eli Rejto
## October 19, 2023

In [21]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader, TensorDataset
import numpy as np
import pandas as pd
import re

# Part 1: Basic System with fixed-length inputs

### Convert text to lowercase and remove all punctuation except “.” so the data only contains alphabet characters, whitespace, and periods

In [22]:
def englishsplitintosentences(text):
    sentences = re.split(r'=', text)
    return sentences
    
def spanishsplitintosentences(text):
    sentences = re.findall(r'\*(.*?)\#', text)
    return sentences

def preprocesssentence(sentence):
    sentence = sentence.lower()  # Convert to lowercase
    sentence = ''.join(char if char.isalpha() or char.isspace() or char == '.' else ' ' for char in sentence)  # Remove punctuation
    return sentence

In [23]:
# Apply preprocessing to all English sentences
english_file_path = 'dataforbuildingmodel/someenglishtext'
with open(english_file_path, 'r', encoding='latin-1') as file:
    english_text = file.read()

english_sentences = englishsplitintosentences(english_text)


english_sentences = [preprocesssentence(sentence) for sentence in english_sentences]


In [24]:
print(english_sentences[0:3])

[' \n ', ' valkyria chronicles iii ', ' \n \n senjå  no valkyria      unk  chronicles   japanese   æ  å\xa0 ã  ã  ã  ã  ã  ã  ã ªã      lit . valkyria of the battlefield       commonly referred to as valkyria chronicles iii outside japan   is a tactical role     playing video game developed by sega and media.vision for the playstation portable . released in january      in japan   it is the third game in the valkyria series . employing the same fusion of tactical and real     time gameplay as its predecessors   the story runs parallel to the first game and follows the   nameless     a penal military unit serving the nation of gallia during the second europan war who perform secret black operations and are pitted against the imperial unit    unk  raven   . \n the game began development in        carrying over a large portion of the work done on valkyria chronicles ii . while it retained the standard features of the series   it also underwent multiple adjustments   such as making the gam

In [25]:
# Apply preprocessing to all Spanish sentences
spanish_file_path = 'dataforbuildingmodel/somespanishtext'
with open(spanish_file_path, 'r', encoding='latin-1') as file:
    spanish_text = file.read()
spanish_sentences = spanishsplitintosentences(spanish_text)


spanish_sentences = [preprocesssentence(sentence) for sentence in spanish_sentences]

    

In [26]:
processedspanishtext = ' '.join(spanish_sentences)

processedenglishtext = ' '.join(english_sentences)

### Determine a set of unique characters and map all characters to integers

In [27]:
unique_chars = sorted(set(processedenglishtext + processedspanishtext))

In [28]:
print(unique_chars)

['\n', ' ', '.', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\xa0', 'ª', 'µ', 'á', 'â', 'ã', 'å', 'æ', 'î', 'ï']


### map characters to integers

In [29]:
chardict = {char: idx for idx, char in enumerate(unique_chars)}


def texttoint(text, chardict):
    mappedtext = []
    for char in text:
        mappedtext.append(chardict[char])
    return mappedtext

mappedspanishtext = texttoint(processedspanishtext, chardict)
mappedspanishlabels = [1] * len(mappedspanishtext)
mappedenglishtext = texttoint(processedenglishtext, chardict)

mappedenlishlabels = [0] * len(mappedenglishtext)
  

In [30]:
print(mappedenglishtext[0:100])
print(mappedenglishtext[100:200])

[1, 0, 1, 1, 1, 24, 3, 14, 13, 27, 20, 11, 3, 1, 5, 10, 20, 17, 16, 11, 5, 14, 7, 21, 1, 11, 11, 11, 1, 1, 1, 0, 1, 0, 1, 21, 7, 16, 12, 35, 1, 1, 16, 17, 1, 24, 3, 14, 13, 27, 20, 11, 3, 1, 1, 1, 1, 1, 1, 23, 16, 13, 1, 1, 5, 10, 20, 17, 16, 11, 5, 14, 7, 21, 1, 1, 1, 12, 3, 18, 3, 16, 7, 21, 7, 1, 1, 1, 36, 1, 1, 35, 29, 1, 34, 1, 1, 34, 1, 1]
[34, 1, 1, 34, 1, 1, 34, 1, 1, 34, 1, 1, 34, 1, 30, 34, 1, 1, 1, 1, 1, 1, 14, 11, 22, 1, 2, 1, 24, 3, 14, 13, 27, 20, 11, 3, 1, 17, 8, 1, 22, 10, 7, 1, 4, 3, 22, 22, 14, 7, 8, 11, 7, 14, 6, 1, 1, 1, 1, 1, 1, 1, 5, 17, 15, 15, 17, 16, 14, 27, 1, 20, 7, 8, 7, 20, 20, 7, 6, 1, 22, 17, 1, 3, 21, 1, 24, 3, 14, 13, 27, 20, 11, 3, 1, 5, 10, 20, 17, 16]


### splitting into training and validation data and making chunks within data

In [31]:
chunk_length = 100  
#text_to_int_sequence(train_english[i:i+chunk_length], char_to_int) for i in range(0, len(train_english), chunk_length)
trainenglishchunk = [mappedenglishtext[i:i+chunk_length] for i in range(0, len(mappedenglishtext), chunk_length)]
trainingenglishlabels = [0] * len(trainenglishchunk)
trainspanishchunk = [mappedspanishtext[i:i+chunk_length] for i in range(0, len(mappedspanishtext), chunk_length)]
trainingspanishlabels = [1] * len(trainspanishchunk)



In [46]:
df1 = pd.DataFrame({'chunk': trainenglishchunk, 'language': 0})

df2 = pd.DataFrame({'chunk': trainspanishchunk, 'language': 1})

# Concatenate the two DataFrames
combined_df = pd.concat([df1, df2], ignore_index=True)


#Split Training and Validation Data
Training_Fraction = 0.8

training_df = combined_df.sample(frac = Training_Fraction)

valid_df = combined_df.drop(training_df.index)


In [50]:
train_loader = DataLoader(training_df, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_df, batch_size=batch_size, shuffle=False)

## Train 1 Layer Model

In [51]:
#loosely based on https://www.kaggle.com/code/mehmetlaudatekman/lstm-text-classification-pytorch
class LanguageClassificationModel(nn.Module):
    def __init__(self, input_size, embeddingdim, hidden_size, num_classes):
        super(LanguageClassificationModel, self).__init__()
        self.embedding = nn.Embedding(input_size, embeddingdim)
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.linear = nn.Linear(hidden_size, num_classes)
        

  

    def forward(self, x):
        
        embedded = self.embedding(x)
        embedded_swapped = torch.swapaxes(embedded,0,1)
        lstm_out, state_out = self.lstm(embedded_swapped)
        output = self.linear(lstm_out[:, -1, :])
        output_swapped = torch.swapaxes(output,0,1)

        return output_swapped,state_out

In [52]:
# hyperparameters for parameter
input_size = len(chardict)  
hidden_size = 32
num_classes = 2  # English and Spanish
learning_rate = 0.001
dropout = 0.8
batch_size = 32
embedding_dim = 100
num_epochs = 10
#TensorDataset(torch.stack(train_sequences), train_labels)



In [53]:
model = LanguageClassificationModel(input_size, embedding_dim, hidden_size, num_classes)

In [60]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


In [61]:
print(train_loader.data[1])

AttributeError: 'DataLoader' object has no attribute 'data'

In [56]:
# for batch_x, batch_y in train_loader:
#     # batch_x is a batch of data
#     # batch_y is a batch of labels
#     print("Batch of data:")
#     print(batch_x)
#     print("Batch of labels:")
#     print(batch_y)
#     print("Shape of data batch:", batch_x.shape)
#     print("Shape of labels batch:", batch_y.shape)

In [57]:
for epoch in range(num_epochs):
    model.train()
  
    running_loss = 0.0
    for batch_x, batch_y in train_loader:

        
        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Print the average loss for the epoch
    print(f'Epoch [{epoch + 1}/{num_epochs}] - Loss: {running_loss / len(train_data_loader)}')

print('Finished Training')

KeyError: 3362