In [1]:
import pandas as pd

# Load your CSV file
df = pd.read_csv('asmm_engg.csv')

# Random sampling
reduced_df = df.sample(n=5000, random_state=1)  # Adjust random_state for reproducibility

# Save the reduced dataset to a new CSV file
reduced_df.to_csv('asm_eng.csv', index=False)


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from nltk.translate.bleu_score import corpus_bleu

# Load and preprocess the data
data = pd.read_csv('asm_eng.csv')
data.dropna(inplace=True)

# Data analysis
plt.figure(figsize=(8, 6))
sns.histplot(data['eng'].str.len(), bins=20, kde=True)
plt.xlabel('English Sentence Length')
plt.ylabel('Count')
plt.title('Distribution of English Sentence Lengths')
plt.show()

# Tokenization and vocabulary creation
tokenizer_eng = Tokenizer()
tokenizer_eng.fit_on_texts(data['eng'])
vocab_size_eng = len(tokenizer_eng.word_index) + 1

tokenizer_asm = Tokenizer()
tokenizer_asm.fit_on_texts(data['asm'])
vocab_size_asm = len(tokenizer_asm.word_index) + 1

# Convert sentences to sequences
max_length = 50
X = tokenizer_eng.texts_to_sequences(data['eng'])
X = pad_sequences(X, maxlen=max_length, padding='post')

y = tokenizer_asm.texts_to_sequences(data['asm'])
y = pad_sequences(y, maxlen=max_length, padding='post')

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the LSTM model
embedding_dim = 100
model = Sequential()
model.add(Embedding(vocab_size_eng, embedding_dim, input_length=max_length))
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(64))
model.add(Dropout(0.2))
model.add(Dense(vocab_size_asm, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Train the model
batch_size = 64
epochs = 10
history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.1)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss:.4f}')
print(f'Test Accuracy: {accuracy:.4f}')

# Inference and translation
def translate_sentence(sentence):
    sequence = tokenizer_eng.texts_to_sequences([sentence])
    sequence = pad_sequences(sequence, maxlen=max_length, padding='post')
    prediction = model.predict(sequence)
    predicted_sentence = ' '.join([tokenizer_asm.index_word[idx] for idx in np.argmax(prediction, axis=1)])
    return predicted_sentence

# Example usage
english_sentence = "I love learning new things."
assamese_translation = translate_sentence(english_sentence)
print(f'English: {english_sentence}')
print(f'Assamese: {assamese_translation}')

ModuleNotFoundError: No module named 'keras.preprocessing.text'

In [3]:
import pandas as pd

# Load the dataset
df = pd.read_csv('asm_eng.csv')


In [4]:
# Check the first few rows of the dataframe
print(df.head())

# Check the shape of the dataframe
print("Shape of the dataframe:", df.shape)

# Check for any missing values
print("Missing values:")
print(df.isnull().sum())


                                                 asm  \
0  এই বিলায়কবোৰ ব্যৱহাৰ কৰাৰ আগতে গন্ধকৰ অম্লৰে ...   
1  ৰাজ পৰিয়ালৰ ৰাণীসকলৰ বসবাসৰ কাৰণে সপ্তদশ শতাব...   
2  পানীৰ অভাৱত এই টক্সিন নিজৰ সম্পূৰ্ণ প্ৰভাৱ দেখ...   
3  অটনোমিক ন্যুৰোপেথীত ঘাম নাহে , ৰক্ত প্ৰবাহ প্ৰ...   
4  এই উপলক্ষে মাহজুৰি সমগ্ৰ য়ুনানত অবিভাজ্য শান্ত...   

                                                 eng  
0  before using  these solvents  they  are treate...  
1  the vilaas temple  was built  in th century  f...  
2  in water deficiency  these toxins  show  their...  
3  you  do not sweat  in autonomic neuropathy ,  ...  
4  on this ocassion  unbroken peace  was used  to...  
Shape of the dataframe: (5000, 2)
Missing values:
asm    0
eng    0
dtype: int64


In [5]:
from sklearn.model_selection import train_test_split

# Split data into train and test sets (e.g., 80% train, 20% test)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Reset index for both dataframes
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

print("Training data shape:", train_df.shape)
print("Testing data shape:", test_df.shape)


Training data shape: (4000, 2)
Testing data shape: (1000, 2)


In [7]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding

# Load the dataset
df = pd.read_csv('asm_eng.csv')

# Split data into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# Tokenize text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_df['eng'])
train_sequences = tokenizer.texts_to_sequences(train_df['eng'])
test_sequences = tokenizer.texts_to_sequences(test_df['eng'])

# Pad sequences
max_length = 20
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding='post')
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding='post')

# Convert labels to integers
train_labels = train_df['asm'].astype(int)
test_labels = test_df['asm'].astype(int)

# Define model architecture
vocab_size = len(tokenizer.word_index) + 1
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=max_length))
model.add(LSTM(256))
model.add(Dense(vocab_size, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
history = model.fit(train_padded, train_labels, epochs=20, batch_size=64, validation_data=(test_padded, test_labels))

# Evaluate the model
loss, accuracy = model.evaluate(test_padded, test_labels)
print("Test Accuracy:", accuracy)


ValueError: invalid literal for int() with base 10: 'কিছুমান লোকে এই ৰোগক মানসিক স্বাস্থ্যৰ কেঞ্চাৰ হিচাপেও গণ্য কৰে ।'

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding

# Load the dataset
df = pd.read_csv('asm_eng.csv')

# Split data into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# Tokenize English text
tokenizer_eng = Tokenizer()
tokenizer_eng.fit_on_texts(train_df['eng'])
vocab_size_eng = len(tokenizer_eng.word_index) + 1

train_sequences_eng = tokenizer_eng.texts_to_sequences(train_df['eng'])
test_sequences_eng = tokenizer_eng.texts_to_sequences(test_df['eng'])

max_length_eng = max(len(seq) for seq in train_sequences_eng)
train_padded_eng = pad_sequences(train_sequences_eng, maxlen=max_length_eng, padding='post')
test_padded_eng = pad_sequences(test_sequences_eng, maxlen=max_length_eng, padding='post')

# Tokenize Assamese text
tokenizer_asm = Tokenizer()
tokenizer_asm.fit_on_texts(train_df['asm'])
vocab_size_asm = len(tokenizer_asm.word_index) + 1

train_sequences_asm = tokenizer_asm.texts_to_sequences(train_df['asm'])
test_sequences_asm = tokenizer_asm.texts_to_sequences(test_df['asm'])

max_length_asm = max(len(seq) for seq in train_sequences_asm)
train_padded_asm = pad_sequences(train_sequences_asm, maxlen=max_length_asm, padding='post')
test_padded_asm = pad_sequences(test_sequences_asm, maxlen=max_length_asm, padding='post')

# Define model architecture
model = Sequential()
model.add(Embedding(input_dim=vocab_size_eng, output_dim=100, input_length=max_length_eng))
model.add(LSTM(256))
model.add(Dense(vocab_size_asm, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Display model summary
print(model.summary())

# Train the model
history = model.fit(train_padded_eng, train_padded_asm, epochs=20, batch_size=64, validation_data=(test_padded_eng, test_padded_asm))

# Evaluate the model
loss, accuracy = model.evaluate(test_padded_eng, test_padded_asm)
print("Test Accuracy:", accuracy)




None
Epoch 1/20


ValueError: Argument `output` must have rank (ndim) `target.ndim - 1`. Received: target.shape=(None, 55), output.shape=(None, 14554)

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding

# Load the dataset
df = pd.read_csv('asm_eng.csv')

# Split data into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# Tokenize English text
tokenizer_eng = Tokenizer()
tokenizer_eng.fit_on_texts(train_df['eng'])
vocab_size_eng = len(tokenizer_eng.word_index) + 1

train_sequences_eng = tokenizer_eng.texts_to_sequences(train_df['eng'])
test_sequences_eng = tokenizer_eng.texts_to_sequences(test_df['eng'])

max_length_eng = max(len(seq) for seq in train_sequences_eng)
train_padded_eng = pad_sequences(train_sequences_eng, maxlen=max_length_eng, padding='post')
test_padded_eng = pad_sequences(test_sequences_eng, maxlen=max_length_eng, padding='post')

# Tokenize Assamese text
tokenizer_asm = Tokenizer()
tokenizer_asm.fit_on_texts(train_df['asm'])
vocab_size_asm = len(tokenizer_asm.word_index) + 1

train_sequences_asm = tokenizer_asm.texts_to_sequences(train_df['asm'])
test_sequences_asm = tokenizer_asm.texts_to_sequences(test_df['asm'])

max_length_asm = max(len(seq) for seq in train_sequences_asm)
train_padded_asm = pad_sequences(train_sequences_asm, maxlen=max_length_asm, padding='post')
test_padded_asm = pad_sequences(test_sequences_asm, maxlen=max_length_asm, padding='post')

# Define model architecture
model = Sequential()
model.add(Embedding(input_dim=vocab_size_eng, output_dim=100, input_length=max_length_eng))
model.add(LSTM(256, return_sequences=True))  # return_sequences=True for sequence-to-sequence
model.add(Dense(vocab_size_asm, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Display model summary
print(model.summary())

# Train the model
history = model.fit(train_padded_eng, train_padded_asm, epochs=20, batch_size=64, validation_data=(test_padded_eng, test_padded_asm))

# Evaluate the model
loss, accuracy = model.evaluate(test_padded_eng, test_padded_asm)
print("Test Accuracy:", accuracy)




None
Epoch 1/20


ValueError: Arguments `target` and `output` must have the same shape up until the last dimension: target.shape=(None, 55), output.shape=(None, 65, 14554)

In [13]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import pandas as pd
data = pd.read_csv("asm_eng.csv")
english_sentences = data["eng"].tolist()
asm_sentences = data["asm"].tolist()
asm_sentences = [str(sentence) for sentence in asm_sentences]
english_sentences = [str(sentence) for sentence in english_sentences]

In [14]:
tokenizer_eng = Tokenizer()
tokenizer_eng.fit_on_texts(english_sentences)
eng_seq = tokenizer_eng.texts_to_sequences(english_sentences)

tokenizer_asm = Tokenizer()
tokenizer_asm.fit_on_texts(asm_sentences)
asm_seq = tokenizer_asm.texts_to_sequences(asm_sentences)

vocab_size_eng = len(tokenizer_eng.word_index) + 1
vocab_size_asm = len(tokenizer_asm.word_index) + 1

# Padding
max_length = max(len(seq) for seq in eng_seq + asm_seq)
eng_seq_padded = pad_sequences(eng_seq, maxlen=max_length, padding='post')
asm_seq_padded = pad_sequences(asm_seq, maxlen=max_length, padding='post')

In [15]:
embedding_dim = 256
units = 512

# Encoder
encoder_inputs = Input(shape=(max_length,))
enc_emb = Embedding(input_dim=vocab_size_eng, output_dim=embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(max_length,))
dec_emb_layer = Embedding(input_dim=vocab_size_asm, output_dim=embedding_dim)
dec_emb = dec_emb_layer(decoder_inputs)
decoder_lstm = LSTM(units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = Dense(vocab_size_asm, activation='softmax')
output = decoder_dense(decoder_outputs)

# Model
model = Model([encoder_inputs, decoder_inputs], output)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',metrics=['accuracy'])

In [18]:
X_train, X_val, y_train, y_val = train_test_split(eng_seq_padded, asm_seq_padded, test_size=0.2)
model.fit([X_train, X_train], y_train, validation_data=([X_val, X_val], y_val), epochs=10, batch_size=64)

Epoch 1/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m317s[0m 5s/step - accuracy: 0.7943 - loss: 1.9486 - val_accuracy: 0.7956 - val_loss: 1.8031
Epoch 2/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m323s[0m 5s/step - accuracy: 0.7970 - loss: 1.7339 - val_accuracy: 0.7959 - val_loss: 1.8100
Epoch 3/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m343s[0m 5s/step - accuracy: 0.7996 - loss: 1.6873 - val_accuracy: 0.7959 - val_loss: 1.8128
Epoch 4/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m327s[0m 5s/step - accuracy: 0.7980 - loss: 1.6724 - val_accuracy: 0.7944 - val_loss: 1.8218
Epoch 5/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m302s[0m 5s/step - accuracy: 0.7957 - loss: 1.6717 - val_accuracy: 0.7949 - val_loss: 1.8354
Epoch 6/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m302s[0m 5s/step - accuracy: 0.7964 - loss: 1.6445 - val_accuracy: 0.7950 - val_loss: 1.8625
Epoch 7/10
[1m63/63[0m [32m━━━━

<keras.src.callbacks.history.History at 0x23b286690d0>

In [None]:
def translate_sentence(sentence):
    seq = tokenizer_eng.texts_to_sequences([sentence])
    padded = pad_sequences(seq, maxlen=max_length, padding='post')
    translated = np.argmax(model.predict([padded, padded]), axis=-1)
    
    translated_sentence = []
    for i in translated[0]:
        if i in tokenizer_asm.index_word:
            translated_sentence.append(tokenizer_asm.index_word[i])
        else:
            translated_sentence.append(' ') 
        
    return ' '.join(translated_sentence)
while True:
    input_sentence = input()
    translated_sentence = translate_sentence(input_sentence)
    print(f"Input: {input_sentence}")
    print(f"Translated: {translated_sentence}")

 from here on


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 143ms/step
Input: from here on
Translated: এই পৰা পৰা ।                                                                                                                          


 pain


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 121ms/step
Input: pain
Translated: এই । ।                                                                                                                            
