<a href="https://colab.research.google.com/github/suhasamane1101/DL_Google_Colab/blob/main/Language_Identification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import files
uploaded = files.upload()

Saving dummy_language_identification.csv to dummy_language_identification.csv


In [4]:
# Step 1: Install Required Libraries
!pip install tensorflow numpy pandas scikit-learn matplotlib



In [5]:
# Step 2: Import Libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, SpatialDropout1D
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Step 3: Load the Dataset
data_path = '/content/dummy_language_identification.csv'  # Make sure to use the correct path to your CSV file
df = pd.read_csv(data_path)

# Check the structure of the DataFrame
print("Dataset preview:")
print(df.head())  # Show the first few rows
print("Columns in the dataset:", df.columns)  # Print the column names

# Step 4: Preprocess the Data
# Encoding the languages
label_encoder = LabelEncoder()
df['language_encoded'] = label_encoder.fit_transform(df['language'])

# Prepare text data
max_length = 50  # Maximum length of the text sequences
X = df['text_content'].values
y = df['language_encoded'].values

# Tokenization and padding
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(X)
X_sequences = tokenizer.texts_to_sequences(X)
X_padded = pad_sequences(X_sequences, maxlen=max_length)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)

# Step 5: Build the Model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128, input_length=max_length))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(len(label_encoder.classes_), activation='softmax'))  # Output layer

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Step 6: Train the Model
model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test))

# Step 7: Evaluate the Model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"\nTest Accuracy: {accuracy * 100:.2f}%")

# Step 8: Make Predictions
predictions = model.predict(X_test)
predicted_classes = np.argmax(predictions, axis=1)

# Display the predicted language for each test sample
for i in range(len(X_test)):
    print(f"Text: {X_test[i]} - Predicted Language: {label_encoder.inverse_transform([predicted_classes[i]])[0]}")


Dataset preview:
   text_id                                       text_content language
0        1             Dies ist ein Beispielsatz auf Deutsch.   German
1        2                  This is a sample text in English.  English
2        3  This is a sample text in English.This is a sam...  English
3        4  Este es un texto de ejemplo en español.Este es...  Spanish
4        5  Este es un texto de ejemplo en español.Este es...  Spanish
Columns in the dataset: Index(['text_id', 'text_content', 'language'], dtype='object')
Epoch 1/5




[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 343ms/step - accuracy: 0.2195 - loss: 1.3796 - val_accuracy: 0.8000 - val_loss: 1.3416
Epoch 2/5
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 108ms/step - accuracy: 0.8617 - loss: 1.3154 - val_accuracy: 0.9500 - val_loss: 1.2623
Epoch 3/5
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 99ms/step - accuracy: 0.9031 - loss: 1.1798 - val_accuracy: 0.9500 - val_loss: 1.1026
Epoch 4/5
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 111ms/step - accuracy: 0.9148 - loss: 0.9311 - val_accuracy: 0.9500 - val_loss: 0.8570
Epoch 5/5
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 99ms/step - accuracy: 0.9109 - loss: 0.6482 - val_accuracy: 1.0000 - val_loss: 0.5282
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - accuracy: 1.0000 - loss: 0.5282

Test Accuracy: 100.00%
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 307ms/step
Text: [ 0  0  