In [1]:
!pip install pandas
!pip install numpy
!pip install matplotlib
!pip install tensorflow
!pip install scikit-learn
!pip install seaborn


Collecting tensorflow
  Obtaining dependency information for tensorflow from https://files.pythonhosted.org/packages/80/6f/57d36f6507e432d7fc1956b2e9e8530c5c2d2bfcd8821bcbfae271cd6688/tensorflow-2.14.0-cp311-cp311-win_amd64.whl.metadata
  Using cached tensorflow-2.14.0-cp311-cp311-win_amd64.whl.metadata (3.3 kB)
Collecting tensorflow-intel==2.14.0 (from tensorflow)
  Obtaining dependency information for tensorflow-intel==2.14.0 from https://files.pythonhosted.org/packages/ad/6e/1bfe367855dd87467564f7bf9fa14f3b17889988e79598bc37bf18f5ffb6/tensorflow_intel-2.14.0-cp311-cp311-win_amd64.whl.metadata
  Using cached tensorflow_intel-2.14.0-cp311-cp311-win_amd64.whl.metadata (4.8 kB)
Collecting absl-py>=1.0.0 (from tensorflow-intel==2.14.0->tensorflow)
  Obtaining dependency information for absl-py>=1.0.0 from https://files.pythonhosted.org/packages/01/e4/dc0a1dcc4e74e08d7abedab278c795eef54a224363bb18f5692f416d834f/absl_py-2.0.0-py3-none-any.whl.metadata
  Using cached absl_py-2.0.0-py3-none-

   --------------------- ---------------- 162.2/284.2 MB 654.0 kB/s eta 0:03:07
   --------------------- ---------------- 162.3/284.2 MB 652.1 kB/s eta 0:03:07
   --------------------- ---------------- 162.3/284.2 MB 651.5 kB/s eta 0:03:08
   --------------------- ---------------- 162.3/284.2 MB 650.1 kB/s eta 0:03:08
   --------------------- ---------------- 162.3/284.2 MB 650.2 kB/s eta 0:03:08
   --------------------- ---------------- 162.4/284.2 MB 647.6 kB/s eta 0:03:09
   --------------------- ---------------- 162.4/284.2 MB 646.9 kB/s eta 0:03:09
   --------------------- ---------------- 162.4/284.2 MB 646.3 kB/s eta 0:03:09
   --------------------- ---------------- 162.5/284.2 MB 644.4 kB/s eta 0:03:09
   --------------------- ---------------- 162.5/284.2 MB 643.7 kB/s eta 0:03:10
   --------------------- ---------------- 162.5/284.2 MB 644.4 kB/s eta 0:03:09
   --------------------- ---------------- 162.5/284.2 MB 643.1 kB/s eta 0:03:10
   --------------------- ---------------



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Flatten
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_curve, auc
import seaborn as sns
import logging
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Send training progress messages to training.log (INFO and above).
logging.basicConfig(
    filename='training.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# Read the Shahmukhi Punjabi dataset from the Excel workbook.
file_path = 'concatenated_data.xlsx'
df = pd.read_excel(file_path)

# Take the 'Shahmukhi' text column and keep only real strings, so that empty
# cells (NaN) and numeric cells never reach the tokenizer.
corpus = list(filter(lambda cell: isinstance(cell, str), df['Shahmukhi'].tolist()))

# Hold out 20% of the sentences for validation; fixed seed for a repeatable split.
X_train, X_valid = train_test_split(corpus, test_size=0.2, random_state=42)

# Build the vocabulary from the training sentences only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Size of the model's input/output vocabulary: one more than the number of
# entries in word_index (presumably because Keras word ids start at 1 — confirm).
total_words = 1 + len(tokenizer.word_index)

# CBOW training pairs with a window of one word on each side: for every
# interior token, the context is [left neighbour, right neighbour] and the
# target is the token itself. Sentences shorter than three tokens yield nothing.
input_sequences = [
    ([left_id, right_id], center_id)
    for token_ids in tokenizer.texts_to_sequences(X_train)
    for left_id, center_id, right_id in zip(token_ids, token_ids[1:], token_ids[2:])
]

# Shuffle once, in place, so pairs from the same sentence are not adjacent.
np.random.shuffle(input_sequences)

# Mini-batch size handed to model.fit. Targets stay as integer class ids (see
# the sparse loss below), so no batch x vocabulary one-hot matrix is ever
# allocated and the batch size no longer has to be tuned around a MemoryError.
batch_size = 1024
num_epochs = 30

# Stop when validation loss has not improved for 5 epochs and roll the model
# back to the best weights seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Save the best model (lowest val_loss, the ModelCheckpoint default monitor).
# NOTE(review): Keras chooses the on-disk format from the file extension;
# '.bin' is not a Keras extension, so confirm that the second artifact is in
# the format downstream tools expect.
model_checkpoint = ModelCheckpoint('shahmukhi_Cbow_model_best.h5', save_best_only=True)
model_checkpoint_bin = ModelCheckpoint('shahmukhi_Cbow_model_best.bin', save_best_only=True)

# CBOW network: embed the two context ids, concatenate their vectors, and
# predict the centre word with a softmax over the whole vocabulary.
model = Sequential()
model.add(Embedding(total_words, 100, input_length=2))
model.add(Flatten())
model.add(Dense(total_words, activation='softmax'))
# sparse_categorical_crossentropy is the same objective as
# categorical_crossentropy on one-hot targets, but takes integer class ids.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

if not input_sequences:
    raise ValueError('No CBOW training pairs were generated; every training text has fewer than 3 tokens.')

# Contexts as an (N, 2) id matrix, targets as an (N,) id vector.
X_train_ctx = np.array([context for context, _ in input_sequences], dtype='int32')
y_train_tgt = np.array([target for _, target in input_sequences], dtype='int32')

# Validation pairs are built exactly like the training pairs. They give the
# callbacks above the val_loss they monitor.
val_contexts = []
val_targets = []
for val_token_ids in tokenizer.texts_to_sequences(X_valid):
    for pos in range(1, len(val_token_ids) - 1):
        val_contexts.append([val_token_ids[pos - 1], val_token_ids[pos + 1]])
        val_targets.append(val_token_ids[pos])

if val_targets:
    validation_data = (
        np.array(val_contexts, dtype='int32'),
        np.array(val_targets, dtype='int32'),
    )
else:
    # Without validation pairs Keras trains normally, but the callbacks have
    # no val_loss to monitor and will only emit warnings.
    validation_data = None

# A single fit call lets Keras handle batching (including the final partial
# batch), aggregate metrics over every batch of an epoch, and drive the
# callbacks across epochs.
history = model.fit(
    X_train_ctx,
    y_train_tgt,
    batch_size=batch_size,
    epochs=num_epochs,
    validation_data=validation_data,
    callbacks=[early_stopping, model_checkpoint, model_checkpoint_bin],
    verbose=2,
)

# Per-epoch training loss and accuracy, averaged over the whole epoch by Keras.
loss_history = list(history.history['loss'])
accuracy_history = list(history.history['accuracy'])

# Record the per-epoch figures in training.log (Keras already printed them).
for epoch, (loss, accuracy) in enumerate(zip(loss_history, accuracy_history), start=1):
    logging.info(f'Epoch {epoch}: Loss={loss:.4f}, Accuracy={accuracy:.4f}')

# Build validation (context, target) pairs the same way the training pairs
# were built, so the model is fed (N, 2) context windows and its predictions
# line up one-to-one with the true centre words.
valid_contexts = []
valid_targets = []
for valid_token_ids in tokenizer.texts_to_sequences(X_valid):
    for pos in range(1, len(valid_token_ids) - 1):
        valid_contexts.append([valid_token_ids[pos - 1], valid_token_ids[pos + 1]])
        valid_targets.append(valid_token_ids[pos])

if not valid_targets:
    raise ValueError('No CBOW validation pairs were generated; every validation text has fewer than 3 tokens.')

X_valid_ctx = np.array(valid_contexts, dtype='int32')
y_valid = np.array(valid_targets, dtype='int32')  # true centre-word ids, shape (N,)
y_pred = model.predict(X_valid_ctx, batch_size=batch_size)  # class probabilities, shape (N, total_words)
y_pred_labels = np.argmax(y_pred, axis=1)

# A vocabulary x vocabulary matrix is neither storable nor readable for a real
# corpus, so restrict the matrix to the most frequent true classes. Samples
# whose true or predicted id falls outside this set are not counted.
top_k = 20
class_ids, class_counts = np.unique(y_valid, return_counts=True)
top_classes = class_ids[np.argsort(-class_counts, kind='stable')[:top_k]]
confusion = confusion_matrix(y_valid, y_pred_labels, labels=top_classes)

# Create a heatmap of the confusion matrix (ticks are tokenizer word ids).
plt.figure(figsize=(8, 6))
sns.heatmap(confusion, cmap='coolwarm', annot=True, fmt='d', xticklabels=top_classes, yticklabels=top_classes)
plt.xlabel('Predicted token id')
plt.ylabel('True token id')
plt.title('Confusion Matrix')
plt.savefig('confusion_matrix.png')  # Save the confusion matrix plot as an image
plt.show()  # Display the plot in the notebook

# Plot the loss and accuracy over epochs, side by side in one figure.
fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(12, 6))

ax_loss.plot(range(1, len(loss_history) + 1), loss_history)
ax_loss.set_xlabel('Epoch')
ax_loss.set_ylabel('Loss')
ax_loss.set_title('Loss Over Epochs')

ax_acc.plot(range(1, len(accuracy_history) + 1), accuracy_history)
ax_acc.set_xlabel('Epoch')
ax_acc.set_ylabel('Accuracy')
ax_acc.set_title('Accuracy Over Epochs')

# Finalise the layout BEFORE saving, so the image files match what is shown.
fig.tight_layout()

# Save each panel to its own file. Saving the whole figure mid-construction
# would put a half-empty canvas in loss_plot.png and both panels in
# accuracy_plot.png. The crop box is the panel's tight bounding box (axes plus
# labels and title), converted from pixels to inches and padded by 5%.
fig.canvas.draw()
renderer = fig.canvas.get_renderer()
for panel, filename in ((ax_loss, 'loss_plot.png'), (ax_acc, 'accuracy_plot.png')):
    panel_box = panel.get_tightbbox(renderer).transformed(fig.dpi_scale_trans.inverted())
    fig.savefig(filename, bbox_inches=panel_box.expanded(1.05, 1.05))

# precision_recall_curve and roc_curve are binary-only, while this is a
# multiclass problem: y_valid holds one true class id per sample and y_pred one
# probability row per sample. Micro-average instead: every (sample, class) cell
# becomes one binary decision ("is this class the true one?") scored by the
# predicted probability for that class.
y_true_ids = np.asarray(y_valid).astype(int).ravel()
y_scores = np.asarray(y_pred)
if y_scores.ndim != 2 or y_scores.shape[0] != y_true_ids.shape[0]:
    raise ValueError(
        f'y_pred must hold one probability row per label in y_valid; '
        f'got y_pred shape {y_scores.shape} for {y_true_ids.shape[0]} labels.'
    )
if y_true_ids.size and (y_true_ids.min() < 0 or y_true_ids.max() >= y_scores.shape[1]):
    raise ValueError('y_valid contains class ids outside the range of y_pred columns.')

# One-hot indicator matrix of the true classes (int8 to keep it small), then
# flatten both matrices so they line up cell by cell.
y_true_onehot = np.zeros(y_scores.shape, dtype=np.int8)
y_true_onehot[np.arange(y_true_ids.shape[0]), y_true_ids] = 1
y_true_flat = y_true_onehot.ravel()
y_score_flat = y_scores.ravel()

# Compute and plot the micro-averaged precision-recall curve.
# No point markers: the curve can have one point per distinct score.
plt.figure(figsize=(8, 6))
precision, recall, _ = precision_recall_curve(y_true_flat, y_score_flat)
plt.plot(recall, precision)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve (micro-averaged)')
plt.savefig('precision_recall_curve.png')  # Save the precision-recall curve plot as an image

# Compute and plot the micro-averaged ROC curve.
plt.figure(figsize=(8, 6))
fpr, tpr, _ = roc_curve(y_true_flat, y_score_flat)
roc_auc = auc(fpr, tpr)
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title(f'ROC Curve, micro-averaged (AUC = {roc_auc:.2f})')
plt.savefig('roc_curve.png')  # Save the ROC curve plot as an image

# Show the plots
plt.show()
