# Day6-NLP Practice Assignment
    1.	Build an NLP language model for text generation by training a neural network to predict the next word in a sequence of words.
    2.	Build a Speech to Text model.
    3.	Build a Text to Speech model.
    4.	Build an NLP language model to detect sentence/word errors in a text corpus.
    5.	Build a Language model to correct the error in the text

# 1.	Build an NLP language model for text generation by training a neural network to predict the next word in a sequence of words.
    

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fix the random seeds so weight initialisation (and therefore the generated
# text) is the same on every Restart & Run All.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Sample data
corpus = [
    "The quick brown fox",
    "jumps over the lazy dog",
    "She sells seashells by the seashore"
]

# Tokenization: learn the word -> integer index mapping from the corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
# +1 so the one-hot targets and the Embedding also cover index 0, which is the
# value pad_sequences inserts and which is never assigned to a real word.
total_words = len(tokenizer.word_index) + 1

# Create input sequences and targets: for each line, every prefix of two or
# more tokens becomes one training example (the last token is the target).
input_sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        input_sequences.append(token_list[:i + 1])

# Left-pad every example to the longest prefix so they stack into one array.
max_sequence_length = max(len(seq) for seq in input_sequences)
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre')

# Everything but the last column is the context; the last column is the word
# to predict, one-hot encoded over the vocabulary.
x, y = input_sequences[:, :-1], input_sequences[:, -1]
y = tf.keras.utils.to_categorical(y, num_classes=total_words)

# Model Architecture: embedding -> LSTM -> softmax over the vocabulary
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_length-1))
model.add(LSTM(100))
model.add(Dense(total_words, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(x, y, epochs=100, verbose=2)

# Generate text completion: repeatedly predict the most likely next word and
# append it to the running text.
seed_text = "The quick"
next_words = 5

for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_length-1, padding='pre')

    predicted = int(np.argmax(model.predict(token_list, verbose=0)))

    # index_word is the tokenizer's own index -> word mapping; a direct lookup
    # replaces scanning word_index for the matching value.
    output_word = tokenizer.index_word.get(predicted, "")
    if not output_word:
        # Index 0 (padding) has no word; stop rather than appending blanks.
        break

    seed_text += " " + output_word

print(seed_text)

Epoch 1/100
1/1 - 4s - loss: 2.6363 - accuracy: 0.0833 - 4s/epoch - 4s/step
Epoch 2/100
1/1 - 0s - loss: 2.6278 - accuracy: 0.2500 - 13ms/epoch - 13ms/step
Epoch 3/100
1/1 - 0s - loss: 2.6190 - accuracy: 0.3333 - 17ms/epoch - 17ms/step
Epoch 4/100
1/1 - 0s - loss: 2.6100 - accuracy: 0.4167 - 12ms/epoch - 12ms/step
Epoch 5/100
1/1 - 0s - loss: 2.6007 - accuracy: 0.3333 - 10ms/epoch - 10ms/step
Epoch 6/100
1/1 - 0s - loss: 2.5909 - accuracy: 0.3333 - 10ms/epoch - 10ms/step
Epoch 7/100
1/1 - 0s - loss: 2.5806 - accuracy: 0.3333 - 12ms/epoch - 12ms/step
Epoch 8/100
1/1 - 0s - loss: 2.5697 - accuracy: 0.2500 - 12ms/epoch - 12ms/step
Epoch 9/100
1/1 - 0s - loss: 2.5579 - accuracy: 0.3333 - 11ms/epoch - 11ms/step
Epoch 10/100
1/1 - 0s - loss: 2.5453 - accuracy: 0.3333 - 10ms/epoch - 10ms/step
Epoch 11/100
1/1 - 0s - loss: 2.5316 - accuracy: 0.3333 - 11ms/epoch - 11ms/step
Epoch 12/100
1/1 - 0s - loss: 2.5168 - accuracy: 0.3333 - 11ms/epoch - 11ms/step
Epoch 13/100
1/1 - 0s - loss: 2.5006 - ac

# 2.	Build a Speech to Text model.
  

In [4]:
#pip install pyaudio
#pip install SpeechRecognition

import speech_recognition as sr

def speech_to_text():
    """Record one utterance from the microphone and transcribe it.

    Audio is captured with ``sr.Microphone`` and sent to
    ``Recognizer.recognize_google`` for transcription. Progress and results
    are printed, and the transcription is also returned so that callers can
    use it programmatically.

    Returns:
        The recognised text, or ``None`` when the audio could not be
        understood or the recognition request failed.
    """
    recognizer = sr.Recognizer()

    with sr.Microphone() as source:
        print("Say something:")
        audio = recognizer.listen(source)

    try:
        text = recognizer.recognize_google(audio)
    except sr.UnknownValueError:
        print("Speech Recognition could not understand audio")
        return None
    except sr.RequestError as e:
        print(f"Could not request results from Google Speech Recognition service; {e}")
        return None

    print("Text from speech:", text)
    return text

if __name__ == "__main__":
    speech_to_text()

Say something:
Text from speech: hello


# 3.	Build a Text to Speech model.   

In [1]:
# pip install gtts
from gtts import gTTS
import os

def text_to_speech(text, language='en', filename='output.mp3'):
    """Synthesise ``text`` to an audio file and open it with the default player.

    Args:
        text: The text to convert to speech.
        language: Language code passed to gTTS as ``lang``.
        filename: Path of the audio file to write.

    The file is opened with the platform's default handler. The filename is
    never interpolated into a shell command, so paths containing spaces or
    shell metacharacters are handled safely.
    """
    import subprocess
    import sys

    tts = gTTS(text=text, lang=language, slow=False)
    tts.save(filename)

    try:
        if sys.platform.startswith("win"):
            # Equivalent of the shell's `start <file>` without going through cmd.
            os.startfile(filename)
        elif sys.platform == "darwin":
            subprocess.run(["open", filename], check=False)
        else:
            subprocess.run(["xdg-open", filename], check=False)
    except OSError as exc:
        # Playback is best-effort: the audio file has already been saved.
        print(f"Saved {filename}, but could not open it automatically: {exc}")

if __name__ == "__main__":
    input_text = input('Enter text for Text-to-Speech: ')
    if input_text.strip():
        text_to_speech(input_text)
    else:
        # Skip synthesis for blank input instead of calling gTTS with nothing to speak.
        print("No text entered; nothing to speak.")

Enter text for Text-to-Speech: hiii how are you


# 4.	Build an NLP language model to detect sentence/word errors in a text corpus.

In [9]:
import re
import nltk
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Toy vocabulary: correctly spelled words and, position by position, a
# misspelled variant of each.
correct_words = ['hello', 'world', 'python', 'spell', 'language', 'model', 'check']
misspelled_words = ['helo', 'worl', 'pythoon', 'spl', 'langage', 'moel', 'chek']

# Full dataset: the correct words first, followed by the misspellings.
all_words = [*correct_words, *misspelled_words]

# Targets aligned with all_words (1 for correct, 0 for incorrect).
labels = [1 for _ in correct_words] + [0 for _ in misspelled_words]

# Preprocess the data
def preprocess_text(text):
    """Normalise ``text`` for the vectoriser.

    Drops every stand-alone single word character (a letter, digit or
    underscore with a word boundary on both sides) and lower-cases what is
    left. Surrounding whitespace is not touched.
    """
    lone_char = re.compile(r'\b\w\b')
    without_singles = lone_char.sub('', text)
    return without_singles.lower()

all_words = [preprocess_text(word) for word in all_words]

# Split the dataset into train and test. Stratifying keeps both classes
# represented on each side of this very small split.
x_train, x_test, y_train, y_test = train_test_split(
    all_words, labels, test_size=0.2, random_state=42, stratify=labels
)

# Vectorize the words with character n-grams rather than whole-word tokens.
# Every sample here is a single word, so a word-level bag of words gives each
# unseen word an all-zero vector and the classifier can only fall back on the
# class prior. Character n-grams let unseen words share features with the
# training words.
cv = CountVectorizer(analyzer='char_wb', ngram_range=(1, 3))
x_train_cv = cv.fit_transform(x_train)
x_test_cv = cv.transform(x_test)

# Classifier model with Naive Bayes Algorithm
clf = MultinomialNB()
clf.fit(x_train_cv, y_train)

# Test
y_pred = clf.predict(x_test_cv)

# Evaluate (NOTE: the test split holds only a handful of words, so this
# figure is a smoke test rather than a reliable estimate).
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Test the model with new examples
def detect_error(text):
    """Label ``text`` as "Correct" or "Incorrect" using the fitted classifier.

    The text goes through ``preprocess_text``, is vectorised with the fitted
    ``cv`` and classified by ``clf``; a predicted label of 1 maps to
    "Correct" and anything else to "Incorrect".
    """
    features = cv.transform([preprocess_text(text)])
    predicted_label = clf.predict(features)[0]
    if predicted_label == 1:
        return "Correct"
    return "Incorrect"

# Example inputs: one correctly spelled word and one misspelling of it
text1 = "hello"
text2 = "helo"

# Report the classifier's verdict for each example in turn
for sample in (text1, text2):
    print(f"Prediction for '{sample}': {detect_error(sample)}")


Accuracy: 33.33%
Prediction for 'hello': Correct
Prediction for 'helo': Incorrect


# 5.	Build a Language model to correct the error in the text

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# (model, tokenizer) pairs keyed by checkpoint name, so repeated calls reuse
# the already-loaded weights instead of reloading them every time.
_GPT2_CACHE = {}


def correct_errors_language_model(text):
    """Generate text from ``text`` with pre-trained GPT-2 using beam search.

    Args:
        text: The input text to feed to the model.

    Returns:
        The decoded output of ``model.generate`` with special tokens removed.
        Empty or whitespace-only input is returned unchanged, because there
        are no tokens to generate from.

    NOTE(review): 'gpt2' is a general-purpose causal language model and
    ``generate`` continues the prompt, so the result is not guaranteed to be
    a corrected version of the input. Consider a checkpoint fine-tuned for
    grammar/spelling correction.
    """
    if not text or not text.strip():
        return text

    # Load pre-trained GPT-2 model and tokenizer (once per kernel session)
    model_name = 'gpt2'
    if model_name not in _GPT2_CACHE:
        _GPT2_CACHE[model_name] = (
            GPT2LMHeadModel.from_pretrained(model_name),
            GPT2Tokenizer.from_pretrained(model_name),
        )
    model, tokenizer = _GPT2_CACHE[model_name]

    # Tokenize the input text; calling the tokenizer also yields the
    # attention mask that generate() expects.
    encoded = tokenizer(text, return_tensors='pt')

    # Generate text using beam search. top_k/top_p were dropped: they only
    # apply when sampling is enabled, which it is not here. GPT-2 has no pad
    # token, so the EOS token is supplied as the padding id.
    output = model.generate(
        encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        max_length=150,
        num_beams=5,
        no_repeat_ngram_size=2,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode the generated tokens back to text
    corrected_text = tokenizer.decode(output[0], skip_special_tokens=True)

    return corrected_text


# Define the input in this cell and actually call the model, so the cell runs
# on a fresh kernel without relying on variables left over from other cells.
input_text = "She go to school yesterday and buyed a new book."
corrected_text = correct_errors_language_model(input_text)

print("Original Text:")
print(input_text)
print("\nCorrected Text:")
print(corrected_text)
