## 1. Build a NLP Language model for text generation involves train a neural network to predict the next word in a sequence of words.

In [6]:
#import libraries
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense

In [18]:
#Sample Data
text_data=["I am selvaraj","The quick brown fox jumps over the lazy dog "]
text_data

['I am selvaraj', 'The quick brown fox jumps over the lazy dog ']

In [8]:
#Tokenization
tokenizer=tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(text_data)
total_words=len(tokenizer.word_index)+1
print(total_words)

12


In [10]:

#Create input sequences and targets
input_sequences=[]
for line in text_data:
    token_list=tokenizer.texts_to_sequences([line])[0]
    for i in range(1,len(token_list)):
        n_gram_sequence=token_list[:i+1]
        input_sequences.append(n_gram_sequence)
max_sequence_length = max([len(seq) for seq in input_sequences])
input_sequences = tf.keras.preprocessing.sequence.pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre')
x, y = input_sequences[:, :-1], input_sequences[:, -1]
y = tf.keras.utils.to_categorical(y, num_classes=total_words)

In [11]:
from sys import meta_path
#Build the model
model=Sequential()
model.add(Embedding(total_words,100,input_length=max_sequence_length-1)) # input embedd
model.add(LSTM(100)) #hidden layer
model.add(Dense(total_words,activation='softmax')) #output layer
#Compile the model
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
#Train
model.fit(x,y,epochs=100,verbose=2)

Epoch 1/100
1/1 - 5s - loss: 2.4916 - accuracy: 0.1000 - 5s/epoch - 5s/step
Epoch 2/100
1/1 - 0s - loss: 2.4811 - accuracy: 0.1000 - 17ms/epoch - 17ms/step
Epoch 3/100
1/1 - 0s - loss: 2.4708 - accuracy: 0.2000 - 55ms/epoch - 55ms/step
Epoch 4/100
1/1 - 0s - loss: 2.4605 - accuracy: 0.2000 - 33ms/epoch - 33ms/step
Epoch 5/100
1/1 - 0s - loss: 2.4500 - accuracy: 0.3000 - 42ms/epoch - 42ms/step
Epoch 6/100
1/1 - 0s - loss: 2.4391 - accuracy: 0.2000 - 37ms/epoch - 37ms/step
Epoch 7/100
1/1 - 0s - loss: 2.4274 - accuracy: 0.4000 - 42ms/epoch - 42ms/step
Epoch 8/100
1/1 - 0s - loss: 2.4149 - accuracy: 0.5000 - 27ms/epoch - 27ms/step
Epoch 9/100
1/1 - 0s - loss: 2.4011 - accuracy: 0.5000 - 33ms/epoch - 33ms/step
Epoch 10/100
1/1 - 0s - loss: 2.3859 - accuracy: 0.5000 - 24ms/epoch - 24ms/step
Epoch 11/100
1/1 - 0s - loss: 2.3690 - accuracy: 0.5000 - 14ms/epoch - 14ms/step
Epoch 12/100
1/1 - 0s - loss: 2.3500 - accuracy: 0.4000 - 15ms/epoch - 15ms/step
Epoch 13/100
1/1 - 0s - loss: 2.3285 - ac

<keras.src.callbacks.History at 0x14937771b50>

In [16]:
seed_text = "i am"
next_words = 3

for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    
    # Truncate or pad sequences to match the expected length
    token_list = tf.keras.preprocessing.sequence.pad_sequences([token_list], maxlen=8, padding='pre', truncating='pre')
    
    predicted = np.argmax(model.predict(token_list, verbose=2))
    
    output_word = ""
    for word, index in tokenizer.word_index.items():
        if index == predicted:
            output_word = word
            break
    
    seed_text += " " + output_word

print(seed_text)


1/1 - 0s - 30ms/epoch - 30ms/step
1/1 - 0s - 26ms/epoch - 26ms/step
1/1 - 0s - 27ms/epoch - 27ms/step
i am selvaraj selvaraj selvaraj


# 

## 2. Build a Speech to Text model.

In [22]:
import speech_recognition as sr

In [21]:
pip install SpeechRecognition


Collecting SpeechRecognition
  Downloading SpeechRecognition-3.10.0-py2.py3-none-any.whl (32.8 MB)
     ---------------------------------------- 0.0/32.8 MB ? eta -:--:--
     --------------------------------------- 0.0/32.8 MB 991.0 kB/s eta 0:00:34
     ---------------------------------------- 0.1/32.8 MB 1.3 MB/s eta 0:00:25
     ---------------------------------------- 0.2/32.8 MB 1.5 MB/s eta 0:00:22
     ---------------------------------------- 0.3/32.8 MB 1.7 MB/s eta 0:00:20
      --------------------------------------- 0.5/32.8 MB 2.2 MB/s eta 0:00:15
      --------------------------------------- 0.7/32.8 MB 2.4 MB/s eta 0:00:14
     - -------------------------------------- 0.9/32.8 MB 2.8 MB/s eta 0:00:12
     - -------------------------------------- 1.1/32.8 MB 2.9 MB/s eta 0:00:11
     - -------------------------------------- 1.2/32.8 MB 3.0 MB/s eta 0:00:11
     - -------------------------------------- 1.4/32.8 MB 3.1 MB/s eta 0:00:11
     - -------------------------------

In [25]:
# Load the audio file
samp=sr.AudioFile("harvard.wav")
samp

<speech_recognition.AudioFile at 0x1493dcd1c90>

In [27]:
# With this as source,'record' the audio from the file
recog=sr.Recognizer()
with samp as source:
    audio=recog.record(samp)

#### Convert the audio to text

In [28]:
# Method 1: Using 'recognize_google' method,convert the audio to text
res=recog.recognize_google(audio)
print('Text for the Audio:\n')
print(res)


Text for the Audio:

the still smell of old buildings it takes heat to bring out the order I call dip restores health exist a sole please find with him Tales of pastor are my favourite is just for food is the hot cross bun


In [30]:
# Method 2: create a function to convert text from speech
def speech_to_text(file):
    samp=sr.AudioFile(file)
    with samp as source:
        audio=recog.record(samp)
    return recog.recognize_google(audio)


In [32]:
op_text=speech_to_text("harvard.wav")
print(op_text)

the still smell of old buildings it takes heat to bring out the order I call dip restores health exist a sole please find with him Tales of pastor are my favourite is just for food is the hot cross bun


In [33]:
# Preprocess the Text

from nltk.tokenize import word_tokenize
tokens=word_tokenize(op_text.lower())
tokens


['the',
 'still',
 'smell',
 'of',
 'old',
 'buildings',
 'it',
 'takes',
 'heat',
 'to',
 'bring',
 'out',
 'the',
 'order',
 'i',
 'call',
 'dip',
 'restores',
 'health',
 'exist',
 'a',
 'sole',
 'please',
 'find',
 'with',
 'him',
 'tales',
 'of',
 'pastor',
 'are',
 'my',
 'favourite',
 'is',
 'just',
 'for',
 'food',
 'is',
 'the',
 'hot',
 'cross',
 'bun']

In [36]:
feature_list=['food','health','cross bun','pastor','smell']
feature_list


['food', 'health', 'cross bun', 'pastor', 'smell']

In [37]:
# Identify which of the features are being talked of in the audio review file

review_features=[term for term in tokens if term in feature_list]
review_features

['smell', 'health', 'pastor', 'food']

In [38]:
review_features=list(set(review_features))
review_features

['food', 'health', 'pastor', 'smell']

### Recording the speech from our own system microphone

In [41]:
# Recording from mic

mic=sr.Microphone()
mic.list_microphone_names()

['Microsoft Sound Mapper - Input',
 'Microphone Array (Realtek(R) Au',
 'Microsoft Sound Mapper - Output',
 'Speaker / Headphone (Realtek(R)',
 'Primary Sound Capture Driver',
 'Microphone Array (Realtek(R) Audio)',
 'Primary Sound Driver',
 'Speaker / Headphone (Realtek(R) Audio)',
 'Speaker / Headphone (Realtek(R) Audio)',
 'Microphone Array (Realtek(R) Audio)',
 'Speakers (Realtek HD Audio output)',
 'Stereo Mix (Realtek HD Audio Stereo input)',
 'Microphone (Realtek HD Audio Mic input)']

In [40]:
pip install pyaudio

Collecting pyaudio
  Obtaining dependency information for pyaudio from https://files.pythonhosted.org/packages/82/d8/f043c854aad450a76e476b0cf9cda1956419e1dacf1062eb9df3c0055abe/PyAudio-0.2.14-cp311-cp311-win_amd64.whl.metadata
  Downloading PyAudio-0.2.14-cp311-cp311-win_amd64.whl.metadata (2.7 kB)
Downloading PyAudio-0.2.14-cp311-cp311-win_amd64.whl (164 kB)
   ---------------------------------------- 0.0/164.1 kB ? eta -:--:--
   --------- ----------------------------- 41.0/164.1 kB 991.0 kB/s eta 0:00:01
   ---------------------------------- ----- 143.4/164.1 kB 1.7 MB/s eta 0:00:01
   ---------------------------------------- 164.1/164.1 kB 1.6 MB/s eta 0:00:00
Installing collected packages: pyaudio
Successfully installed pyaudio-0.2.14
Note: you may need to restart the kernel to use updated packages.


In [44]:
with mic as source:
    audio=recog.listen(source)
recog.recognize_google(audio)

'food is amazing'

In [46]:
def get_review_features(review_text):
        feature_list=['food','health','cross bun','pastor','smell']
        tokens=word_tokenize(review_text.lower())
        review_features=[term for term in tokens if term in feature_list]
        review_features=list(set(review_features))
        return review_features

res=recog.recognize_google(audio)
get_review_features(res)


['food']

# 

# 3. Build a Text to Speech model.

In [49]:
# Google Text-to-Speech (gTTS) library
pip install gtts

Collecting gtts
  Obtaining dependency information for gtts from https://files.pythonhosted.org/packages/a7/ef/190f64a4edeb13165e3c598a08f06a2ae80cdae0aa208c96c20efdb7ad4b/gTTS-2.4.0-py3-none-any.whl.metadata
  Downloading gTTS-2.4.0-py3-none-any.whl.metadata (4.1 kB)
Downloading gTTS-2.4.0-py3-none-any.whl (29 kB)
Installing collected packages: gtts
Successfully installed gtts-2.4.0
Note: you may need to restart the kernel to use updated packages.


In [50]:
from gtts import gTTS
import os
def text_to_speech(text,language='en',filename='output.mp3'):
    tts=gTTS(text=text,lang=language,slow=False)
    tts.save(filename)
    os.system(f"start {filename}")

if __name__=="__main__":
    input_text=input('User text pl >>:')
    text_to_speech(input_text)

User text pl >>:pip install gtts,After running this command, you should be able to import and use the gTTS module in your Python script without encountering the ModuleNotFoundError


# 

# 4. Build a NLP Language model to detect the sentence/word error in the text corpus.

In [52]:
pip install spacy

Collecting spacy
  Obtaining dependency information for spacy from https://files.pythonhosted.org/packages/90/f0/0133b684e18932c7bf4075d94819746cee2c0329f2569db526b0fa1df1df/spacy-3.7.2-cp311-cp311-win_amd64.whl.metadata
  Downloading spacy-3.7.2-cp311-cp311-win_amd64.whl.metadata (26 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl (29 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Obtaining dependency information for spacy-loggers<2.0.0,>=1.0.0 from https://files.pythonhosted.org/packages/33/78/d1a1a026ef3af911159398c939b1509d5c36fe524c7b644f34a5146c4e16/spacy_loggers-1.0.5-py3-none-any.whl.metadata
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Obtaining dependency information for murmurhash<1.1.0,>=0.28.0 from https://files.pythonhosted.org/packages/71/46/af01a20ec368bd9cb49a1d2df15e3eca113bbf6952cc1f2a47f1c6801a7f/murmurhash-1.0.10-cp311

In [19]:
import spacy
from spacy.tokens import Doc

In [20]:
# Load a pre-trained spaCy language model and tagger
nlp = spacy.load("en_core_web_sm")

In [25]:
# Define a custom pipeline component for error detection
def detect_errors(doc):
    errors = []

    # Check for missing articles (a, an, the)
    for token in doc:
        if token.pos_ == "ADJ" and token.head.pos_ == "NOUN" and not token.head.text.lower() in ["a", "an", "the"]:
            errors.append({"text": token.text, "start": token.idx, "end": token.idx + len(token.text), "error_type": "Missing article"})

    # Check for subject-verb agreement
    for verb in doc:
        if verb.pos_ == "VERB" and verb.head.pos_ == "NOUN" and verb.number != verb.head.number:
            errors.append({"text": verb.text, "start": verb.idx, "end": verb.idx + len(verb.text), "error_type": "Subject-verb agreement error"})

    # Add errors to doc as a custom extension attribute
    doc._.errors = errors

    return doc


In [26]:
# Example text with intentional errors
text = "The cat are playing in the garden."
doc = nlp(text)