<a href="https://colab.research.google.com/github/soheldatta17/Error-Detection-English-Grammar/blob/main/Error_Detection_English_Grammar.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tensorflow pandas numpy scikit-learn


In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout
from tensorflow.keras.utils import to_categorical

# Load the dataset
data = pd.read_csv('/content/misspelled.csv')

# Drop the index column if not needed
data = data.drop(columns=['Unnamed: 0'])

# Use rows 0 to 600 (index starts from 0, so it's 0 to 599)
data_subset = data.iloc[0:600]

# Handle missing values: fill missing values with an empty string
data_subset['input'] = data_subset['input'].fillna('')

# Tokenize the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data_subset['input'])
X = tokenizer.texts_to_sequences(data_subset['input'])
X = pad_sequences(X)

# Encode the labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data_subset['label'])
y = to_categorical(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Neural Network model with enhanced architecture
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=256))  # Increased embedding dimension
model.add(LSTM(256, return_sequences=True, dropout=0.2, recurrent_dropout=0.2))  # Increased LSTM units
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))  # Increased LSTM units
model.add(Dense(128, activation='relu'))  # Added dense layer
model.add(Dropout(0.5))  # Dropout layer for regularization
model.add(Dense(len(label_encoder.classes_), activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model for 50 epochs to ensure thorough training
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.1)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Accuracy: {accuracy:.4f}')

def correct_spelling(misspelled_word):
    # Vectorize the input word
    word_sequence = tokenizer.texts_to_sequences([misspelled_word])
    word_sequence = pad_sequences(word_sequence, maxlen=X.shape[1])

    # Predict the correct word
    predicted_label = model.predict(word_sequence)
    correct_word = label_encoder.inverse_transform([np.argmax(predicted_label)])

    return correct_word[0]



Epoch 1/50


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_subset['input'] = data_subset['input'].fillna('')


[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 64ms/step - accuracy: 0.0000e+00 - loss: 6.2074 - val_accuracy: 0.0000e+00 - val_loss: 6.2128
Epoch 2/50
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.0082 - loss: 6.2019 - val_accuracy: 0.0000e+00 - val_loss: 6.2242
Epoch 3/50
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.0038 - loss: 6.1951 - val_accuracy: 0.0000e+00 - val_loss: 6.2417
Epoch 4/50
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 25ms/step - accuracy: 0.0061 - loss: 6.1823 - val_accuracy: 0.0000e+00 - val_loss: 6.2845
Epoch 5/50
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 32ms/step - accuracy: 0.0013 - loss: 6.1530 - val_accuracy: 0.0000e+00 - val_loss: 6.4588
Epoch 6/50
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 32ms/step - accuracy: 0.0052 - loss: 6.0258 - val_accuracy: 0.0000e+00 - val_loss: 7.2531
Epoch 7/50
[1m14

In [15]:
misspelled_words = [
    'accomodate',
    'acheive',
    'arguement',
    'benefical',
    'beleive',
    'buisness',
    'calender',
    'catergory',
    'cemetary'
]

for i in misspelled_words:
  print(f'Misspelled: {i}')
  print(f'Corrected: {correct_spelling(i)}')
  print()


Misspelled: accomodate
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
Corrected: accommodate

Misspelled: acheive
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
Corrected: achieve

Misspelled: arguement
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
Corrected: bankruptcy

Misspelled: benefical
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
Corrected: bankruptcy

Misspelled: beleive
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
Corrected: believe

Misspelled: buisness
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
Corrected: business

Misspelled: calender
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
Corrected: calendar

Misspelled: catergory
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
Corrected: bankruptcy

Misspelled: cemetary
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s