In [None]:
#dataset merge
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Step 1: Load the raw dataset
# Let pandas show up to 100 characters per column so review text stays readable.
pd.options.display.max_colwidth = 100

# NOTE(review): absolute, machine-specific path; adjust it when running elsewhere.
# The file is decoded as latin-1. The text and label columns are pulled out in a later cell.
data = pd.read_csv(
    r'D:\pythonana\suggestiondataset.csv',
    encoding='latin-1',
)


In [None]:
from bs4 import BeautifulSoup
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

# English stop words used by review_to_words. The NLTK corpus is fetched on
# first use, so the notebook also runs on a machine where it has not been
# downloaded yet (stopwords.words raises LookupError when the corpus is missing).
try:
    stops = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stops = set(stopwords.words('english'))

# Snowball stemmer for English, shared by review_to_words.
stemmer = SnowballStemmer('english')

def review_to_words(raw_review):
    """Normalise one raw review into a space-joined string of stemmed words.

    Processing steps: strip HTML markup, replace every character that is not
    an ASCII letter with a space, lower-case and split on whitespace, drop the
    words found in ``stops``, and stem the remaining words with ``stemmer``.

    Args:
        raw_review: The review text to clean. Values that cannot be parsed
            (presumably non-string cells such as NaN; verify against the
            dataset) fall back to the placeholder below.

    Returns:
        The cleaned text as a single string, or the placeholder ``"i"`` when
        processing raises an exception.
    """
    try:
        # 1. Remove HTML markup
        review_text = BeautifulSoup(raw_review, 'html.parser').get_text()
        # 2. Replace everything that is not a letter with a space
        letters_only = re.sub('[^a-zA-Z]', ' ', review_text)
        # 3. Lower-case and split into words
        words = letters_only.lower().split()
        # 4. Drop stop words
        meaningful_words = [w for w in words if w not in stops]
        # 5. Stem each remaining word
        stemming_words = [stemmer.stem(w) for w in meaningful_words]
        # 6. Join the words back into one space-separated string
        return ' '.join(stemming_words)
    except Exception:
        # Best-effort fallback: unparseable input becomes the placeholder "i".
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so a long .apply() can still be interrupted.
        return "i"

In [None]:
# Clean every review with review_to_words and store the result back in the
# 'Review' column.
# NOTE(review): this overwrites the raw text, so re-running the cell feeds
# already-cleaned text through the cleaner again; reload `data` first if the
# original reviews are needed.
data['Review'] = data['Review'].apply(review_to_words)

In [None]:
# Pull the cleaned reviews and their labels out of the frame as plain lists.
labels = data['Suggestion/Not'].tolist()
texts = data['Review'].tolist()

# Step 2: Map the label values to integer class ids
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Step 3: Build the vocabulary and turn each review into a sequence of word ids
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
# One extra slot on top of the vocabulary: Keras word indices start at 1,
# index 0 is reserved.
vocab_size = 1 + len(tokenizer.word_index)

# Step 4: Pad every sequence to the length of the longest review
max_len = max(map(len, sequences))
X = pad_sequences(sequences, maxlen=max_len)

# Step 5: Hold out 20% of the samples for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    encoded_labels,
    test_size=0.2,
    random_state=42,
)

# Step 6: Build the CNN model
# Hyper-parameters of the embedding and convolution layers.
embedding_dim = 100
filters = 64
kernel_size = 5

# Embedding -> 1D convolution -> global max pooling -> dense classifier head.
# The output layer has one softmax unit per class known to the label encoder.
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_len),
    Conv1D(filters, kernel_size, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(len(label_encoder.classes_), activation='softmax'),
])

# Sparse categorical cross-entropy matches the integer-encoded labels.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

# Step 7: Train the model
# NOTE(review): the test split is also passed as validation data, so the
# accuracy reported below is measured on data that was monitored during training.
batch_size = 32
epochs = 10
model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_test, y_test),
)

# Step 8: Evaluate the model - take the most probable class for each test sample
y_pred = model.predict(X_test).argmax(axis=-1)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Step 9: Turn the predicted class ids back into the original label values
decoded_labels = label_encoder.inverse_transform(y_pred)


In [None]:
# Classify a single new sentence with the freshly trained model.
incoming_text = "It would be great when we get it fastly"
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Show the raw input text
print(incoming_text)

# The tokenizer was fitted on the output of review_to_words (stop words
# removed, words stemmed), so the incoming text must go through the same
# cleaning; otherwise most of its words would not match the vocabulary.
incoming_sequence = tokenizer.texts_to_sequences([review_to_words(incoming_text)])
incoming_padded_sequence = pad_sequences(incoming_sequence, maxlen=max_len)

# Predict the class id with the highest probability for the incoming text
predicted_label = np.argmax(model.predict(incoming_padded_sequence), axis=-1)

# Decode the class id back to the original label value
decoded_label = label_encoder.inverse_transform(predicted_label)[0]

print("Predicted Label:", decoded_label)

In [None]:
# Step 10 (setup): import TensorFlow as `tf` and set AutoGraph verbosity to 0.
# NOTE(review): nothing is saved in this cell; the model is saved in the next cell.
import tensorflow as tf
tf.autograph.set_verbosity(0)
# NOTE(review): the imports below repeat the ones from the first cell.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import tensorflow as tf

# Set AutoGraph verbosity to 0 to suppress its warning output.
# NOTE(review): this repeats the call made right after the first tensorflow import.
tf.autograph.set_verbosity(0)

# No further code in this cell.

# The model itself is saved in the following cell.




In [None]:
# Save the trained model, then load it back from the same file.
# The relative path means the file is written to the current working directory.
model.save("suggestionmodel.h5")
# Reload the saved file into `loaded_model` (uses `tf` imported in the previous cell).
loaded_model = tf.keras.models.load_model("suggestionmodel.h5")



In [None]:
# Persist the fitted tokenizer so the same vocabulary can be reused at prediction time
import pickle

# Written to the current working directory as a binary pickle file.
with open('suggestiontokenizer.pkl', mode='wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)


In [None]:
# Persist the fitted label encoder so predicted class ids can be decoded later
with open('suggestion_label_encoder.pkl', mode='wb') as encoder_file:
    pickle.dump(label_encoder, encoder_file)


In [None]:
# Predict with the saved model, tokenizer and label encoder loaded from disk.
# numpy is imported here as well because np.argmax is used below and this cell
# otherwise brings in its own dependencies.
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

# Load the saved model. The path is a raw string, like the other paths in this
# cell, so the backslashes are not parsed as (invalid) escape sequences.
loaded_model = tf.keras.models.load_model(r"D:\pythonana\suggestionmodel.h5")

# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load pickle files that were produced by this notebook.

# Load the tokenizer
with open(r"D:\pythonana\suggestiontokenizer.pkl", 'rb') as f:
    tokenizer = pickle.load(f)

# Load the label_encoder
with open(r"D:\pythonana\suggestion_label_encoder.pkl", 'rb') as f:
    label_encoder = pickle.load(f)

# Define your new text data
new_texts = ['It"s super cool when the product has been arrived fast', 'It should ok when it is high quality', 'not suggestion']

# Clean the new texts exactly like the training reviews (review_to_words is
# defined in an earlier cell): the tokenizer vocabulary consists of stemmed,
# stop-word-free words, so raw text would mostly fail to match it.
cleaned_texts = [review_to_words(text) for text in new_texts]

# Tokenize the cleaned text and pad to the sequence length the model expects
sequences = tokenizer.texts_to_sequences(cleaned_texts)
max_len = loaded_model.input_shape[1]
padded_sequences = pad_sequences(sequences, maxlen=max_len)

# Make predictions on the new data
predictions = loaded_model.predict(padded_sequences)

# Decode the most probable class id of each text back to its label value
decoded_predictions = label_encoder.inverse_transform(np.argmax(predictions, axis=-1))

# Print each original text next to its predicted label
for text, prediction in zip(new_texts, decoded_predictions):
    print(f'Text: {text}')
    print(f'Prediction: {prediction}')
    print()