In [14]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Load the dataset from CSV
df = pd.read_excel('/content/drive/MyDrive/cricket.xlsx')  # Replace 'your_dataset.csv' with the actual path to your CSV file

# Preprocess the data
texts = df['bangla_text'].values
labels = df['sentiment_label'].values

# Split the dataset into training and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)


In [25]:
df

Unnamed: 0,bangla_text,category,sentiment_label
0,জয় বাংলা কাপ! তাও আবার স্বাধীনতার মাস মার্চে। ...,other,positive
1,জয় বাংলা কাপ! তাও আবার স্বাধীনতার মাস মার্চে। ...,team,positive
2,বাংলাদেশের পরে ভারতের সাপর্ট ই করি ?,team,positive
3,সৌম্যকে বাদ দেওয়া হোক,batting,negative
4,"প্রথমটি হচ্ছে, কোচ অত:পর সাকিব,সাকিব আর সাকিবর...",team,positive
...,...,...,...
2974,নির্বাচকমণ্ডলী দের গালে জুতা মারা উচিত.. হালা...,team management,negative
2975,"বস ভাল করবে ইটাই আশা করি,দারুন এরজন ব্যাটিং অল...",batting,negative
2976,"বস ভাল করবে ইটাই আশা করি,দারুন এরজন ব্যাটিং অল...",bowling,negative
2977,গুগল সার্চ করে মিনহাজের রানরেট দেখে নিন? এভারে...,team management,negative


In [15]:
# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts)

vocab_size = len(tokenizer.word_index) + 1

train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)


In [16]:
# Pad sequences
max_length = 100  # Specify the maximum sequence length
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding='post', truncating='post')
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding='post', truncating='post')


In [17]:
# Map sentiment labels to numerical values
label_mapping = {'negative': 0, 'positive': 1, 'neutral': 2}  # Update with your actual label mapping
train_labels = np.array([label_mapping[label] for label in train_labels])
test_labels = np.array([label_mapping[label] for label in test_labels])


In [18]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [19]:
# Define the LSTM model
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=max_length))
model.add(LSTM(64))
model.add(Dense(3, activation='softmax'))  # Adjust the number of units based on the number of sentiment classes

In [20]:
# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [21]:
# Train the model
model.fit(train_padded, train_labels, epochs=10, validation_data=(test_padded, test_labels))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fc25f077e50>

In [22]:
# Evaluate the model
loss, accuracy = model.evaluate(test_padded, test_labels)
print('Test Loss:', loss)
print('Test Accuracy:', accuracy)

Test Loss: 0.7489497661590576
Test Accuracy: 0.7265100479125977
