In [1]:
!pip install numpy pandas tensorflow nltk scikit-learn



In [2]:
import pandas as pd

# Load the dataset to examine its structure
file_path = '/content/train.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset to understand its structure
data.head()

Unnamed: 0,sms,label
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...\n,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [3]:
import nltk
# Explicitly download all necessary NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')  # Additional download for resolving the error

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [4]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from imblearn.over_sampling import SMOTE  # For handling imbalanced data

# Ensure NLTK resources are downloaded
nltk.download('stopwords')
nltk.download('punkt')

# Load the dataset
file_path = "train.csv"  # Update with your file path
data = pd.read_csv(file_path)

# Data Overview
print("Dataset Preview:")
print(data.head())

# Ensure labels are binary
if not pd.api.types.is_numeric_dtype(data['label']):
    data['label'] = data['label'].map({'spam': 1, 'ham': 0})  # Adjust based on dataset

# Preprocessing function
def preprocess_text(text):
    stop_words = set(stopwords.words('english')) - {'won', 'gift', 'call', 'free', 'prize'}  # Allow important words
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = text.lower()             # Convert to lowercase
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'\d+', '', text)   # Remove numbers
    return ' '.join(word for word in text.split() if word not in stop_words)

data['sms'] = data['sms'].apply(preprocess_text)

# Tokenization and padding
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['sms'])
X = tokenizer.texts_to_sequences(data['sms'])
max_len = 50  # Maximum sequence length
X = pad_sequences(X, maxlen=max_len)

# Labels
y = data['label'].values

# Handle imbalanced data
smote = SMOTE(random_state=42)
X, y = smote.fit_resample(X, y)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the LSTM model
vocab_size = len(tokenizer.word_index) + 1  # Vocabulary size
embedding_dim = 128  # Increased embedding dimension for better learning

model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_len),
    SpatialDropout1D(0.3),
    LSTM(150, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
print("Training the model...")
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test), verbose=1)

# Evaluate the model
print("Evaluating the model...")
y_pred = (model.predict(X_test) > 0.5).astype("int32")
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"\nAccuracy: {accuracy}")
print("\nClassification Report:\n", report)

# Test with a sample input
def predict_spam(message):
    processed = preprocess_text(message)
    tokenized = tokenizer.texts_to_sequences([processed])
    padded = pad_sequences(tokenized, maxlen=max_len)
    prediction = model.predict(padded, verbose=0)
    return "Spam" if prediction > 0.5 else "Not spam"

# Example Prediction
example_sms = "Congratulations! You've won a $1000 Walmart gift card. Call now!"
print("\nExample Prediction:")
print(f"Message: {example_sms}")
print(f"Prediction: {predict_spam(example_sms)}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Dataset Preview:
                                                 sms  label
0  Go until jurong point, crazy.. Available only ...      0
1                    Ok lar... Joking wif u oni...\n      0
2  Free entry in 2 a wkly comp to win FA Cup fina...      1
3  U dun say so early hor... U c already then say...      0
4  Nah I don't think he goes to usf, he lives aro...      0




Training the model...
Epoch 1/10
[1m242/242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 201ms/step - accuracy: 0.8140 - loss: 0.3880 - val_accuracy: 0.9052 - val_loss: 0.2322
Epoch 2/10
[1m242/242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 211ms/step - accuracy: 0.9431 - loss: 0.1615 - val_accuracy: 0.9047 - val_loss: 0.2627
Epoch 3/10
[1m242/242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 197ms/step - accuracy: 0.9680 - loss: 0.1031 - val_accuracy: 0.9083 - val_loss: 0.2698
Epoch 4/10
[1m242/242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 196ms/step - accuracy: 0.9801 - loss: 0.0733 - val_accuracy: 0.9083 - val_loss: 0.3723
Epoch 5/10
[1m242/242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 195ms/step - accuracy: 0.9843 - loss: 0.0546 - val_accuracy: 0.8990 - val_loss: 0.4223
Epoch 6/10
[1m242/242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 199ms/step - accuracy: 0.9856 - loss: 0.0488 - val_accuracy: 0.8933 - val_l

In [5]:
print(predict_spam("Congratulations! You've won a free trip to Paris!"))


Spam


In [6]:
print(predict_spam("Hey, let's catch up tomorrow!"))

Not spam
