### Text Classification 

In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

In [2]:
data = pd.read_csv("spam.csv")

###  Data Preprocessing

In [3]:
# Download and prepare stop words
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = ' '.join(word.lower() for word in word_tokenize(text) if word.isalpha())
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

data['Message'] = data['Message'].apply(preprocess_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\RAJVI\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Download and prepare stop words
#nltk.download('stopwords')
#stop_words = set(stopwords.words('english'))

# Tokenization and text cleaning
#data['Message'] = data['Message'].apply(lambda x: ' '.join(word.lower() for word in word_tokenize(x) if word.isalpha()))

# Stop words removal
#data['Message'] = data['Message'].apply(lambda x: ' '.join(word for word in x.split() if word not in stop_words))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\RAJVI\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
data['Message'][0]

'go jurong point crazy available bugis n great world la e buffet cine got amore wat'

###  Feature Extraction

In [5]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['Message'])

###  Split the Data into Training and Testing Sets

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, data['Category'], test_size=0.2, random_state=42)

###  Build and Train the Model

In [7]:
model = MultinomialNB()
model.fit(X_train, y_train)

###  Model Evaluation

In [8]:
y_pred = model.predict(X_test)

###  Print the Results

In [9]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.9713004484304932
              precision    recall  f1-score   support

         ham       0.97      1.00      0.98       966
        spam       1.00      0.79      0.88       149

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115



In [11]:
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout
# Create the deep learning model
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],)))  # Input layer
model.add(Dropout(0.2))  # Dropout layer for regularization
model.add(Dense(64, activation='relu'))  # Hidden layer
model.add(Dropout(0.2))  # Dropout layer for regularization
model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification

# Compile the model
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train.toarray(), y_train.map({'ham': 0, 'spam': 1}), epochs=10, batch_size=8, validation_split=0.2)


Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m446/446[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.8534 - loss: 0.3945 - val_accuracy: 0.9540 - val_loss: 0.1481
Epoch 2/10
[1m446/446[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9719 - loss: 0.0919 - val_accuracy: 0.9652 - val_loss: 0.1290
Epoch 3/10
[1m446/446[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9804 - loss: 0.0626 - val_accuracy: 0.9697 - val_loss: 0.1258
Epoch 4/10
[1m446/446[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9889 - loss: 0.0366 - val_accuracy: 0.9709 - val_loss: 0.1202
Epoch 5/10
[1m446/446[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9909 - loss: 0.0259 - val_accuracy: 0.9742 - val_loss: 0.1324
Epoch 6/10
[1m446/446[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.9923 - loss: 0.0244 - val_accuracy: 0.9720 - val_loss: 0.1269
Epoch 7/10
[1m446/446[0m [32m━━━━━━━

In [12]:

# Make predictions
y_pred = model.predict(X_test.toarray())
y_pred_classes = (y_pred > 0.5).astype(int).flatten()  

# Print accuracy and classification report
print("Accuracy:", accuracy_score(y_test.map({'ham': 0, 'spam': 1}), y_pred_classes))
print(classification_report(y_test.map({'ham': 0, 'spam': 1}), y_pred_classes))

[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Accuracy: 0.9775784753363229
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       966
           1       0.95      0.88      0.91       149

    accuracy                           0.98      1115
   macro avg       0.97      0.94      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [13]:
# Save the model
model.save('sentiment_analysis_model.h5')

# Load the model
loaded_model = load_model('sentiment_analysis_model.h5')



In [14]:
# Function to predict sentiment on custom input
def predict_sentiment(sentence):
    # Preprocess the custom sentence
    sentence_cleaned = preprocess_text(sentence)
    
    # Transform using the same vectorizer
    sentence_tfidf = vectorizer.transform([sentence_cleaned]).toarray()
    
    # Make prediction
    prediction = loaded_model.predict(sentence_tfidf)
    print(prediction)
    sentiment = 'ham' if prediction[0][0] < 0.8 else 'spam'
    return sentiment

# Custom input for prediction
custom_input = "You have won a free ticket to the concert!"
custom_input = 'This is the 2nd time we have tried 2 contact u. U have won the £750 Pound prize. 2 claim is easy, call 087187272008 NOW1! Only 10p per minute. BT-national-rate.'

predicted_sentiment = predict_sentiment(custom_input)
print(f"Predicted Sentiment: {predicted_sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[[0.99998844]]
Predicted Sentiment: spam
