# Text Classification 

- Email (Spam/Ham)

In [3]:
# %pip install pandas
# %pip install scikit-learn 

### Step 1: Import Necessary Libraries

In [4]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
import warnings
warnings.filterwarnings('ignore')

### Step 2: Load the Dataset

In [5]:
data = pd.read_csv("spam.csv")
data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [6]:
data["Message"][5567]

'This is the 2nd time we have tried 2 contact u. U have won the £750 Pound prize. 2 claim is easy, call 087187272008 NOW1! Only 10p per minute. BT-national-rate.'

### Step 3: Data Preprocessing

In [7]:
# Download and prepare stop words
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# # Tokenization and text cleaning
# data['Message'] = data['Message'].apply(lambda x: ' '.join(word.lower() for word in word_tokenize(x) if word.isalpha()))

# # Stop words removal
# data['Message'] = data['Message'].apply(lambda x: ' '.join(word for word in x.split() if word not in stop_words))



# Tokenization, text cleaning, and punctuation removal
def preprocess_text(text):
    text = ' '.join(word.lower() for word in word_tokenize(text) if word.isalpha())
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

# Apply preprocessing to the 'Message' column
data['Message'] = data['Message'].apply(preprocess_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\henil\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
data['Message'][0]

'go jurong point crazy available bugis n great world la e buffet cine got amore wat'

### Step 4: Feature Extraction

In [9]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['Message'])

### Step 5: Split the Data into Training and Testing Sets

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, data['Category'], test_size=0.2, random_state=42)

### Step 6: Build and Train the Model

In [11]:
model = MultinomialNB()
model.fit(X_train, y_train)

### Step 7: Model Evaluation

In [12]:
y_pred = model.predict(X_test)

### Step 8: Print the Results

In [13]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.9713004484304932
              precision    recall  f1-score   support

         ham       0.97      1.00      0.98       966
        spam       1.00      0.79      0.88       149

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115



In [14]:
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout
# Create the deep learning model
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],)))  # Input layer
model.add(Dropout(0.2))  # Dropout layer for regularization
model.add(Dense(64, activation='relu'))  # Hidden layer
model.add(Dropout(0.2))  # Dropout layer for regularization
model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification

# Compile the model
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train.toarray(), y_train.map({'ham': 0, 'spam': 1}), epochs=10, batch_size=8, validation_split=0.2)


Epoch 1/10
[1m446/446[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 9ms/step - accuracy: 0.8781 - loss: 0.3574 - val_accuracy: 0.9529 - val_loss: 0.1626
Epoch 2/10
[1m446/446[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.9763 - loss: 0.0814 - val_accuracy: 0.9675 - val_loss: 0.1257
Epoch 3/10
[1m446/446[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.9840 - loss: 0.0470 - val_accuracy: 0.9697 - val_loss: 0.1197
Epoch 4/10
[1m446/446[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.9831 - loss: 0.0433 - val_accuracy: 0.9731 - val_loss: 0.1212
Epoch 5/10
[1m446/446[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.9917 - loss: 0.0205 - val_accuracy: 0.9697 - val_loss: 0.1209
Epoch 6/10
[1m446/446[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - accuracy: 0.9925 - loss: 0.0238 - val_accuracy: 0.9787 - val_loss: 0.1461
Epoch 7/10
[1m446/446[0m 

In [15]:

# Make predictions
y_pred = model.predict(X_test.toarray())
y_pred_classes = (y_pred > 0.5).astype(int).flatten()  # Convert probabilities to binary class labels

# Print accuracy and classification report
print("Accuracy:", accuracy_score(y_test.map({'ham': 0, 'spam': 1}), y_pred_classes))
print(classification_report(y_test.map({'ham': 0, 'spam': 1}), y_pred_classes))

[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
Accuracy: 0.9775784753363229
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       0.97      0.86      0.91       149

    accuracy                           0.98      1115
   macro avg       0.97      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [16]:
# Save the model
model.save('sentiment_analysis_model.h5')

# Load the model
loaded_model = load_model('sentiment_analysis_model.h5')



In [17]:
# Function to predict sentiment on custom input
def predict_sentiment(sentence):
    # Preprocess the custom sentence
    sentence_cleaned = preprocess_text(sentence)
    
    # Transform using the same vectorizer
    sentence_tfidf = vectorizer.transform([sentence_cleaned]).toarray()
    
    # Make prediction
    prediction = loaded_model.predict(sentence_tfidf)
    print(prediction)
    sentiment = 'ham' if prediction[0][0] < 0.8 else 'spam'
    return sentiment

# Custom input for prediction
custom_input = "You have won a free ticket to the concert!"
custom_input = 'This is the 2nd time we have tried 2 contact u. U have won the £750 Pound prize. 2 claim is easy, call 087187272008 NOW1! Only 10p per minute. BT-national-rate.'

predicted_sentiment = predict_sentiment(custom_input)
print(f"Predicted Sentiment: {predicted_sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
[[0.99999046]]
Predicted Sentiment: spam
