# Text Classification 

- Email (Spam/Ham)

### Step 1: Import Necessary Libraries

In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

### Step 2: Load the Dataset

In [3]:
data = pd.read_csv('../Dataset/spam.csv')
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
data['Message'][:10]

0    go jurong point crazy available bugis n great ...
1                              ok lar joking wif u oni
2    free entry wkly comp win fa cup final tkts may...
3                  u dun say early hor u c already say
4               nah think goes usf lives around though
5    freemsg hey darling week word back like fun st...
6       even brother like speak treat like aids patent
7    per request melle oru minnaminunginte nurungu ...
8    winner valued network customer selected receiv...
9    mobile months u r entitled update latest colou...
Name: Message, dtype: object

### Step 3: Data Preprocessing

In [12]:
# Download and prepare stop words
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Tokenization and text cleaning
data['Message'] = data['Message'].apply(lambda x: ' '.join(word.lower() for word in word_tokenize(x) if word.isalpha()))

# Stop words removal
data['Message'] = data['Message'].apply(lambda x: ' '.join(word for word in x.split() if word not in stop_words))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ravia\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Step 4: Feature Extraction

In [5]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['Message'])

### Step 5: Split the Data into Training and Testing Sets

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, data['Category'], test_size=0.2, random_state=42)

### Step 6: Build and Train the Model

In [7]:
model = MultinomialNB()
model.fit(X_train, y_train)

### Step 7: Model Evaluation

In [8]:
y_pred = model.predict(X_test)

### Step 8: Print the Results

In [9]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.9713004484304932
              precision    recall  f1-score   support

         ham       0.97      1.00      0.98       966
        spam       1.00      0.79      0.88       149

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115



In [14]:
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout

# Create the deep learning model
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],)))  # Input layer
model.add(Dropout(0.2))  # Dropout layer for regularization
model.add(Dense(64, activation='relu'))  # Hidden layer
model.add(Dropout(0.2))  # Dropout layer for regularization
model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train.toarray(), y_train.map({'ham': 0, 'spam': 1}), epochs=10, batch_size=8, validation_split=0.2)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


ValueError: object __array__ method not producing an array