In [None]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
import string

In [None]:
# Download the NLTK stop-word corpus that text_preprocessing relies on.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Location of the SMS spam CSV. This is an absolute path (presumably a Colab
# upload location -- adjust it when running elsewhere).
data_path = '/content/sample_data/spam.csv'
# The file is decoded as latin-1 rather than pandas' default encoding.
sms_data = pd.read_csv(data_path, encoding='latin-1')

In [None]:
# Preview the first rows to inspect the raw column layout.
print(sms_data.head())

     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  


In [None]:
# Keep only the label (v1) and text (v2) columns and give them descriptive names.
sms_data = sms_data[['v1', 'v2']].rename(columns={'v1': 'label', 'v2': 'message'})

In [None]:
# Encode the text labels as integers: ham -> 0, spam -> 1.
sms_data['label'] = sms_data['label'].map({'ham': 0, 'spam': 1})

In [None]:
# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
# NLTK stop-word sets keyed by language, filled on first use.
_STOP_WORD_CACHE = {}


def _load_stop_words(language='english'):
    """Return NLTK's stop words for ``language`` as a frozenset, loading them once."""
    if language not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORD_CACHE[language]


def text_preprocessing(text, stop_words=None):
    """Normalize a message for bag-of-words vectorization.

    Steps, in order: lowercase the text, delete every character found in
    ``string.punctuation``, then drop whitespace-separated tokens that are
    stop words. Because punctuation is stripped first, a contraction such as
    "don't" is compared against the stop words as "dont".

    Args:
        text: Message to clean (a ``str``).
        stop_words: Optional collection of lowercase words to drop (a set is
            recommended for fast lookups). When omitted, NLTK's English
            stop-word list is used; it is loaded on the first call and reused
            afterwards.

    Returns:
        The remaining tokens joined by single spaces.
    """
    if stop_words is None:
        # Load the list once: fetching it for every token is slow, and a
        # frozenset gives constant-time membership tests.
        stop_words = _load_stop_words()

    text = text.lower().translate(_PUNCTUATION_TABLE)
    return ' '.join(word for word in text.split() if word not in stop_words)

In [None]:
# Store a cleaned version of every message in a new column.
sms_data['cleaned_message'] = sms_data['message'].apply(text_preprocessing)

In [None]:
# Turn the cleaned messages into a token-count feature matrix.
# NOTE(review): the vocabulary is fitted on every message, before the
# train/test split performed later, so words that occur only in test
# messages still shape the feature space.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(sms_data['cleaned_message'])

In [None]:
# Target vector: the integer-encoded labels.
y = sms_data['label']

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Report the shape of each split's feature matrix.
print(f'Training set shape: {X_train.shape}')
print(f'Test set shape: {X_test.shape}')

Training set shape: (4457, 9376)
Test set shape: (1115, 9376)


In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


# Create the multinomial Naive Bayes classifier and train it on the training
# split (fit() returns the estimator, so both steps chain).
model = MultinomialNB().fit(X_train, y_train)

# Predict labels for the held-out messages.
y_pred = model.predict(X_test)

# Score the predictions against the true test labels.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Each heading is followed by its value on the next line.
print(f'Accuracy: {accuracy}')
print('Classification Report:', report, sep='\n')
print('Confusion Matrix:', conf_matrix, sep='\n')


Accuracy: 0.9757847533632287
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       965
           1       0.91      0.91      0.91       150

    accuracy                           0.98      1115
   macro avg       0.95      0.95      0.95      1115
weighted avg       0.98      0.98      0.98      1115

Confusion Matrix:
[[951  14]
 [ 13 137]]


In [None]:

def classify_sms(message):
    """Classify a single SMS message as 'spam' or 'ham'.

    The message is cleaned with ``text_preprocessing``, encoded with the
    notebook-level ``vectorizer`` and scored by the notebook-level ``model``;
    both must already be fitted when this function is called.

    Args:
        message: Raw message text.

    Returns:
        'spam' if the model predicts label 1, otherwise 'ham'.
    """
    cleaned_message = text_preprocessing(message)
    # transform() takes a collection of documents, hence the one-element list.
    vectorized_message = vectorizer.transform([cleaned_message])
    # predict() yields one label per input row; pick the single label
    # explicitly instead of relying on the truthiness of an array comparison.
    predicted_label = model.predict(vectorized_message)[0]
    return 'spam' if predicted_label == 1 else 'ham'

# Try the classifier on a hand-written example message.
new_message = "Congratulations! You've won a free ticket to the Bahamas. Call now!"
print(f'New message: "{new_message}" is classified as: {classify_sms(new_message)}')


New message: "Congratulations! You've won a free ticket to the Bahamas. Call now!" is classified as: spam


In [5]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from nltk.corpus import stopwords
import string


# Download the NLTK stop-word corpus used by text_preprocessing.
nltk.download('stopwords')

# Read the raw SMS dataset (decoded as latin-1).
data_path = '/content/sample_data/spam.csv'
sms_data = pd.read_csv(data_path, encoding='latin-1')

# Keep only the label (v1) and text (v2) columns under descriptive names.
sms_data = sms_data[['v1', 'v2']].rename(columns={'v1': 'label', 'v2': 'message'})

# Encode the text labels as integers: ham -> 0, spam -> 1.
sms_data['label'] = sms_data['label'].map({'ham': 0, 'spam': 1})


# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
# NLTK stop-word sets keyed by language, filled on first use.
_STOP_WORD_CACHE = {}


def _load_stop_words(language='english'):
    """Return NLTK's stop words for ``language`` as a frozenset, loading them once."""
    if language not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORD_CACHE[language]


def text_preprocessing(text, stop_words=None):
    """Normalize a message for bag-of-words vectorization.

    Steps, in order: lowercase the text, delete every character found in
    ``string.punctuation``, then drop whitespace-separated tokens that are
    stop words. Because punctuation is stripped first, a contraction such as
    "don't" is compared against the stop words as "dont".

    Args:
        text: Message to clean (a ``str``).
        stop_words: Optional collection of lowercase words to drop (a set is
            recommended for fast lookups). When omitted, NLTK's English
            stop-word list is used; it is loaded on the first call and reused
            afterwards.

    Returns:
        The remaining tokens joined by single spaces.
    """
    if stop_words is None:
        # Load the list once: fetching it for every token is slow, and a
        # frozenset gives constant-time membership tests.
        stop_words = _load_stop_words()

    text = text.lower().translate(_PUNCTUATION_TABLE)
    return ' '.join(word for word in text.split() if word not in stop_words)


# Store a cleaned version of every message in a new column.
sms_data['cleaned_message'] = sms_data['message'].apply(text_preprocessing)

# Turn the cleaned messages into a token-count feature matrix.
# NOTE(review): the vocabulary is fitted on every message, before the split
# below, so words that occur only in test messages still shape the features.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(sms_data['cleaned_message'])

# Target vector: the integer-encoded labels.
y = sms_data['label']

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Create the classifier and train it (fit() returns the estimator, so both steps chain).
model = MultinomialNB().fit(X_train, y_train)

# Predict labels for the held-out messages.
y_pred = model.predict(X_test)

# Score the predictions against the true test labels.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Each heading is followed by its value on the next line.
print(f'Accuracy: {accuracy}')
print('Classification Report:', report, sep='\n')
print('Confusion Matrix:', conf_matrix, sep='\n')


def classify_sms(message):
    """Classify a single SMS message as 'spam' or 'ham'.

    The message is cleaned with ``text_preprocessing``, encoded with the
    notebook-level ``vectorizer`` and scored by the notebook-level ``model``;
    both must already be fitted when this function is called.

    Args:
        message: Raw message text.

    Returns:
        'spam' if the model predicts label 1, otherwise 'ham'.
    """
    cleaned_message = text_preprocessing(message)
    # transform() takes a collection of documents, hence the one-element list.
    vectorized_message = vectorizer.transform([cleaned_message])
    # predict() yields one label per input row; pick the single label
    # explicitly instead of relying on the truthiness of an array comparison.
    predicted_label = model.predict(vectorized_message)[0]
    return 'spam' if predicted_label == 1 else 'ham'

# Try the classifier on a hand-written example message.
new_message = "Ok lar... Joking wif u oni..."
print(f'New message: "{new_message}" is classified as: {classify_sms(new_message)}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Accuracy: 0.9757847533632287
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       965
           1       0.91      0.91      0.91       150

    accuracy                           0.98      1115
   macro avg       0.95      0.95      0.95      1115
weighted avg       0.98      0.98      0.98      1115

Confusion Matrix:
[[951  14]
 [ 13 137]]
New message: "Ok lar... Joking wif u oni..." is classified as: ham
