In [7]:

import requests
import zipfile
import io
import pandas as pd

# Load the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"

# Download the zip file from the URL
response = requests.get(url)
response.raise_for_status()  # Raise an exception for bad responses (4xx or 5xx)

# Read the zip file from the response content
with zipfile.ZipFile(io.BytesIO(response.content)) as zip_ref:
    # Extract the 'SMSSpamCollection' file
    with zip_ref.open('SMSSpamCollection') as file:
        # Read the CSV data into a DataFrame
        df = pd.read_csv(file, sep='\t', names=['label', 'message'])

# Preview the dataset
print(df.head())
print("\nDataset Info:")
print(df.info())

  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB
None


In [9]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download required NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab') # Download the punkt_tab data

# Preprocess text function
def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    return " ".join(lemmatized_tokens)

# Apply preprocessing
df['processed_message'] = df['message'].apply(preprocess_text)

# Preview preprocessed text
print(df[['message', 'processed_message']].head())

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


                                             message  \
0  Go until jurong point, crazy.. Available only ...   
1                      Ok lar... Joking wif u oni...   
2  Free entry in 2 a wkly comp to win FA Cup fina...   
3  U dun say so early hor... U c already then say...   
4  Nah I don't think he goes to usf, he lives aro...   

                                   processed_message  
0  go jurong point crazy available bugis n great ...  
1                            ok lar joking wif u oni  
2  free entry wkly comp win fa cup final tkts may...  
3                u dun say early hor u c already say  
4                nah think go usf life around though  


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Convert text data to TF-IDF features
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['processed_message'])
y = df['label']

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train the Naive Bayes model
model = MultinomialNB()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9665071770334929

Classification Report:
               precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1448
        spam       0.99      0.75      0.86       224

    accuracy                           0.97      1672
   macro avg       0.98      0.88      0.92      1672
weighted avg       0.97      0.97      0.96      1672



In [11]:
from nltk.sentiment import SentimentIntensityAnalyzer

# Initialize VADER
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

# Apply sentiment analysis
df['sentiment_score'] = df['message'].apply(lambda x: sia.polarity_scores(x)['compound'])

# Preview sentiment scores
print(df[['message', 'sentiment_score']].head())


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


                                             message  sentiment_score
0  Go until jurong point, crazy.. Available only ...           0.6249
1                      Ok lar... Joking wif u oni...           0.4767
2  Free entry in 2 a wkly comp to win FA Cup fina...           0.7964
3  U dun say so early hor... U c already then say...           0.0000
4  Nah I don't think he goes to usf, he lives aro...          -0.1027


In [12]:
import spacy

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# Apply NER on sample messages
for doc in df['message'].head(5):
    nlp_doc = nlp(doc)
    print(f"Message: {doc}")
    for ent in nlp_doc.ents:
        print(f" - {ent.text} ({ent.label_})")


Message: Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
 - jurong point (PERSON)
Message: Ok lar... Joking wif u oni...
Message: Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
 - 2 (CARDINAL)
 - FA Cup (EVENT)
 - May 2005 (DATE)
 - 87121 (CARDINAL)
 - rate)T&C (DATE)
 - 08452810075over18 (DATE)
Message: U dun say so early hor... U c already then say...
Message: Nah I don't think he goes to usf, he lives around here though
 - Nah (PERSON)
