In [17]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
# 1. Read the tab-separated SMS corpus into a two-column DataFrame.
# The file has no header row, so the column names are supplied explicitly.
# NOTE(review): read_csv uses its default quote handling here; if a message
# contains an unbalanced double quote, adjacent rows could be merged - confirm
# the resulting row count against the dataset's documented size.
file_path = "C:\\Users\\Dell\\Desktop\\New folder\\SMSSpamCollection"
df = pd.read_csv(
    file_path,
    sep='\t',
    header=None,
    names=['label', 'message'],
)

In [18]:
# Summarise the frame: index range, columns, non-null counts, dtypes, memory.
# DataFrame.info() writes this report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB
None


In [19]:
# 4. Count the null entries in each column and report them.
# This cell only reports the counts; nothing is imputed or dropped here.
missing_values = df.isna().sum()
print("Missing values:", missing_values, sep="\n")


Missing values:
label      0
message    0
dtype: int64


In [20]:
# 5. Count fully duplicated rows, then build a de-duplicated copy.
# drop_duplicates() returns a new frame, so `df` itself is left untouched.
# NOTE(review): the later cells visible in this notebook keep reading `df`,
# not `df_cleaned` - confirm whether the duplicates are meant to stay in.
duplicate_mask = df.duplicated()
duplicates = duplicate_mask.sum()
df_cleaned = df.drop_duplicates()
print(f"Number of duplicate rows: {duplicates}")
print(f"Dataset after removing duplicates: {df_cleaned.shape}")

Number of duplicate rows: 403
Dataset after removing duplicates: (5169, 2)


In [21]:
# 6. Tally how many messages carry each class label (e.g., 'ham' vs. 'spam').
# Counts are taken from `df`, i.e. the frame as loaded.
label_column = df['label']
label_distribution = label_column.value_counts()
print("Label distribution:", label_distribution, sep="\n")

Label distribution:
label
ham     4825
spam     747
Name: count, dtype: int64


In [22]:
# 7. Analyze dataset imbalance
# The raw counts alone just repeat the label-distribution printout, so this
# cell also reports each class's share of the data and the ratio between the
# largest and smallest class. No resampling/balancing is performed here.
imbalance_analysis = label_distribution
print("Imbalance analysis:")
print(imbalance_analysis)

# Fraction of all messages that fall in each class.
class_share = imbalance_analysis / imbalance_analysis.sum()
# How many majority-class messages exist per minority-class message.
imbalance_ratio = imbalance_analysis.max() / imbalance_analysis.min()
print("Class share:")
print(class_share.round(4))
print(f"Majority-to-minority ratio: {imbalance_ratio:.2f}")


Imbalance analysis:
label
ham     4825
spam     747
Name: count, dtype: int64


In [23]:
# 8. Find the ten highest-count words across every message.
# English stop words are excluded and max_features=10 caps the vocabulary.
# The names come back in vocabulary order (alphabetical in the recorded
# output), not ranked by count.
vectorizer = CountVectorizer(
    stop_words='english',
    max_features=10,
)
X = vectorizer.fit_transform(df['message'])
frequent_words = vectorizer.get_feature_names_out()
print(
    "Most frequently occurring words in the entire dataset:",
    frequent_words,
    sep="\n",
)

Most frequently occurring words in the entire dataset:
['free' 'good' 'gt' 'just' 'know' 'like' 'll' 'lt' 'ok' 'ur']


In [24]:
# 9. Find the ten highest-count words among messages labelled 'spam'.
# A separate vectorizer is fitted so the vocabulary reflects spam text only.
is_spam = df['label'] == 'spam'
spam_messages = df.loc[is_spam, 'message']
spam_vectorizer = CountVectorizer(
    stop_words='english',
    max_features=10,
)
spam_X = spam_vectorizer.fit_transform(spam_messages)
spam_frequent_words = spam_vectorizer.get_feature_names_out()
print("Most common words in 'spam' messages:", spam_frequent_words, sep="\n")

Most common words in 'spam' messages:
['claim' 'free' 'mobile' 'prize' 'reply' 'stop' 'text' 'txt' 'ur' 'www']


In [25]:
# 10. Find the ten highest-count words among messages labelled 'ham'.
# Mirrors the spam cell: a dedicated vectorizer fitted on ham text only.
ham_messages = df['message'][df['label'] == 'ham']
ham_vectorizer = CountVectorizer(
    max_features=10,
    stop_words='english',
)
ham_X = ham_vectorizer.fit_transform(ham_messages)
ham_frequent_words = ham_vectorizer.get_feature_names_out()
print("Most common words in 'ham' messages:", ham_frequent_words, sep="\n")

Most common words in 'ham' messages:
['good' 'got' 'gt' 'just' 'know' 'like' 'll' 'lt' 'ok' 'ur']


In [26]:
# Fetch the NLTK corpora used by the preprocessing cells below:
# 'stopwords' for stop-word removal and 'wordnet' for the WordNet lemmatizer.
nltk.download('stopwords')
# Left as the last bare expression, so its return value is what the cell displays.
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...


True

In [31]:
# 1. Load the dataset afresh for the preprocessing steps.
# Re-reading the file rebinds `df`, so the cells below (which overwrite the
# 'message' column) start from the text exactly as stored on disk.
file_path = "C:\\Users\\Dell\\Desktop\\New folder\\SMSSpamCollection"
read_options = dict(sep='\t', header=None, names=['label', 'message'])
df = pd.read_csv(file_path, **read_options)

In [32]:
# 2. Normalise case: lowercase every message (overwrites the 'message' column).
lowercased = df['message'].str.lower()
df['message'] = lowercased

In [33]:
# 3. Strip punctuation, special characters and digits.
# Every character that is neither an ASCII letter nor whitespace is deleted
# outright (replaced by the empty string, not by a space).
non_letter_pattern = re.compile(r'[^a-zA-Z\s]')
df['message'] = df['message'].apply(
    lambda text: non_letter_pattern.sub('', text)
)

In [34]:
# 4. Tokenize each message by splitting on runs of whitespace.
# After this cell the 'message' column holds lists of words instead of strings.
df['message'] = df['message'].apply(str.split)

In [35]:
# 5. Drop English stop words from every token list.
# The stop list is held in a set so each membership test is a hash lookup.
# NOTE(review): apostrophes were already deleted by the cleaning cell, so
# contractions appear as e.g. 'dont' (visible in the recorded df.head()
# output) and will not match any apostrophised stop-list entries - confirm
# whether that is intended.
stop_words = set(stopwords.words('english'))


def _without_stop_words(tokens):
    """Return the tokens that are not in ``stop_words``, preserving order."""
    return [token for token in tokens if token not in stop_words]


df['message'] = df['message'].apply(_without_stop_words)

In [36]:
# 6a. Porter-stem every token; results go to a new 'message_stemmed' column,
# leaving the token lists in 'message' unchanged.
stemmer = PorterStemmer()
df['message_stemmed'] = df['message'].apply(
    lambda tokens: list(map(stemmer.stem, tokens))
)

In [37]:
# 6b. Lemmatize every token with WordNet; results go to a new
# 'message_lemmatized' column alongside the stemmed variant.
lemmatizer = WordNetLemmatizer()


def _lemmatize_tokens(tokens):
    """Return the list of lemmas for ``tokens`` using the lemmatizer's defaults."""
    return [lemmatizer.lemmatize(token) for token in tokens]


df['message_lemmatized'] = df['message'].apply(_lemmatize_tokens)


In [38]:
# 7. Turn the lemmatized text into a TF-IDF matrix.
# TfidfVectorizer is fed plain strings here, so each token list is first
# glued back into one space-separated string per message.
df['message_joined'] = df['message_lemmatized'].apply(' '.join)
# Cap the vocabulary at 1000 features.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X = tfidf_vectorizer.fit_transform(df['message_joined'])

In [39]:
# Inspect the outcome: the first rows of the processed frame, followed by the
# vocabulary the TF-IDF vectorizer learned.
preview = df.head()
feature_names = tfidf_vectorizer.get_feature_names_out()
print(preview, "TF-IDF Features:", feature_names, sep="\n")

  label                                            message  \
0   ham  [go, jurong, point, crazy, available, bugis, n...   
1   ham                     [ok, lar, joking, wif, u, oni]   
2  spam  [free, entry, wkly, comp, win, fa, cup, final,...   
3   ham      [u, dun, say, early, hor, u, c, already, say]   
4   ham  [nah, dont, think, goes, usf, lives, around, t...   

                                     message_stemmed  \
0  [go, jurong, point, crazi, avail, bugi, n, gre...   
1                       [ok, lar, joke, wif, u, oni]   
2  [free, entri, wkli, comp, win, fa, cup, final,...   
3      [u, dun, say, earli, hor, u, c, alreadi, say]   
4  [nah, dont, think, goe, usf, live, around, tho...   

                                  message_lemmatized  \
0  [go, jurong, point, crazy, available, bugis, n...   
1                     [ok, lar, joking, wif, u, oni]   
2  [free, entry, wkly, comp, win, fa, cup, final,...   
3      [u, dun, say, early, hor, u, c, already, say]   
4  [nah, d