In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, classification_report

In [None]:
df = pd.read_csv('subject_spam - subject_spam.csv.csv')

In [None]:
df.sample(5)

In [None]:
# Discard rows without a subject line, then trim surrounding whitespace
# from the raw label strings.
df = (
    df.dropna(subset=['Subject'])
      .assign(isspam=lambda frame: frame['isspam'].str.strip())
)

In [None]:
df.info()

In [None]:
df.sample(5)

In [None]:
# Rename only the label column. The text column is spelled 'Subject'
# (capital S) and every other cell reads it under that name, so the former
# 'subject' -> 'text' mapping matched no column this notebook uses and was
# dropped. Assigning the result (instead of inplace=True) keeps the cell
# explicit about which frame it produces.
df = df.rename(columns={'isspam': 'target'})
df.sample(5)

In [None]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

In [None]:
# Replace the string labels in 'target' with integer codes.
# NOTE(review): later cells treat 0 as ham and 1 as spam and drop code 2 —
# confirm that mapping against encoder.classes_ after fitting.
df['target'] = encoder.fit_transform(df['target'])

In [None]:
df.head()

In [None]:
# missing values
df.isnull().sum()

In [None]:
# check for duplicate values
df.duplicated().sum()

In [None]:
df.shape

# EDA

In [None]:
df.head()

In [None]:
# Drop every row whose encoded label is 2.
# NOTE(review): code 2 is presumably an unexpected third label value —
# confirm with encoder.classes_.
# .copy() yields an independent frame, so the column assignments in later
# cells do not trigger pandas' SettingWithCopyWarning.
df = df[df['target'] != 2].copy()

# Verify the rows have been removed
print(df['target'].value_counts())

In [None]:
df['target'].value_counts()

In [None]:
import matplotlib.pyplot as plt

# value_counts() orders by frequency, not by label code, so hard-coded labels
# could end up on the wrong slice. Sort by code and derive each label from
# the code it belongs to (0 -> ham, 1 -> spam, as the rest of the notebook
# assumes).
class_counts = df['target'].value_counts().sort_index()
class_names = {0: 'ham', 1: 'spam'}
slice_labels = [class_names.get(code, str(code)) for code in class_counts.index]

plt.pie(class_counts, labels=slice_labels, autopct="%0.2f")
plt.show()

In [None]:
import nltk

In [None]:
!pip install nltk

In [None]:
nltk.download('punkt')

In [None]:
# Character count of each subject line.
df['num_characters'] = df['Subject'].map(len)

In [None]:
df.head()

In [None]:
# Token count of each subject line: tokenize first, then measure the list.
df['num_words'] = df['Subject'].map(nltk.word_tokenize).map(len)

In [None]:
df.head()

In [None]:
# Sentence count of each subject line: split into sentences, then count them.
df['num_sentences'] = df['Subject'].map(nltk.sent_tokenize).map(len)

In [None]:
df.head()

In [None]:
df[['num_characters','num_words','num_sentences']].describe()

In [None]:
# ham
df[df['target'] == 0][['num_characters','num_words','num_sentences']].describe()

In [None]:
#spam
df[df['target'] == 1][['num_characters','num_words','num_sentences']].describe()

In [None]:
import seaborn as sns

In [None]:
# Overlay the character-count distributions of both classes. Labels, legend
# and title are set so the figure can be read without the surrounding code.
plt.figure(figsize=(12,6))
sns.histplot(df[df['target'] == 0]['num_characters'], label='ham')
sns.histplot(df[df['target'] == 1]['num_characters'], color='red', label='spam')
plt.title('Characters per subject line')
plt.xlabel('num_characters')
plt.legend()
plt.show()

In [None]:
# Overlay the word-count distributions of both classes. Labels, legend and
# title are set so the figure can be read without the surrounding code.
plt.figure(figsize=(12,6))
sns.histplot(df[df['target'] == 0]['num_words'], label='ham')
sns.histplot(df[df['target'] == 1]['num_words'], color='red', label='spam')
plt.title('Words per subject line')
plt.xlabel('num_words')
plt.legend()
plt.show()

In [None]:
sns.pairplot(df,hue='target')

In [None]:
# df still holds text columns (e.g. 'Subject'); DataFrame.corr() raises on
# non-numeric data in pandas >= 2.0, so correlate the numeric columns only.
numeric_corr = df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True)

## 3. Data Preprocessing
- Lowercasing
- Tokenization
- Removing special characters
- Removing stop words and punctuation
- Stemming

In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import string

# Ensure NLTK resources are downloaded
nltk.download('punkt')
nltk.download('stopwords')

# Initialize the Porter Stemmer
ps = PorterStemmer()

# Loaded once: the original looked the stop-word list up again for every
# single token, which dominated the runtime of the whole transformation.
_STOP_WORDS = frozenset(stopwords.words('english'))


def transform_text(subject):
    """Normalise one subject line for vectorisation.

    Steps, in order:
      1. lower-case the text,
      2. split it into tokens with ``nltk.word_tokenize``,
      3. keep only purely alphanumeric tokens (this also removes every
         punctuation token, so no separate punctuation test is needed),
      4. drop English stop words,
      5. reduce each remaining token to its Porter stem.

    Args:
        subject: The raw subject line as a string.

    Returns:
        The surviving stems joined by single spaces; an empty string when
        no token survives the filters.
    """
    tokens = nltk.word_tokenize(subject.lower())
    return " ".join(
        ps.stem(token)
        for token in tokens
        if token.isalnum() and token not in _STOP_WORDS
    )



# Run every subject line through transform_text and store the result
# alongside the original text.
df['transformed_subject'] = df['Subject'].map(transform_text)

# Print the leading rows as a quick sanity check of the new column
print(df.head())


In [None]:
df.head()

In [None]:
from wordcloud import WordCloud
wc = WordCloud(width=500,height=500,min_font_size=10,background_color='white')

In [None]:
# Concatenate all transformed spam subjects into one string and render it.
spam_text = df[df['target'] == 1]['transformed_subject'].str.cat(sep=" ")
wc = WordCloud(width=800, height=400, background_color='white').generate(spam_text)

# Display the word cloud for spam emails (titled to match the ham figure)
plt.figure(figsize=(10, 5))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.title('Spam Emails Word Cloud')
plt.show()

In [None]:
# Gather every transformed ham subject into a single space-separated string.
is_ham = df['target'] == 0
ham_text = df.loc[is_ham, 'transformed_subject'].str.cat(sep=" ")

ham_cloud_builder = WordCloud(width=800, height=400, background_color='white')
ham_wc = ham_cloud_builder.generate(ham_text)

# Render the word cloud for non-spam emails
plt.figure(figsize=(10, 5))
plt.imshow(ham_wc, interpolation='bilinear')
plt.axis('off')
plt.title('Ham Emails Word Cloud')
plt.show()

In [None]:
df.head()

In [None]:
# Flat list of every stem that occurs in a spam subject, in row order.
spam_corpus = [
    word
    for msg in df.loc[df['target'] == 1, 'transformed_subject']
    for word in msg.split()
]

In [None]:
len(spam_corpus)

In [None]:
from collections import Counter

# Build the top-30 table once and give its columns names. seaborn >= 0.12
# accepts only `data` positionally, so x and y are passed as keywords.
top_spam_words = pd.DataFrame(
    Counter(spam_corpus).most_common(30), columns=['word', 'count']
)
sns.barplot(data=top_spam_words, x='word', y='count')
plt.xticks(rotation='vertical')
plt.show()

In [None]:
# Flat list of every stem that occurs in a ham subject, in row order.
ham_corpus = [
    word
    for msg in df.loc[df['target'] == 0, 'transformed_subject']
    for word in msg.split()
]

In [None]:
len(ham_corpus)

In [None]:
from collections import Counter

# Build the top-30 table once and give its columns names. seaborn >= 0.12
# accepts only `data` positionally, so x and y are passed as keywords.
top_ham_words = pd.DataFrame(
    Counter(ham_corpus).most_common(30), columns=['word', 'count']
)
sns.barplot(data=top_ham_words, x='word', y='count')
plt.xticks(rotation='vertical')
plt.show()

## Model Building

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Initialize CountVectorizer
cv = CountVectorizer()

# Initialize TfidfVectorizer
tfidf = TfidfVectorizer()


In [None]:
X = tfidf.fit_transform(df['transformed_subject']).toarray()

# Print the shape of the resulting array to see the dimensions
print(X.shape)

In [None]:
y = df['target'].values

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=2)

In [None]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score

In [None]:
gnb = GaussianNB()
mnb = MultinomialNB()
bnb = BernoulliNB()

In [None]:
# Fit Gaussian NB on the training split, predict the held-out split, then
# report accuracy, confusion matrix and precision in that order.
y_pred1 = gnb.fit(X_train, y_train).predict(X_test)
for metric in (accuracy_score, confusion_matrix, precision_score):
    print(metric(y_test, y_pred1))

In [None]:
# Fit Multinomial NB on the training split, predict the held-out split, then
# report accuracy, confusion matrix and precision in that order.
y_pred2 = mnb.fit(X_train, y_train).predict(X_test)
for metric in (accuracy_score, confusion_matrix, precision_score):
    print(metric(y_test, y_pred2))

In [None]:
# Fit Bernoulli NB on the training split, predict the held-out split, then
# report accuracy, confusion matrix and precision in that order.
y_pred3 = bnb.fit(X_train, y_train).predict(X_test)
for metric in (accuracy_score, confusion_matrix, precision_score):
    print(metric(y_test, y_pred3))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [None]:
# Candidate classifiers, grouped by family. Every estimator that takes a
# seed uses random_state=2, matching the train/test split.

# Single (non-ensemble) models
lrc = LogisticRegression(penalty='l1', solver='liblinear')
svc = SVC(gamma=1.0, kernel='sigmoid')
mnb = MultinomialNB()
knc = KNeighborsClassifier()
dtc = DecisionTreeClassifier(max_depth=5)

# Bagging-style ensembles, 50 estimators each
rfc = RandomForestClassifier(random_state=2, n_estimators=50)
etc = ExtraTreesClassifier(random_state=2, n_estimators=50)
bc = BaggingClassifier(random_state=2, n_estimators=50)

# Boosting ensembles, 50 estimators each
abc = AdaBoostClassifier(random_state=2, n_estimators=50)
gbdt = GradientBoostingClassifier(random_state=2, n_estimators=50)
xgb = XGBClassifier(random_state=2, n_estimators=50)

In [None]:
# Voting Classifier
# Fresh, unfitted instances of the two ensemble members; the hyperparameters
# are the same as the standalone mnb / rfc created in the previous cell.
mnb = MultinomialNB()
rfc = RandomForestClassifier(n_estimators=50, random_state=2)

from sklearn.ensemble import VotingClassifier

In [None]:
# Soft-voting ensemble over the random forest and the multinomial NB model.
ensemble_members = [('RF', rfc), ('nb', mnb)]
voting = VotingClassifier(estimators=ensemble_members, voting='soft')

In [None]:
voting.fit(X_train,y_train)