### Email Spam classification

#### Import Libraries

In [2]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string

#### Read Data

In [4]:
df = pd.read_csv("/content/emails.csv")
df.head()
df.shape
df.columns
df.drop_duplicates(inplace=True)
print(df.shape)


(5695, 2)


#### Data Preprocessing

In [5]:
import pandas as pd
import nltk
import string
from nltk.corpus import stopwords

# Download required NLTK data
nltk.download('stopwords', quiet=True)

def load_and_check_data(file_path):
    """
    Load the CSV file and perform initial data checks
    """
    try:
        df = pd.read_csv('/content/emails.csv')
        print("Data loaded successfully!")
        print("\nNull values in each column:")
        print(df.isnull().sum())
        return df
    except Exception as e:
        print(f"Error loading data: {str(e)}")
        return None

def process(text):
    """
    Process text by removing punctuation and stopwords
    """
    # Handle null values
    if pd.isna(text):
        return []

    # Convert to string if not already
    if not isinstance(text, str):
        text = str(text)

    # Remove punctuation
    nopunc = [char for char in text if char not in string.punctuation]
    nopunc = ''.join(nopunc)

    # Remove stopwords and convert to lowercase
    clean = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
    return clean

def process_and_show_examples(df):
    """
    Process the text column and show example results
    """
    try:
        # Process first 5 rows and show results
        print("\nExample processed texts:")
        processed_examples = df['text'].head().apply(process)
        for i, processed_text in enumerate(processed_examples, 1):
            print(f"\nEmail {i}:")
            print(processed_text)
        return processed_examples
    except KeyError:
        print("Error: 'text' column not found in dataframe")
        print("Available columns:", df.columns.tolist())
    except Exception as e:
        print(f"Error processing text: {str(e)}")

# Main execution
def main():
    # Load and check data
    df = load_and_check_data('/content/emails.csv')

    if df is not None:
        # Process text and show examples
        processed_examples = process_and_show_examples(df)

        # Add processed text as new column if needed
        # df['processed_text'] = df['text'].apply(process)

        return df, processed_examples

# Run the pipeline
df, processed_examples = main()

Data loaded successfully!

Null values in each column:
text    0
spam    0
dtype: int64

Example processed texts:

Email 1:
['Subject', 'naturally', 'irresistible', 'corporate', 'identity', 'lt', 'really', 'hard', 'recollect', 'company', 'market', 'full', 'suqgestions', 'information', 'isoverwhelminq', 'good', 'catchy', 'logo', 'stylish', 'statlonery', 'outstanding', 'website', 'make', 'task', 'much', 'easier', 'promise', 'havinq', 'ordered', 'iogo', 'company', 'automaticaily', 'become', 'world', 'ieader', 'isguite', 'ciear', 'without', 'good', 'products', 'effective', 'business', 'organization', 'practicable', 'aim', 'hotat', 'nowadays', 'market', 'promise', 'marketing', 'efforts', 'become', 'much', 'effective', 'list', 'clear', 'benefits', 'creativeness', 'hand', 'made', 'original', 'logos', 'specially', 'done', 'reflect', 'distinctive', 'company', 'image', 'convenience', 'logo', 'stationery', 'provided', 'formats', 'easy', 'use', 'content', 'management', 'system', 'letsyou', 'change

#### Feature Engineering

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
message = CountVectorizer(analyzer=process).fit_transform(df['text'])

KeyboardInterrupt: 

#### Split Data

In [None]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(message, df['spam'], test_size=0.20, random_state=0)
print(message.shape)

(5695, 37229)


#### Model Training

In [None]:
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB().fit(xtrain, ytrain)
print(classifier.predict(xtrain))
print(ytrain.values)

[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]


#### Model Evaluation

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
pred = classifier.predict(xtrain)
print(classification_report(ytrain, pred))
print()
print("Confusion Matrix: \n", confusion_matrix(ytrain, pred))
print("Accuracy: \n", accuracy_score(ytrain, pred))

print(classifier.predict(xtest))
print(ytest.values)

pred = classifier.predict(xtest)
print(classification_report(ytest, pred))
print()
print("Confusion Matrix: \n", confusion_matrix(ytest, pred))
print("Accuracy: \n", accuracy_score(ytest, pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3457
           1       0.99      1.00      0.99      1099

    accuracy                           1.00      4556
   macro avg       0.99      1.00      1.00      4556
weighted avg       1.00      1.00      1.00      4556


Confusion Matrix: 
 [[3445   12]
 [   1 1098]]
Accuracy: 
 0.9971466198419666
[1 0 0 ... 0 0 0]
[1 0 0 ... 0 0 0]
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       870
           1       0.97      1.00      0.98       269

    accuracy                           0.99      1139
   macro avg       0.98      0.99      0.99      1139
weighted avg       0.99      0.99      0.99      1139


Confusion Matrix: 
 [[862   8]
 [  1 268]]
Accuracy: 
 0.9920983318700615


#### Accuracy


In [1]:
b=accuracy_score(ytest, pred)*100
print(b)

NameError: name 'accuracy_score' is not defined