In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import make_pipeline
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import spacy

In [4]:
# Fetch the NLTK resources used by the text-cleaning function defined further
# down: the stop-word lists, the Punkt tokenizer models and the WordNet corpus.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' when tokenizing —
# confirm against the installed NLTK version.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [5]:

# Load spaCy's 'en_core_web_sm' English pipeline.
# NOTE(review): apart from being displayed, `nlp` is not used by any cell
# visible here — confirm it is still needed.
nlp = spacy.load('en_core_web_sm')

In [6]:
# Display the loaded spaCy object to confirm the model loaded.
nlp

<spacy.lang.en.English at 0x7fb0cbc16d70>

In [9]:
# Read the labelled messages from the CSV file (hard-coded absolute path),
# then drop rows that are exact duplicates of an earlier row.
df1 = pd.read_csv('/content/combined_dataset1(in).csv')
df1 = df1.drop_duplicates()

In [10]:
# Display the de-duplicated DataFrame (columns: Message, Category).
df1

Unnamed: 0,Message,Category
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham
...,...,...
5172,You have a missed call from our rewards team.,ham
5173,Just letting you know about our upcoming event.,ham
5174,This is a limited-time offer. Act now!,ham
5175,Can you send me the report by end of day?,ham


In [None]:
# List the DataFrame's columns to confirm which one holds the class labels.
print(df1.columns)

# The labels live in the 'Category' column; count how many rows carry each label.
counts = df1['Category'].value_counts()  # a KeyError here means the label column has a different name

# If 'Category' is missing, check the header row of the source CSV file.

Index(['Message', 'Category'], dtype='object')


In [12]:
# Tally the messages per class to see how balanced the dataset is.
counts = df1['Category'].value_counts()

# `.get(label, 0)` prints 0 instead of raising if a label never occurs.
print("Ham count:", counts.get('ham', 0))
print("Spam count:", counts.get('spam', 0))

Ham count: 4533
Spam count: 644


In [16]:
# Resources shared by every call to `preprocess_text`. They are built once,
# when this cell runs, rather than once per message: the original rebuilt the
# stop-word set and the lemmatizer on every call, which is wasted work when the
# function is applied to many messages.
# NOTE: the NLTK 'stopwords' corpus must already be downloaded when this cell runs.
_URL_RE = re.compile(r'http\S+|www\S+|https\S+', flags=re.MULTILINE)
_NON_WORD_RE = re.compile(r'\W')
_STOP_WORDS = frozenset(stopwords.words('english'))
_LEMMATIZER = WordNetLemmatizer()


def preprocess_text(text):
    """Normalize a raw message into a space-separated string of lemmatized tokens.

    Steps, in order:
      1. lowercase the text;
      2. remove URL-like substrings (anything starting with ``http`` or ``www``
         up to the next whitespace);
      3. replace every non-word character with a space;
      4. tokenize with ``nltk.word_tokenize``;
      5. drop English stop words;
      6. lemmatize each remaining token with WordNet.

    Args:
        text: The message to clean. Must be a ``str``.

    Returns:
        The cleaned tokens joined by single spaces (an empty string if no
        token survives).

    Raises:
        AttributeError: If ``text`` is not a string (e.g. a NaN from pandas).
    """
    text = text.lower()
    text = _URL_RE.sub('', text)
    text = _NON_WORD_RE.sub(' ', text)
    tokens = nltk.word_tokenize(text)
    # Filter first, then lemmatize: stop words are matched on the surface form.
    tokens = [
        _LEMMATIZER.lemmatize(word)
        for word in tokens
        if word not in _STOP_WORDS
    ]
    return ' '.join(tokens)

In [17]:
# Smoke-test `preprocess_text` on a sentence containing a URL, stop words and punctuation.
my_text = "This is an example text with some URLs like http://example.com and some punctuation!"

# Run the cleaning function on the sample sentence.
processed_text = preprocess_text(my_text)

# Show the cleaned text.
print(processed_text)

example text url like punctuation


In [18]:
# Features are the raw message strings; the target is binary
# (1 when Category is 'spam', 0 for anything else).
X = df1['Message']
y = (df1['Category'] == 'spam').astype('int64')

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


In [22]:
# Display the feature Series (raw message text).
X

Unnamed: 0,Message
0,"Go until jurong point, crazy.. Available only ..."
1,Ok lar... Joking wif u oni...
2,Free entry in 2 a wkly comp to win FA Cup fina...
3,U dun say so early hor... U c already then say...
4,"Nah I don't think he goes to usf, he lives aro..."
...,...
5172,You have a missed call from our rewards team.
5173,Just letting you know about our upcoming event.
5174,This is a limited-time offer. Act now!
5175,Can you send me the report by end of day?


In [20]:
# Display the binary target Series (1 = spam, 0 = otherwise).
y

Unnamed: 0,Category
0,0
1,0
2,1
3,0
4,0
...,...
5172,0
5173,0
5174,0
5175,0


In [25]:
# Peek at the first rows of the dataset.
df1.head()

Unnamed: 0,Message,Category
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham


In [26]:
# Peek at the last rows of the dataset.
df1.tail()

Unnamed: 0,Message,Category
5172,You have a missed call from our rewards team.,ham
5173,Just letting you know about our upcoming event.,ham
5174,This is a limited-time offer. Act now!,ham
5175,Can you send me the report by end of day?,ham
5176,Congratulations! You've won a free prize.,spam


In [27]:
# Baseline model: TF-IDF features (using scikit-learn's built-in English
# stop-word list) feeding a logistic-regression classifier with default settings.
pipeline = make_pipeline(TfidfVectorizer(stop_words='english'), LogisticRegression())


In [28]:
# Fit the vectorizer and the classifier on the training messages.
pipeline.fit(X_train, y_train)


In [29]:
# Predict labels (0/1) for the held-out test messages.
y_pred = pipeline.predict(X_test)

In [30]:
# Display the array of predicted labels.
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [31]:
# Score the test-set predictions: overall accuracy plus a per-class
# precision/recall/F1 report (returned as a text table).
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)


In [32]:
# Print the accuracy followed by the per-class report computed in the previous cell.
print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(classification_rep)

Accuracy: 0.9523809523809523
Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      1356
           1       0.97      0.65      0.78       198

    accuracy                           0.95      1554
   macro avg       0.96      0.82      0.87      1554
weighted avg       0.95      0.95      0.95      1554



In [33]:
# Two hand-picked messages for a quick sanity check of the fitted pipeline.
new_data = [
    "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
    "I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried enough today.",
]

# Pass the raw messages straight to the pipeline. It was fitted on the raw
# text in `X_train`, and its TfidfVectorizer does its own lowercasing,
# tokenizing and stop-word removal. Running `preprocess_text` only at
# prediction time would feed the model lemmatized, punctuation-stripped input
# that differs from what it was trained on.
new_data_predictions = pipeline.predict(new_data)
predicted_labels = ['spam' if pred == 1 else 'ham' for pred in new_data_predictions]

print("New Data Predictions:", predicted_labels)

New Data Predictions: ['spam', 'ham']


In [None]:
!pip install scikit-learn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline





In [None]:
# Refit whichever estimator `pipeline` currently refers to on the training split.
# NOTE(review): this cell does not build a new pipeline itself, so the model
# being scored depends on which cell last assigned `pipeline` — confirm.
pipeline.fit(X_train, y_train)

# Make predictions on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model with plain accuracy.
# (`accuracy_score` is already imported at the top of the notebook; this
# re-import is redundant but harmless.)
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9797211660329531


In [None]:
# Build the per-class precision/recall/F1 report for the latest predictions.
classification_rep = classification_report(y_test, y_pred)


In [None]:
# Print the report computed in the previous cell.
print("Classification Report:")
print(classification_rep)

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      1362
           1       0.95      0.90      0.92       216

    accuracy                           0.98      1578
   macro avg       0.97      0.95      0.96      1578
weighted avg       0.98      0.98      0.98      1578



In [43]:
# Create a pipeline with TfidfVectorizer (English stop words removed) and MultinomialNB
pipeline = make_pipeline(TfidfVectorizer(stop_words='english'), MultinomialNB())

In [44]:
# Fit the current pipeline on the training split.
pipeline.fit(X_train, y_train)


In [46]:
# Fit the current pipeline on the training split.
# NOTE(review): in top-to-bottom order the preceding cell already fitted this
# pipeline on the same data, so this refit appears redundant — confirm.
pipeline.fit(X_train, y_train)

# Make predictions on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model with plain accuracy.
# (`accuracy_score` is already imported at the top of the notebook; this
# re-import is redundant but harmless.)
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9755469755469756


In [47]:
# Two hand-picked messages for a quick sanity check of the fitted pipeline.
new_data = [
    "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
    "I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried enough today.",
]

# Pass the raw messages straight to the pipeline. It was fitted on the raw
# text in `X_train`, and its TfidfVectorizer does its own lowercasing,
# tokenizing and stop-word removal. Running `preprocess_text` only at
# prediction time would feed the model lemmatized, punctuation-stripped input
# that differs from what it was trained on.
new_data_predictions = pipeline.predict(new_data)
predicted_labels = ['spam' if pred == 1 else 'ham' for pred in new_data_predictions]

print("New Data Predictions:", predicted_labels)

New Data Predictions: ['spam', 'ham']


In [45]:
# TF-IDF features feeding a support-vector classifier.
pipeline = make_pipeline(TfidfVectorizer(stop_words='english'),SVC()) # sklearn.svm.SVC with default settings (not LinearSVC)

# Train the model
pipeline.fit(X_train, y_train)

# Make predictions on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model with plain accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9755469755469756


In [39]:
# Build and print the per-class precision/recall/F1 report for the latest predictions.
classification_rep = classification_report(y_test, y_pred)
print("Classification Report:")
print(classification_rep)


Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1356
           1       0.98      0.83      0.90       198

    accuracy                           0.98      1554
   macro avg       0.98      0.91      0.94      1554
weighted avg       0.98      0.98      0.97      1554



In [40]:
# Refit the current `pipeline` on the training split.
# NOTE(review): in top-to-bottom order the pipeline was already fitted in the
# cell that created it, so this refit appears redundant — confirm.
pipeline.fit(X_train, y_train)

In [41]:
# Two hand-picked messages for a quick sanity check of the fitted pipeline.
new_data = [
    "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
    "I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried enough today.",
]

# Pass the raw messages straight to the pipeline. It was fitted on the raw
# text in `X_train`, and its TfidfVectorizer does its own lowercasing,
# tokenizing and stop-word removal. Running `preprocess_text` only at
# prediction time would feed the model lemmatized, punctuation-stripped input
# that differs from what it was trained on.
new_data_predictions = pipeline.predict(new_data)
predicted_labels = ['spam' if pred == 1 else 'ham' for pred in new_data_predictions]

print("New Data Predictions:", predicted_labels)

New Data Predictions: ['spam', 'ham']


In [None]:
# TF-IDF features feeding a random forest; the fixed `random_state` makes the
# fitted forest reproducible between runs.
pipeline = make_pipeline(TfidfVectorizer(stop_words='english'),
                        RandomForestClassifier(random_state=42))  # Using RandomForestClassifier

# Train the model
pipeline.fit(X_train, y_train)

# Make predictions on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model with plain accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9619771863117871


In [None]:
# Build and print the per-class precision/recall/F1 report for the latest predictions.
classification_rep = classification_report(y_test, y_pred)
print("Classification Report:")
print(classification_rep)


Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1362
           1       0.98      0.74      0.84       216

    accuracy                           0.96      1578
   macro avg       0.97      0.87      0.91      1578
weighted avg       0.96      0.96      0.96      1578



In [None]:
# Refit the current `pipeline` on the training split.
# NOTE(review): in top-to-bottom order the pipeline was already fitted in the
# cell that created it, so this refit appears redundant — confirm.
pipeline.fit(X_train, y_train)

In [34]:
# TF-IDF features feeding a k-nearest-neighbours classifier that votes over
# the 5 closest training messages.
pipeline = make_pipeline(TfidfVectorizer(stop_words='english'),
                        KNeighborsClassifier(n_neighbors=5))  # Using KNeighborsClassifier

# Train the model
pipeline.fit(X_train, y_train)

# Make predictions on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model with plain accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.898970398970399


In [35]:
# Build and print the per-class precision/recall/F1 report for the latest predictions.
classification_rep = classification_report(y_test, y_pred)
print("Classification Report:")
print(classification_rep)


Classification Report:
              precision    recall  f1-score   support

           0       0.90      1.00      0.95      1356
           1       1.00      0.21      0.34       198

    accuracy                           0.90      1554
   macro avg       0.95      0.60      0.64      1554
weighted avg       0.91      0.90      0.87      1554



In [36]:
# Refit the current `pipeline` on the training split.
# NOTE(review): in top-to-bottom order the pipeline was already fitted in the
# cell that created it, so this refit appears redundant — confirm.
pipeline.fit(X_train, y_train)

In [37]:
# Two hand-picked messages for a quick sanity check of the fitted pipeline.
new_data = [
    "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
    "I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried enough today.",
]

# Pass the raw messages straight to the pipeline. It was fitted on the raw
# text in `X_train`, and its TfidfVectorizer does its own lowercasing,
# tokenizing and stop-word removal. Running `preprocess_text` only at
# prediction time would feed the model lemmatized, punctuation-stripped input
# that differs from what it was trained on.
new_data_predictions = pipeline.predict(new_data)
predicted_labels = ['spam' if pred == 1 else 'ham' for pred in new_data_predictions]

print("New Data Predictions:", predicted_labels)

New Data Predictions: ['ham', 'ham']
