<a href="https://colab.research.google.com/github/sanjjey/GrowthLink_assignment/blob/main/ML_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
!pip install nlpaug nltk pandas scikit-learn



In [6]:
import kagglehub

import os

# Download the latest version of the dataset; kagglehub returns the directory
# the files were placed in.
dataset_dir = kagglehub.dataset_download("uciml/sms-spam-collection-dataset")

# Build the CSV path with os.path.join instead of concatenating a hard-coded
# separator, and keep the directory in its own variable so `path` always
# refers to the CSV file that later cells read.
path = os.path.join(dataset_dir, "spam.csv")
print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/sms-spam-collection-dataset/spam.csv


In [16]:
import pandas as pd
import nlpaug.augmenter.word as naw
import nltk
from sklearn.utils import resample

# Download the NLTK tokenizer and POS-tagger resources
# (presumably required by the synonym augmenter below -- TODO confirm).
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

# Seed for the final row shuffle so balanced_data.csv has a stable row order.
# The augmenter's own random choices are not seeded here.
SEED = 42

# Load the CSV file.
# NOTE(review): encoding_errors="ignore" silently drops any bytes that are not
# valid UTF-8 -- confirm the file's real encoding so characters are not lost.
df = pd.read_csv(path, encoding="utf-8", encoding_errors="ignore")

# Separate ham and spam
ham_df = df[df['v1'] == 'ham']
spam_df = df[df['v1'] == 'spam']

# Oversample the minority class (spam) until it matches the majority (ham)
n_ham = len(ham_df)
n_spam = len(spam_df)
n_to_generate = n_ham - n_spam

# Without at least one spam message there is nothing to derive variants from;
# fail with a clear message instead of a ZeroDivisionError inside the loop.
if n_to_generate > 0 and n_spam == 0:
    raise ValueError("No 'spam' rows found in the dataset; cannot generate synthetic spam.")

# Initialize NLP augmentation using wordnet
aug = naw.SynonymAug(aug_src='wordnet')

# Generate synthetic spam samples by cycling through the original spam texts.
# NOTE(review): augmentation happens before the train/test split, so variants
# of the same original message can end up in both splits and inflate the
# reported test accuracy.
augmented_texts = []
original_texts = spam_df['v2'].tolist()

i = 0
while len(augmented_texts) < n_to_generate:
    source_text = original_texts[i % len(original_texts)]
    aug_result = aug.augment(source_text)
    # Newer nlpaug releases return a list even for a single input string.
    # Unwrap it so the 'v2' column holds plain text rather than list objects,
    # which would otherwise be written to the CSV as "['...']".
    if isinstance(aug_result, (list, tuple)):
        aug_result = aug_result[0] if aug_result else source_text
    augmented_texts.append(aug_result)
    i += 1

# Create a DataFrame for the synthetic spam data
augmented_spam_df = pd.DataFrame({'v1': ['spam'] * len(augmented_texts), 'v2': augmented_texts})

# Combine with the original data and shuffle the rows reproducibly
balanced_df = (
    pd.concat([ham_df, spam_df, augmented_spam_df])
    .sample(frac=1, random_state=SEED)
    .reset_index(drop=True)
)

# Save the new balanced dataset
balanced_df.to_csv('balanced_data.csv', index=False)

print(f"Original spam: {n_spam}, Generated spam: {len(augmented_spam_df)}, Total: {len(balanced_df)}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


Original spam: 747, Generated spam: 4078, Total: 9650


In [17]:
# Reload the balanced dataset from the CSV written by the augmentation cell.
balanced_csv = "balanced_data.csv"
dataset = pd.read_csv(balanced_csv)

In [18]:
# Sanity check: after oversampling, both labels should have the same count.
dataset.loc[:, 'v1'].value_counts()

Unnamed: 0_level_0,count
v1,Unnamed: 1_level_1
spam,4825
ham,4825


In [22]:
import pandas as pd
import numpy as np
import nltk
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb
from nltk.tokenize import word_tokenize


# Work on a copy: the steps below add a 'tokens' column and overwrite 'v1'.
# Aliasing `dataset` directly would silently mutate the frame loaded (and
# displayed) by the earlier cells.
df = dataset.copy()

# Preprocess text data: tokenize and vectorize
nltk.download('punkt_tab')

# Lower-case each message and split it into word tokens
df['tokens'] = df['v2'].apply(lambda text: word_tokenize(text.lower()))

# Label-encode the target; LabelEncoder sorts the classes, so the string
# labels are replaced by integer codes in alphabetical order.
label_encoder = LabelEncoder()
df['v1'] = label_encoder.fit_transform(df['v1'])

# Split into train/test (80/20, fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    df['tokens'], df['v1'], test_size=0.2, random_state=42
)

# TF-IDF vectorization for XGBoost. The vectorizer expects raw strings, so the
# token lists are joined back with spaces. It is fitted on the training split
# only and then applied to the test split.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train.apply(' '.join))
X_test_tfidf = vectorizer.transform(X_test.apply(' '.join))


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [23]:
# Train the XGBoost classifier on the TF-IDF features.
# The target is binary (ham vs. spam), so the matching evaluation metric is
# 'logloss'; 'mlogloss' is the multiclass variant.
xgb_model = xgb.XGBClassifier(eval_metric='logloss')
xgb_model.fit(X_train_tfidf, y_train)

# Predict labels for the held-out test split
xgb_preds = xgb_model.predict(X_test_tfidf)

# Evaluate XGBoost: fraction of test messages classified correctly
xgb_accuracy = accuracy_score(y_test, xgb_preds)
print(f'XGBoost Accuracy: {xgb_accuracy:.4f}')


XGBoost Accuracy: 0.9865
