## Task 1

**Import packages**

In [37]:
import joblib
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB

**Step 1: Load the dataset from the URL in the cell below (tab-separated file, no header row)**

In [9]:
# The file is tab-separated and has no header row, so the column names are supplied here.
url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
column_names = ['label', 'message']
df = pd.read_table(url, header=None, names=column_names)  # read_table defaults to sep='\t'
df

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


**Step 2: Convert the 'label' column into binary values: spam = 1, ham = 0**

In [10]:
# Encode the target as integers: ham -> 0, spam -> 1.
# Guarded so the cell is idempotent: re-running map() on an already-encoded column would
# find no 'ham'/'spam' keys and silently replace every label with NaN.
if not pd.api.types.is_numeric_dtype(df['label']):
    df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# map() yields NaN for any value missing from the dict, so fail loudly instead of
# passing missing labels on to the model.
assert df['label'].notna().all(), "unexpected or missing value in 'label' column"
df

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will ü b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


**Step 3: Use CountVectorizer with binary=True to transform the text into binary features**

In [16]:
# Binary bag-of-words: with binary=True each feature is 1 if the token occurs in the
# message and 0 otherwise (occurrence counts are discarded).
# NOTE(review): the vocabulary is fitted on every message, i.e. before the train/test
# split in Step 4, so tokens seen only in test messages still get a feature column.
messages = df['message']
vectorizer = CountVectorizer(binary=True)
X = vectorizer.fit_transform(messages)

In [21]:
# vocabulary_ maps each token to its column index in X. Printing the whole dict floods
# the output with thousands of entries, so report its size and show only the tokens
# assigned to the first ten columns.
print("Vocabulary size:", len(vectorizer.vocabulary_))
sorted(vectorizer.vocabulary_.items(), key=lambda item: item[1])[:10]



**Step 4: Split the dataset into training and test sets (e.g., 70/30 split)**

In [19]:
# Target vector: the label column of df.
y = df['label']

# 70% of the rows go to training, the remainder to testing; the fixed random_state
# makes the shuffle (and therefore the split) reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.70,
    random_state=42,
)

In [22]:
# Show the training feature matrix; its repr (format, dtype, shape) is the cell output.
X_train

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 52150 stored elements and shape (3900, 8713)>

**Step 5: Initialize and train a BernoulliNB model**

In [24]:
# Bernoulli naive Bayes with default hyperparameters, fitted on the training split only.
# fit() returns the estimator, so leaving it as the last expression displays the model.
model = BernoulliNB()
model.fit(X=X_train, y=y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,binarize,0.0
,fit_prior,True
,class_prior,


**Step 6: Predict on the test set and evaluate using accuracy and confusion matrix**

In [36]:
# Predict a label (0 = ham, 1 = spam) for every message in the held-out test set.
y_pred = model.predict(X_test)

# scikit-learn metrics take the arguments as (y_true, y_pred), in that order.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Confusion matrix: rows are the true classes, columns the predicted classes.
# labels=[0, 1] pins the row/column order to ham, spam.
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
pd.DataFrame(
    conf_matrix,
    index=['actual ham', 'actual spam'],
    columns=['predicted ham', 'predicted spam'],
)

Accuracy: 0.9814593301435407


**Step 7: Dump the trained model and the fitted vectorizer to joblib files**

In [38]:
# Persist both fitted objects: the classifier and the vectorizer whose vocabulary
# defines the feature columns the classifier was trained on.
MODEL_PATH = 'spam_classifier_model.joblib'
VECTORIZER_PATH = 'count_vectorizer.joblib'

joblib.dump(model, MODEL_PATH)
joblib.dump(vectorizer, VECTORIZER_PATH)

['count_vectorizer.joblib']