In [1]:
! pip install kaggle



In [2]:
! mkdir ~/.kaggle

In [3]:
! cp kaggle.json ~/.kaggle/

In [4]:
! chmod 600 ~/.kaggle/kaggle.json

In [5]:
! kaggle datasets download uciml/sms-spam-collection-dataset/

Downloading sms-spam-collection-dataset.zip to /content
100% 211k/211k [00:00<00:00, 383kB/s]
100% 211k/211k [00:00<00:00, 383kB/s]


In [7]:
!unzip -q /content/sms-spam-collection-dataset.zip

In [8]:
import pandas as pd
import nltk
import numpy as np
import re

from nltk.corpus import stopwords


df = pd.read_csv("/content/spam.csv",encoding='latin-1')
df.head(5900)


dataset_size = len(df)
num_features = len(df.columns)

print("Dataset size:", dataset_size)
print("Number of features:", num_features)

df = df[['v2','v1']]
df.rename(columns={'v2':'messages','v1':'Label'},inplace=True)
df.head(5572)


Dataset size: 5572
Number of features: 5


Unnamed: 0,messages,Label
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham
...,...,...
5567,This is the 2nd time we have tried 2 contact u...,spam
5568,Will Ì_ b going to esplanade fr home?,ham
5569,"Pity, * was in mood for that. So...any other s...",ham
5570,The guy did some bitching but I acted like i'd...,ham


In [9]:
df.isnull().sum()

messages    0
Label       0
dtype: int64

In [11]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [12]:
Stopwords = set(stopwords.words('english'))
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^0-9a-zA-Z]',' ',text)
    #remove extra spaces
    text = re.sub(r'\s+',' ',text)
    #remove stopwords
    text = "".join (word for word in text.split() if word not in Stopwords)
    return text

In [13]:
df['clean_text']= df['messages'].apply(clean_text)
df.head()

Unnamed: 0,messages,Label,clean_text
0,"Go until jurong point, crazy.. Available only ...",ham,gojurongpointcrazyavailablebugisngreatworldlae...
1,Ok lar... Joking wif u oni...,ham,oklarjokingwifuoni
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam,freeentry2wklycompwinfacupfinaltkts21stmay2005...
3,U dun say so early hor... U c already then say...,ham,udunsayearlyhorucalreadysay
4,"Nah I don't think he goes to usf, he lives aro...",ham,nahthinkgoesusflivesaroundthough


In [14]:
X = df['clean_text']
y = df['Label']

In [15]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

def classify(model, X, y):
    # train test split
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, shuffle=True, stratify=y)
    # model training
    pipeline_model = Pipeline([('vect', CountVectorizer()),
                               ('tfidf',TfidfTransformer()),
                               ('clf', model)])
    pipeline_model.fit(x_train, y_train)

    print('Accuracy:', pipeline_model.score(x_test, y_test)*100)


    #y_pred = pipeline_model.predict(x_test)
    # print(classification_report(y_test, y_pred))


In [16]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
classify(model, X, y)

Accuracy: 86.57573582196699


In [17]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
classify(model, X, y)

Accuracy: 86.71931083991386


In [18]:
from sklearn.svm import SVC
model = SVC(C=3)
classify(model, X, y)

Accuracy: 89.44723618090453


In [19]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
classify(model, X, y)

Accuracy: 89.44723618090453
