In [3]:
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer

In [4]:
data = pd.read_csv("spam.csv", encoding = "latin-1")
data = data[['v1', 'v2']]
data = data.rename(columns = {'v1': 'label', 'v2': 'text'})
data.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
data.describe()

Unnamed: 0,label,text
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [7]:
def remove_special_characters(text, remove_digits=True):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    return text

def simple_stemmer(text):
    ps=nltk.porter.PorterStemmer()
    text= ' '.join([ps.stem(word) for word in text.split()])
    return text


nltk.download('stopwords')
stop=set(stopwords.words('english'))
tokenizer=ToktokTokenizer()
stopword_list=nltk.corpus.stopwords.words('english')

def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\radov\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
data['text'] = data['text'].apply(remove_special_characters)
data['text'] = data['text'].apply(simple_stemmer)
data['text'] = data['text'].apply(remove_stopwords)

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['label'], test_size = 0.1, random_state = 1) 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

tv=TfidfVectorizer(min_df=0,max_df=1,use_idf=True,ngram_range=(1,3))
cv=CountVectorizer(min_df=0,max_df=1,binary=False,ngram_range=(1,3))

In [12]:
cv_train_text=cv.fit_transform(X_train)
cv_test_text=cv.transform(X_test)

tv_train_text=tv.fit_transform(X_train)
tv_test_text=tv.transform(X_test)

print('Tfidf_train:',tv_train_text.shape)

Tfidf_train: (5014, 53746)


In [23]:
from sklearn.neural_network import MLPClassifier

clf = MLPClassifier(random_state=1, max_iter=300)
clf_bow=clf.fit(cv_train_text,y_train)
clf_tfidf=clf.fit(tv_train_text,y_train)

In [24]:
clf_bow_predict=clf_bow.predict(cv_test_text)

clf_tfidf_predict=clf_tfidf.predict(tv_test_text)

In [25]:
from sklearn.metrics import accuracy_score, confusion_matrix

clf_bow_score=accuracy_score(y_test,clf_bow_predict)
print("lr_bow_score :",clf_bow_score)

clf_tfidf_score=accuracy_score(y_test,clf_tfidf_predict)
print("lr_tfidf_score :",clf_tfidf_score)

lr_bow_score : 0.9444444444444444
lr_tfidf_score : 0.9444444444444444


In [26]:
cm_bow=confusion_matrix(y_test,clf_bow_predict)
print(cm_bow)

cm_tfidf=confusion_matrix(y_test,clf_tfidf_predict)
print(cm_tfidf)

[[482   8]
 [ 23  45]]
[[482   8]
 [ 23  45]]


In [43]:
def pred(msg):    
    msg = cv.transform([msg])    
    prediction = clf.predict(msg)    
    return prediction[0]

print(pred('Hey, how are you? Long time no see..'))
print('----')
print(pred('OMG YOU JUST WON THE LOTTERY!!! REPLY FOR FREE PRIZE IN $$$'))

ham
----
spam
