In [1]:
import pandas as pd

In [2]:
path = 'SMSSpamCollection'
df = pd.read_csv(path, sep='\t', names = ['label','text'])
df

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
df['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [4]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [5]:
import numpy as  np
def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    val = np.round(count/(len(text) - text.count(" ")),3) * 100
    return(val)

In [6]:
count_punct('Hello, how are you? Good bye!!!')

19.2

In [7]:
df['punct%'] = df['text'].apply(lambda x: count_punct(x))
df.head()

Unnamed: 0,label,text,punct%
0,ham,"Go until jurong point, crazy.. Available only ...",9.8
1,ham,Ok lar... Joking wif u oni...,25.0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,4.7
3,ham,U dun say so early hor... U c already then say...,15.4
4,ham,"Nah I don't think he goes to usf, he lives aro...",4.1


In [8]:
df['body_len'] = df['text'].apply(lambda x: len(x) - x.count(' '))
df.head()

Unnamed: 0,label,text,punct%,body_len
0,ham,"Go until jurong point, crazy.. Available only ...",9.8,92
1,ham,Ok lar... Joking wif u oni...,25.0,24
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,4.7,128
3,ham,U dun say so early hor... U c already then say...,15.4,39
4,ham,"Nah I don't think he goes to usf, he lives aro...",4.1,49


In [9]:
import re # Regular expression package

from nltk.corpus import stopwords
swords = stopwords.words('english')

from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [10]:
def clean_text(text):
    words = "".join([word for word in text if word not in string.punctuation])
    tokens = re.split('\W+', words)
    text = [ps.stem(word) for word in tokens if word not in swords]
    return text

In [11]:
clean_text('Hello, how are you? Are you listening me?|')

['hello', 'are', 'listen']

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
tvect = TfidfVectorizer(analyzer=clean_text)
tvect_obj = tvect.fit(df['text'])
tvect_text = tvect_obj.transform(df['text'])
tvect_text

<5572x8193 sparse matrix of type '<class 'numpy.float64'>'
	with 55674 stored elements in Compressed Sparse Row format>

In [13]:
df_vect = pd.concat([df[['body_len', 'punct%']],
                   pd.DataFrame(tvect_text.toarray())], axis=1)
df_vect.head()

Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,...,8183,8184,8185,8186,8187,8188,8189,8190,8191,8192
0,92,9.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,24,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,128,4.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,39,15.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,49,4.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_vect, df['label'], random_state=0)

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [16]:
clf = RandomForestClassifier(random_state=0, n_estimators=300)
clf_dt = DecisionTreeClassifier(random_state=0)

In [17]:
clf.fit(X_train, y_train)
# clf_dt.fit(X_train, y_train)

RandomForestClassifier(n_estimators=300, random_state=0)

In [18]:
y_pred = clf.predict(X_test)
# y_pred = clf_dt.predict(X_test)

In [19]:
from sklearn.metrics import accuracy_score
accuracy_score(y_pred, y_test) * 100

98.06173725771716

In [20]:
path = 'sample.csv'
sample = pd.read_csv(path, names=['text'], sep='\t')
sample

Unnamed: 0,text
0,Ok lar i double check wif da hair dresser alre...
1,"As a valued customer, I am pleased to advise y..."
2,"Today is ""song dedicated day.."" Which song wil..."


In [21]:
sample['body_len'] = sample['text'].apply(lambda x: len(x) - x.count(" "))
sample['Punct%'] = sample['text'].apply(lambda x: count_punct(x))
sample.head()

Unnamed: 0,text,body_len,Punct%
0,Ok lar i double check wif da hair dresser alre...,89,4.5
1,"As a valued customer, I am pleased to advise y...",125,2.4
2,"Today is ""song dedicated day.."" Which song wil...",102,9.8


In [22]:
sample_vect = tvect_obj.transform(sample['text'])

In [23]:
sample_vect = pd.concat([sample[['body_len', 'Punct%']],
                   pd.DataFrame(sample_vect.toarray())], axis=1)

In [24]:
sample_vect.shape

(3, 8195)

In [25]:
clf.predict(sample_vect)

array(['ham', 'spam', 'ham'], dtype=object)