In [2]:
from sklearn.feature_extraction.text import CountVectorizer 
import pandas as pd

In [3]:
# Toy corpus: three short messages with mixed case and punctuation, used to
# show what the vectorizer does before moving on to real data.
simple_train = [
    'call you tonight',
    'Call me a cab',
    'please call me... PLEASE!',
]


In [4]:
# Create a bag-of-words vectorizer with its default settings. As the repr
# printed after fitting shows, the defaults lowercase the text and use a token
# pattern that only keeps words of two or more characters.
vect = CountVectorizer()


In [5]:
# Learn the vocabulary from the sample messages; each distinct token becomes
# one feature. fit() hands back the vectorizer itself, which is why its
# parameter listing is displayed as the cell output.
vect.fit(simple_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [6]:
# List the learned tokens in column order.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# The fitted `vocabulary_` mapping (token -> column index) is available on old
# and new releases alike, so ordering its keys by index yields the same list
# of plain strings everywhere.
sorted(vect.vocabulary_, key=vect.vocabulary_.get)

['cab', 'call', 'me', 'please', 'tonight', 'you']

In [7]:
# Encode every sample as a row of token counts over the learned vocabulary
# (a document-term matrix). The result is a SciPy sparse matrix, as the repr
# below shows, not a dense array.
dtm = vect.transform(simple_train)
dtm

<3x6 sparse matrix of type '<class 'numpy.int64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [8]:
# Expand the sparse matrix into a dense NumPy array so that every count,
# zeros included, can be inspected. Fine for this 3x6 toy example.
dtm.toarray()

array([[0, 1, 0, 0, 1, 1],
       [1, 1, 1, 0, 0, 0],
       [0, 1, 1, 2, 0, 0]], dtype=int64)

In [9]:
# Wrap the dense counts in a DataFrame with one labelled column per token.
# Column labels are taken from `vocabulary_` (token -> column index), ordered
# by index, instead of `get_feature_names()`, which was removed in
# scikit-learn 1.2; this form works on both old and new releases.
token_columns = sorted(vect.vocabulary_, key=vect.vocabulary_.get)
df = pd.DataFrame(dtm.toarray(), columns=token_columns)
df

Unnamed: 0,cab,call,me,please,tonight,you
0,0,1,0,0,1,1
1,1,1,1,0,0,0
2,0,1,1,2,0,0


In [10]:
# Confirm the concrete type of the document-term matrix.
type(dtm)

scipy.sparse.csr.csr_matrix

In [11]:
# Printing a sparse matrix lists only its non-zero entries, one per line, as
# (row, column) followed by the stored count; zeros are not stored at all.
print(dtm)

  (0, 1)	1
  (0, 4)	1
  (0, 5)	1
  (1, 0)	1
  (1, 1)	1
  (1, 2)	1
  (2, 1)	1
  (2, 2)	1
  (2, 3)	2


In [3]:
# Load the tab-separated SMS corpus. The file has no header row, so the two
# columns are named explicitly; then preview the first few rows.
path = 'data/sms.tsv'
sms = pd.read_csv(path, sep='\t', header=None, names=['label', 'message'])
sms.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Number of rows and columns in the loaded table.
sms.shape

(5573, 2)

In [5]:
# Display how many messages fall under each label (class balance).
sms['label'].value_counts()

ham     4826
spam     747
Name: label, dtype: int64

In [6]:
# Encode the target numerically: 'spam' -> 1, anything else (ham) -> 0.
# The comparison is evaluated for the whole column at once; the explicit
# int64 cast keeps the integer dtype independent of the platform default.
sms['labels_converted'] = (sms['label'] == 'spam').astype('int64')

In [7]:
# Preview the first rows to verify that the numeric label column was added.
sms.head()

Unnamed: 0,label,message,labels_converted
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [8]:
# Separate the raw message text (input) from the numeric target, then check
# that both are one-dimensional with one entry per row.
X, y = sms['message'], sms['labels_converted']
print(X.shape, y.shape, sep='\n')

(5573,)
(5573,)


In [9]:
from sklearn.model_selection import train_test_split

# Hold out part of the data for evaluation. random_state is fixed so the same
# split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Print the shape of each piece, one per line, in the order returned above.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)), sep='\n')

(4179,)
(1394,)
(4179,)
(1394,)


In [10]:
# Start from a fresh, unfitted vectorizer for the SMS data. This rebinds the
# name `vect` that was used for the toy example earlier in the notebook.
vect = CountVectorizer()

In [11]:
# Learn the vocabulary from the training messages and build their
# document-term matrix in a single step.
X_train_dtm = vect.fit_transform(X_train)
X_train_dtm

<4179x7455 sparse matrix of type '<class 'numpy.int64'>'
	with 55243 stored elements in Compressed Sparse Row format>

In [12]:
# Shape of the training matrix: (messages, vocabulary size).
# Read it directly from the sparse matrix; calling .toarray() first would
# allocate the complete dense array only to throw it away.
X_train_dtm.shape

(4179, 7455)

In [13]:
# Encode the test messages with the vocabulary learned from the training set
# (transform only, no refit), so both matrices have the same columns.
X_test_dtm = vect.transform(X_test)
X_test_dtm

<1394x7455 sparse matrix of type '<class 'numpy.int64'>'
	with 17550 stored elements in Compressed Sparse Row format>

In [14]:
# Shape of the test matrix: (messages, vocabulary size).
# Read it directly from the sparse matrix; calling .toarray() first would
# allocate the complete dense array only to throw it away.
X_test_dtm.shape

(1394, 7455)

In [15]:
# Multinomial Naive Bayes classifier, created with its default settings.
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [16]:
# Train the classifier on the training document-term matrix and labels.
# %time is an IPython line magic that reports how long the statement took.
%time nb.fit(X_train_dtm, y_train)

Wall time: 2.98 ms


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [17]:
# Predict a class for every test message, using the 0 = ham / 1 = spam
# encoding created for `labels_converted`.
y_pred_class = nb.predict(X_test_dtm)
y_pred_class

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [18]:
from sklearn import metrics

# Fraction of test messages whose predicted class matches the true label.
metrics.accuracy_score(y_true=y_test, y_pred=y_pred_class)


0.98708751793400284

In [19]:
# Break the Naive Bayes results down by class. In scikit-learn's layout rows
# are the true labels and columns the predicted ones, in sorted label order
# (0 = ham, 1 = spam here).
metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_class)

array([[1186,    2],
       [  16,  190]])

In [20]:
# Second model for comparison: a decision tree with a fixed random_state.
# NOTE(review): the repr saved under the fit cell reports random_state=None,
# which does not match the value set here. The stored outputs appear to come
# from an earlier version of this cell; re-run from a fresh kernel to confirm.
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state=1)

In [21]:
# Train the decision tree on the same training matrix and labels that were
# used for the Naive Bayes model.
clf.fit(X_train_dtm,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [24]:
# Decision-tree class predictions for the test messages.
preds = clf.predict(X_test_dtm)
preds

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [25]:
# Sanity check: there should be one prediction per test message.
preds.shape

(1394,)

In [27]:
# Held-out accuracy of the decision tree.
# NOTE(review): the recorded score is a perfect 1.0 and the execution counts
# skip from 21 to 24, so the saved output may not have been produced by the
# cells shown. Re-run the notebook top to bottom before trusting this number.
metrics.accuracy_score(y_true=y_test, y_pred=preds)

1.0

In [28]:
# Per-class breakdown of the decision-tree predictions (rows: true labels,
# columns: predicted labels).
metrics.confusion_matrix(y_true=y_test, y_pred=preds)

array([[1188,    0],
       [   0,  206]])