In [1]:
from collections import Counter

In [1]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
!unzip smsspamcollection.zip && rm smsspamcollection.zip

--2017-01-21 10:55:51--  https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
Résolution de archive.ics.uci.edu (archive.ics.uci.edu)… 128.195.10.249
Connexion à archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.249|:443… connecté.
requête HTTP transmise, en attente de la réponse… 200 OK
Taille : 203415 (199K) [application/zip]
Sauvegarde en : « smsspamcollection.zip.1 »


2017-01-21 10:55:54 (127 KB/s) — « smsspamcollection.zip.1 » sauvegardé [203415/203415]

Archive:  smsspamcollection.zip
  inflating: SMSSpamCollection       
  inflating: readme                  
presentation_nlp_python.md  SMSSpamCollection	     'SMS Spam Detection.ipynb'
readme			    smsspamcollection.zip.1


In [2]:
!ls

presentation_nlp_python.md  SMSSpamCollection	     'SMS Spam Detection.ipynb'
readme			    smsspamcollection.zip.1


In [2]:
data = []
target_names = []
target = []

with open('SMSSpamCollection', 'r') as f:
    for line in f:
        label, text = line.split(maxsplit=1)
        
        try:
            target_index = target_names.index(label)
            
        except ValueError:
            target_names.append(label)
            target_index = len(target_names) - 1
        
        target.append(target_index)
        data.append(text)

In [3]:
print("Available targets: {}".format(target_names))
print("Total number of items: {}".format(len(data)))
print("Number of items in each class: {}".format(Counter(target)))

Available targets: ['ham', 'spam']
Total number of items: 5574
Number of items in each class: Counter({0: 4827, 1: 747})


In [4]:
print("Class: '{}'\ntext: '{}'".format(target_names[target[0]], data[0]))

Class: 'ham'
text: 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
'


## Train-Test Split

We need to split our data between the *training set* and the *test set*.

In [7]:
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data, target)
print("Length of X_train ({}), X_test ({}), "
      "y_train ({}), y_test ({})".format(len(X_train), len(X_test), len(y_train), len(y_test)))

Length of X_train (4180), X_test (1394), y_train (4180), y_test (1394)


## Bag of words representation

In order to perform text classification, we have to turn the text documents into numerical feature vectors that can be used by the classification algorithm. The easiest way is to use a **bag of words representation**.

- assign a fixed integer `j` to each word occuring in the training set
- for each document `i`, count the number of occurrences of word `w` and put it in $X[i, j]$ where `j` is the id of word `w`.

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer()
X_train_counts = vec.fit_transform(X_train)
print(X_train_counts.shape)

(4180, 7483)


In [10]:
dict(x for idx, x in enumerate(vec.vocabulary_.items()) if idx < 10)

{'alone': 906,
 'bro': 1480,
 'camcorder': 1585,
 'ducking': 2419,
 'moves': 4462,
 'pai': 4883,
 'purse': 5325,
 'relatives': 5487,
 'trade': 6775,
 'yuou': 7470}

In [11]:
from sklearn.feature_extraction.text import TfidfTransformer

tf_transformer = TfidfTransformer()
X_train_tf = tf_transformer.fit_transform(X_train_counts)

In [12]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()
clf.fit(X_train_tf, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [13]:
X_test_counts = vec.transform(X_test)
X_test_tf = tf_transformer.transform(X_test_counts)
predictions = clf.predict(X_test_tf)
print(Counter(predictions))

Counter({0: 1262, 1: 132})


In [14]:
from sklearn.metrics import classification_report

print(classification_report(y_test, predictions, target_names=target_names))

             precision    recall  f1-score   support

        ham       0.96      1.00      0.98      1217
       spam       0.98      0.73      0.84       177

avg / total       0.97      0.96      0.96      1394

