In [None]:
Creme¶
With creme, we encourage a different approach, which is to continuously learn a stream of data. 
This means that the model process one observation at a time, and can therefore be updated on the fly. 
This allows to learn from massive datasets that don't fit in main memory. Online machine learning also integrates nicely in cases where new data is constantly arriving. 
It shines in many use cases, such as time series forecasting, spam filtering, 
recommender systems, CTR prediction, and IoT applications. If you're bored with retraining models and want to instead build dynamic models, 
then online machine learning (and therefore creme!) might be what you're looking for.

Here are some benefits of using creme (and online machine learning in general):

1 Incremental: models can update themselves in real-time.
2 Adaptive: models can adapt to concept drift.
3 Production-ready: working with data streams makes it simple to replicate production scenarios during model development.
4 Efficient: models don't have to be retrained and require little compute power, which lowers their carbon footprint
5 Fast: when the goal is to learn and predict with a single instance at a time, then creme is a order of magnitude faster than PyTorch, Tensorflow, and scikit-learn.

In [2]:
pip install creme

Collecting creme
[?25l  Downloading https://files.pythonhosted.org/packages/ce/8f/95044edac0127f71251a187ae1be0fe0e9bab24050cd6ab29210f078b179/creme-0.6.1-cp36-cp36m-manylinux2010_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 2.7MB/s 
[?25hCollecting mmh3==2.5.1
  Downloading https://files.pythonhosted.org/packages/fa/7e/3ddcab0a9fcea034212c02eb411433db9330e34d626360b97333368b4052/mmh3-2.5.1.tar.gz
Building wheels for collected packages: mmh3
  Building wheel for mmh3 (setup.py) ... [?25l[?25hdone
  Created wheel for mmh3: filename=mmh3-2.5.1-cp36-cp36m-linux_x86_64.whl size=37846 sha256=12d978902b417a484547c9cc57fa69259796d5d3d9690f7c968135ab113c153c
  Stored in directory: /root/.cache/pip/wheels/38/b4/ea/6e4e321c625d3320c0c496bf4088371546d8fce5f1dd71b219
Successfully built mmh3
Installing collected packages: mmh3, creme
Successfully installed creme-0.6.1 mmh3-2.5.1


In [3]:
import math
from creme import compose
from creme import feature_extraction
from creme import naive_bayes
import creme

In [4]:
docs = [   ('Chinese Beijing Chinese', 'yes'),
 ('Chinese Chinese Shanghai', 'yes'),
('Chinese Macao', 'yes'),
('Tokyo Japan Chinese', 'no')
]

In [7]:
### This is just an example
corpus = [
         'This is the first document.',
        'This document is the second document.',
        'And this is the third one.',
        'Is this the first document?',
    ]
bow = creme.feature_extraction.TFIDF()
for sentence in corpus:
    print(bow.transform_one(sentence))

{'this': 0.4472135954999579, 'is': 0.4472135954999579, 'the': 0.4472135954999579, 'first': 0.4472135954999579, 'document': 0.4472135954999579}
{'this': 0.35355339059327373, 'document': 0.7071067811865475, 'is': 0.35355339059327373, 'the': 0.35355339059327373, 'second': 0.35355339059327373}
{'and': 0.408248290463863, 'this': 0.408248290463863, 'is': 0.408248290463863, 'the': 0.408248290463863, 'third': 0.408248290463863, 'one': 0.408248290463863}
{'is': 0.4472135954999579, 'this': 0.4472135954999579, 'the': 0.4472135954999579, 'first': 0.4472135954999579, 'document': 0.4472135954999579}


In [8]:
model = compose.Pipeline(
    ('tokenize', feature_extraction.BagOfWords(lowercase=False)),
    ('nb', naive_bayes.MultinomialNB(alpha=1)))

In [9]:
%%time
for sentence, label in docs:
     model = model.fit_one(sentence, label)

CPU times: user 663 µs, sys: 0 ns, total: 663 µs
Wall time: 667 µs


In [10]:
new_unseen_text = 'Tokyo india'

In [11]:
model.predict_one(new_unseen_text)

'no'

In [12]:
# Training on a new data and new category

model.fit_one('India USA','may be')

Pipeline (
  BagOfWords (
    on=None
    strip_accents=True
    lowercase=False
    preprocessor=None
    tokenizer=<built-in method findall of _sre.SRE_Pattern object at 0x7f792feb14e0>
    ngram_range=(1, 1)
  ),
  MultinomialNB (
    alpha=1
  )
)

In [13]:
model.predict_one("India Germany")

'may be'

In [14]:

from creme import compose
from creme import datasets
from creme import linear_model
from creme import metrics
from creme import preprocessing

X_y = datasets.Phishing()

model = compose.Pipeline(
   preprocessing.StandardScaler(),
 linear_model.LogisticRegression()
)
metric = metrics.Accuracy()


for x, y in X_y:
    y_pred = model.predict_one(x)      # make a prediction
    metric = metric.update(y, y_pred)  # update the metric
    model = model.fit_one(x, y)        # make the model learn

metric

Accuracy: 89.20%

In [15]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving spam.csv to spam.csv
User uploaded file "spam.csv" with length 503663 bytes


In [16]:
import pandas as pd 
import numpy as np


In [18]:
message=pd.read_csv('spam.csv',encoding='latin-1')

In [19]:
message.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [20]:
del message['Unnamed: 2']

In [21]:
del message['Unnamed: 3']
del message['Unnamed: 4']

In [22]:
message.shape

(5572, 2)

In [23]:
from sklearn.model_selection import train_test_split
message_train,message_test=train_test_split(message)

In [24]:
message_train

Unnamed: 0,v1,v2
1843,ham,Super da:)good replacement for murali
2545,ham,So are you guys asking that i get that slipper...
898,ham,"Thursday night? Yeah, sure thing, we'll work i..."
813,ham,I borrow ur bag ok.
720,ham,Oh is it? Send me the address
...,...,...
5416,ham,My slave! I want you to take 2 or 3 pictures o...
3441,spam,Save money on wedding lingerie at www.bridal.p...
5223,ham,If I die I want u to have all my stuffs.
650,ham,Thats cool! Sometimes slow and gentle. Sonetim...


In [25]:

# Convert dataframe to list of tuples
messages_train = message_train.to_records(index=False)
messages_test=message_test.to_records(index=False)

In [26]:
messages_test

rec.array([('ham', 'Then i buy.'),
           ('ham', 'The evo. I just had to download flash. Jealous?'),
           ('ham', 'Yup i thk they r e teacher said that will make my face look longer. Darren ask me not 2 cut too short.'),
           ..., ('ham', 'Otherwise had part time job na-tuition..'),
           ('ham', 'And of course you should make a stink!'),
           ('ham', "No it's waiting in e car dat's bored wat. Cos wait outside got nothing 2 do. At home can do my stuff or watch tv wat.")],
          dtype=[('v1', 'O'), ('v2', 'O')])

In [27]:
# Creating the pipeline
# 1st function is creating the TFIDF
# 2nd function is the naive bayes predictor
import math
from creme import compose
from creme import feature_extraction
from creme import naive_bayes
import creme
model = compose.Pipeline(
    ('tokenize', feature_extraction.TFIDF(lowercase=False)),
    ('nb', naive_bayes.MultinomialNB(alpha=1))
)

In [28]:
from creme import metrics
metric=metrics.Accuracy()
# Training the model row by row
for label,sentence in messages_train:
    model = model.fit_one(sentence, label)
    y_pred = model.predict_one(sentence)
    metric = metric.update(label, y_pred)

In [29]:

### Training Data Accuracy
metric

Accuracy: 95.48%

In [30]:
### test Data Accuracy
test_metric=metrics.Accuracy()
for label,sentence in messages_test:
    y_pred = model.predict_one(sentence)
    test_metric = metric.update(label, y_pred)

In [31]:

### test Metric
test_metric

Accuracy: 95.64%

In [32]:

model.fit_one("This guy is neutral", "ham")
model.fit_one("Everybody is neutral ", "ham")

Pipeline (
  TFIDF (
    normalize=True
    on=None
    strip_accents=True
    lowercase=False
    preprocessor=None
    tokenizer=<built-in method findall of _sre.SRE_Pattern object at 0x7f792feb15a8>
    ngram_range=(1, 1)
  ),
  MultinomialNB (
    alpha=1
  )
)

In [33]:
### test Metric
test_metric


Accuracy: 95.64%

In [34]:
### Training Data Accuracy
metric

Accuracy: 95.64%