# Defining applications for data science

scikit-learn developer's guide: http://scikit-learn.org/stable/developers/<BR>
scikit-learn FAQ: http://scikit-learn.org/stable/faq.html<BR>

In [1]:
from sklearn.datasets import load_boston
boston = load_boston()
X, y = boston.data,boston.target
print X.shape, y.shape

(506, 13) (506,)


In [2]:
print X[1], y[0]

[  2.73100000e-02   0.00000000e+00   7.07000000e+00   0.00000000e+00
   4.69000000e-01   6.42100000e+00   7.89000000e+01   4.96710000e+00
   2.00000000e+00   2.42000000e+02   1.78000000e+01   3.96900000e+02
   9.14000000e+00] 24.0


In [5]:
from sklearn.linear_model import LinearRegression
# Build the linear model with the normalize option enabled.
hypothesis = LinearRegression(normalize=True)
# Fit on the full dataset; as the last expression, the fitted estimator is echoed as the cell output.
hypothesis.fit(X,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=True)

In [6]:
# Inspect the fitted coefficients (one value per predictor column).
print hypothesis.coef_

[ -1.07170557e-01   4.63952195e-02   2.08602395e-02   2.68856140e+00
  -1.77957587e+01   3.80475246e+00   7.51061703e-04  -1.47575880e+00
   3.05655038e-01  -1.23293463e-02  -9.53463555e-01   9.39251272e-03
  -5.25466633e-01]


In [7]:
# Sanity check: the number of coefficients should match the 13 columns of X.
print len(hypothesis.coef_)

13


In [10]:
# Score the model on the very data it was fitted on (scikit-learn documents
# score() for regressors as the R^2 coefficient of determination).
print hypothesis.score(X,y)

0.740607742865


In [11]:
import numpy as np
new_observation = np.array([1,0,1,0,0.5,7,59,6,3,200,20,350,4],dtype=float)
print len(new_observation)
print hypothesis.predict(new_observation)

13
[ 25.8972784]




In [12]:
# Same score as before, shown as the cell's output value instead of through
# print, which displays the float at full precision.
hypothesis.score(X,y)

0.74060774286494313

In [None]:
#help(LinearRegression)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(X)
print scaler.transform(new_observation)

# Performing the Hashing Trick

## Using hash functions

In [13]:
# The built-in hash() turns a string into a (possibly negative) integer.
print hash('Python')
# Taking the absolute value modulo 1000 folds that integer into the range 0..999.
print abs(hash('Python')) % 1000

-2359742753373747800
800


## Demonstrating the hashing trick

In [14]:
# Two short example sentences that share the words 'Python' and 'for'.
string_1 = 'Python for data science'
string_2 = 'Python for machine learning'

def hashing_trick(input_string, vector_size=20):
    """Encode a text as a fixed-length binary vector using the hashing trick.

    Every whitespace-separated word is hashed with the built-in ``hash`` and
    folded into one of ``vector_size`` slots. A slot is set to 1 when at least
    one word lands in it, so colliding words simply share a slot.

    Parameters
    ----------
    input_string : str
        Text to encode. Empty or whitespace-only text gives an all-zero vector.
    vector_size : int, optional
        Length of the returned vector (default 20). Must be positive.

    Returns
    -------
    list of int
        ``vector_size`` values, each 0 or 1.

    Raises
    ------
    ValueError
        If ``vector_size`` is not positive.

    Notes
    -----
    On Python 3 the built-in ``hash`` of a string is salted per process
    (see PYTHONHASHSEED), so slot positions are only stable within one
    interpreter session there.
    """
    if vector_size <= 0:
        raise ValueError("vector_size must be a positive integer")
    feature_vector = [0] * vector_size
    # split() without an argument splits on any run of whitespace and never
    # yields empty tokens. split(' ') produced '' for empty or double-spaced
    # input, and hash('') == 0 then wrongly switched on slot 0.
    for word in input_string.split():
        index = abs(hash(word)) % vector_size
        feature_vector[index] = 1
    return feature_vector

print hashing_trick(input_string='Python for data science', vector_size=20)
print hashing_trick(input_string='Python for machine learning', vector_size=20)

[1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]
[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]


In [15]:
from scipy.sparse import csc_matrix
# Printing a sparse matrix lists only the non-zero entries as (row, column) value.
print csc_matrix([1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0])

  (0, 0)	1
  (0, 5)	1
  (0, 16)	1
  (0, 18)	1


In [16]:
# http://scikit-learn.org/stable/modules/feature_extraction.html
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.HashingVectorizer.html
from sklearn.feature_extraction.text import HashingVectorizer
# Hash text into 20 binary features; transform() is called directly, with no fit step beforehand.
sklearn_hashing_trick = HashingVectorizer(n_features=20, binary=True, norm=None)
hashed_text = sklearn_hashing_trick.transform(['Python for data science','Python for machine learning'])
# Echo the result: a sparse matrix with one row per sentence.
hashed_text

<2x20 sparse matrix of type '<type 'numpy.float64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
# CountVectorizer has to be fitted: fit_transform learns the vocabulary from
# the sentences and encodes them in the same call.
one_hot_enconder = CountVectorizer()
one_hot_enconded = one_hot_enconder.fit_transform(['Python for data science','Python for machine learning'])

In [18]:
# The learned mapping from each word to its column index.
print one_hot_enconder.vocabulary_

{u'machine': 3, u'learning': 2, u'for': 1, u'python': 4, u'science': 5, u'data': 0}


In [19]:
# Previously unseen words are encoded without any refitting; the result still has 20 columns.
sklearn_hashing_trick.transform(['New text has arrived'])

<1x20 sparse matrix of type '<type 'numpy.float64'>'
	with 4 stored elements in Compressed Sparse Row format>

In [20]:
# NOTE(review): fit_transform refits the encoder on this single sentence, so the
# result has 4 columns instead of the 6 learned earlier. If the goal is to encode
# new text with the existing vocabulary, transform() would be the call to use;
# confirm the intent.
one_hot_enconder.fit_transform(['New text has arrived'])

<1x4 sparse matrix of type '<type 'numpy.int64'>'
	with 4 stored elements in Compressed Sparse Row format>

HashingVectorizer is the right tool to use when your data can’t fit
into memory and its set of features isn’t fixed in advance. In other cases,
consider using the more intuitive CountVectorizer.

# Performance testing

In [21]:
# Time a list comprehension over one million integers, letting %timeit pick the loop and repeat counts.
%timeit l = [k for k in range(10**6)]

10 loops, best of 3: 56.3 ms per loop


In [22]:
# Same statement with explicit settings: -n 20 loops per run, -r 5 repeated runs.
%timeit -n 20 -r 5 l = [k for k in range(10**6)]

20 loops, best of 5: 55.9 ms per loop


In [23]:
%%timeit 
# Build the same list with an explicit loop and append(), to compare against the comprehension.
l = list()
for k in range(10**6):
    l.append(k)

10 loops, best of 3: 101 ms per loop


In [24]:
from sklearn.feature_extraction.text import HashingVectorizer, CountVectorizer
# Recreate both vectorizers and keep the sentences in a variable so the timing cells below can share them.
sklearn_hashing_trick = HashingVectorizer(n_features=20, binary=True, norm=None) 
one_hot_enconder = CountVectorizer()
texts = ['Python for data science','Python for machine learning']

In [25]:
# Cost of learning the vocabulary and encoding the texts with CountVectorizer.
%timeit one_hot_enconded = one_hot_enconder.fit_transform(texts)

1000 loops, best of 3: 1.05 ms per loop


In [26]:
# Cost of encoding the same texts with HashingVectorizer (no fitting involved).
%timeit  hashing = sklearn_hashing_trick.transform(texts)

10000 loops, best of 3: 115 µs per loop


In [27]:
import timeit
cumulative_time = timeit.timeit("hashing = sklearn_hashing_trick.transform(texts)", 
                                 "from __main__ import sklearn_hashing_trick, texts", 
                                 number=10000)
print cumulative_time / 10000.0

0.000127188515663


## Memory profiler

In [29]:
# Installation procedures from the command line:
# pip install psutil
# pip install memory_profiler

SyntaxError: invalid syntax (<ipython-input-29-9f1b822f66e5>, line 2)

In [35]:
# Initialization from IPython (to be repeated at every IPython start):
# loading the extension makes the %memit and %mprun magics used below available.
%load_ext memory_profiler

The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler


In [36]:
# Encode the texts as a sparse matrix first (not measured) ...
hashing = sklearn_hashing_trick.transform(texts)
# ... then measure the memory used while converting it to a dense array.
%memit dense_hashing = hashing.toarray()

peak memory: 99.32 MiB, increment: 0.00 MiB


In [37]:
%%writefile example_code.py
from sklearn.feature_extraction.text import HashingVectorizer, CountVectorizer
def comparison_test():
    """Run a CountVectorizer fit and a HashingVectorizer transform on the same text.

    Each step sits on its own line so that a line-by-line profiler can
    attribute its cost separately. Nothing is returned.
    """
    corpus = ['Python for data science','Python for machine learning']
    hasher = HashingVectorizer(n_features=20, binary=True, norm=None)
    counter = CountVectorizer()
    counted = counter.fit_transform(corpus)
    hashed = hasher.transform(corpus)

Overwriting example_code.py


In [38]:
# Import the function from the file written by the %%writefile cell
# (presumably because %mprun needs the function's source to live in a file — confirm).
from example_code import comparison_test
# Line-by-line memory profile: -f names the function to profile, followed by the statement to run.
%mprun -f comparison_test comparison_test()

('',)


# Demonstrating multiprocessing techniques

In [45]:
from sklearn.datasets import load_digits
digits = load_digits()
X, y = digits.data,digits.target
from sklearn.svm import SVC
from sklearn.cross_validation import cross_val_score
print X.shape, y.shape

(1797, 64) (1797,)


In [46]:
# Baseline: 20-fold cross-validation of an SVC with n_jobs=1 (a single worker).
%timeit single_core_learning = cross_val_score(SVC(), X, y, cv=20, n_jobs=1)

1 loops, best of 3: 11.3 s per loop


In [47]:
# Same workload with n_jobs=-1, which scikit-learn documents as "use all available processors".
%timeit multi_core_learning = cross_val_score(SVC(), X, y, cv=20, n_jobs=-1)

1 loops, best of 3: 4.05 s per loop
