In [1]:
# Extracting features from categorical variables

In [2]:
from sklearn.feature_extraction import DictVectorizer

In [3]:
# Vectorizer that converts a list of {feature: value} dicts into a numeric matrix.
onehot_encoder = DictVectorizer()

In [4]:
# Three samples, each described by a single categorical feature, 'city'.
X = [{'city': name} for name in ('New York', 'San Francisco', 'Chapel Hill')]

In [5]:
# Show the container type of the samples (displayed as the cell's output).
type(X)

list

In [6]:
# Fit the encoder on the samples and print the resulting matrix as a dense
# array: one row per sample, one column per distinct city value.
print(onehot_encoder.fit_transform(X).toarray())

[[0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]]


In [7]:
# Standardizing features

In [8]:
from sklearn import preprocessing
import numpy as np

In [9]:
# Small 3x6 float matrix used to demonstrate standardization.
X = np.array(
    [[0, 0, 5, 13, 9, 1],
     [0, 0, 13, 15, 10, 15],
     [0, 3, 15, 2, 0, 11]],
    dtype=np.float64,
)

In [10]:
# Number of array dimensions of X (displayed as the cell's output).
X.ndim

2

In [11]:
# Standardize X with scikit-learn's preprocessing.scale and print the result.
print(preprocessing.scale(X))

[[ 0.         -0.70710678 -1.38873015  0.52489066  0.59299945 -1.35873244]
 [ 0.         -0.70710678  0.46291005  0.87481777  0.81537425  1.01904933]
 [ 0.          1.41421356  0.9258201  -1.39970842 -1.4083737   0.33968311]]


In [12]:
# Extracting features from text

In [13]:
# The bag-of-words model

In [14]:
# Two short documents that share part of their vocabulary.
corpus = ['UNC played Duke in basketball', 'Duke lost the basketball game']

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary of the corpus and count token occurrences per document.
vectorizer = CountVectorizer()
# toarray() yields a plain ndarray; todense() would return the legacy
# np.matrix type, which NumPy no longer recommends.
print(vectorizer.fit_transform(corpus).toarray())
# Mapping from each token to its column index in the count matrix.
print(vectorizer.vocabulary_)

[[1 1 0 1 0 1 0 1]
 [1 1 1 0 1 0 1 0]]
{'unc': 7, 'played': 5, 'duke': 1, 'in': 3, 'basketball': 0, 'lost': 4, 'the': 6, 'game': 2}


In [16]:
# Add a third, unrelated document. The membership guard keeps this cell
# idempotent: re-running it does not append the sentence a second time.
if 'I ate a sandwich' not in corpus:
    corpus.append('I ate a sandwich')
# Re-fit on the extended corpus; toarray() gives a plain ndarray rather than
# the legacy np.matrix returned by todense().
print(vectorizer.fit_transform(corpus).toarray())
print(vectorizer.vocabulary_)

[[0 1 1 0 1 0 1 0 0 1]
 [0 1 1 1 0 1 0 0 1 0]
 [1 0 0 0 0 0 0 1 0 0]]
{'unc': 9, 'played': 6, 'duke': 2, 'in': 4, 'basketball': 1, 'lost': 5, 'the': 8, 'game': 3, 'ate': 0, 'sandwich': 7}


In [18]:
from sklearn.metrics.pairwise import euclidean_distances
# Dense count matrix as a plain ndarray. Recent scikit-learn releases reject
# np.matrix input (the type returned by todense()) with a TypeError.
X = vectorizer.fit_transform(corpus).toarray()
# euclidean_distances expects 2-D inputs, so each document is selected with a
# list index (X[[i]]) to keep it as a 1-row matrix instead of a 1-D vector.
print('Distance between 1st and 2nd documents:', euclidean_distances(X[[0]], X[[1]]))
print('Distance between 1st and 3rd documents:', euclidean_distances(X[[0]], X[[2]]))
print('Distance between 2nd and 3rd documents:', euclidean_distances(X[[1]], X[[2]]))


Distance between 1st and 2nd documents: [[2.44948974]]
Distance between 1st and 3rd documents: [[2.64575131]]
Distance between 2nd and 3rd documents: [[2.64575131]]


In [19]:
# Stop word filtering

In [20]:
# Same bag-of-words encoding, but with English stop words removed from the
# vocabulary.
vectorizer = CountVectorizer(stop_words='english')
# toarray() gives a plain ndarray rather than the legacy np.matrix returned
# by todense().
print(vectorizer.fit_transform(corpus).toarray())
print(vectorizer.vocabulary_)

[[0 1 1 0 0 1 0 1]
 [0 1 1 1 1 0 0 0]
 [1 0 0 0 0 0 1 0]]
{'unc': 7, 'played': 5, 'duke': 2, 'basketball': 1, 'lost': 4, 'game': 3, 'ate': 0, 'sandwich': 6}


In [21]:
# Stemming and lemmatization

In [22]:
# Two sentences that use different inflections of the same words.
corpus = ['He ate the sandwiches', 'Every sandwich was eaten by him']

In [23]:
# Binary bag-of-words (presence/absence rather than counts) with English stop
# words removed.
vectorizer = CountVectorizer(binary=True, stop_words='english')
# toarray() gives a plain ndarray rather than the legacy np.matrix returned
# by todense().
print(vectorizer.fit_transform(corpus).toarray())
print(vectorizer.vocabulary_)

[[1 0 0 1]
 [0 1 1 0]]
{'ate': 0, 'sandwiches': 3, 'sandwich': 2, 'eaten': 1}


In [24]:
# 'gathering' appears in both sentences below.
corpus = ['I am gathering ingredients for the sandwich.',
          'There were many wizards at the gathering.']

In [25]:
from nltk.stem.wordnet import WordNetLemmatizer

In [26]:
lemmatizer = WordNetLemmatizer()
# Lemmatize the same word first as a verb ('v'), then as a noun ('n'),
# printing one result per line.
print(*(lemmatizer.lemmatize('gathering', pos) for pos in ('v', 'n')), sep='\n')

gather
gathering


In [27]:
from nltk.stem import PorterStemmer
# Stem the same word with NLTK's Porter stemmer and print the result.
stemmer = PorterStemmer()
print(stemmer.stem('gathering'))

gather


In [28]:
# Extending bag-of-words with tf-idf weights

In [29]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [30]:
# A single document in which some words occur more than once.
corpus = [
    'The dog ate a sandwich, the wizard transfigured a sandwich, '
    'and I ate a sandwich'
]

In [31]:
vectorizer = CountVectorizer(stop_words='english')
# The corpus holds a single document, so row 0 is its token-count vector.
# toarray() already returns an ndarray, so no np.array(...todense()) round-trip
# through np.matrix is needed.
frequencies = vectorizer.fit_transform(corpus).toarray()[0]
print(frequencies)
# vocabulary_ maps each token to its index in `frequencies`.
print('Token indices %s' % vectorizer.vocabulary_)

[2 1 3 1 1]
Token indices {'dog': 1, 'ate': 0, 'sandwich': 2, 'wizard': 4, 'transfigured': 3}


In [32]:
# Report the count of every token, looking it up through its column index.
for token in vectorizer.vocabulary_:
    index = vectorizer.vocabulary_[token]
    print('The token "%s" appears %s times' % (token, frequencies[index]))

The token "dog" appears 1 times
The token "ate" appears 2 times
The token "sandwich" appears 3 times
The token "wizard" appears 1 times
The token "transfigured" appears 1 times


In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [34]:
# Two documents; 'sandwich' occurs in both of them.
corpus = ['The dog ate a sandwich and I ate a sandwich',
          'The wizard transfigured a sandwich']

In [35]:
# tf-idf weighted bag-of-words with English stop words removed.
vectorizer = TfidfVectorizer(stop_words='english')
# toarray() gives a plain ndarray rather than the legacy np.matrix returned
# by todense().
print(vectorizer.fit_transform(corpus).toarray())

[[0.75458397 0.37729199 0.53689271 0.         0.        ]
 [0.         0.         0.44943642 0.6316672  0.6316672 ]]


In [36]:
# Extracting features from pixel intensities

In [37]:
from sklearn import datasets

In [38]:
# Load scikit-learn's bundled digits dataset.
digits = datasets.load_digits()

In [39]:
# Target value (label) of the first sample.
print('Digit: %s' % digits.target[0])

Digit: 0


In [40]:
# Image array of the first sample (prints as an 8x8 grid of intensities).
print(digits.images[0])

[[ 0.  0.  5. 13.  9.  1.  0.  0.]
 [ 0.  0. 13. 15. 10. 15.  5.  0.]
 [ 0.  3. 15.  2.  0. 11.  8.  0.]
 [ 0.  4. 12.  0.  0.  8.  8.  0.]
 [ 0.  5.  8.  0.  0.  9.  8.  0.]
 [ 0.  4. 11.  0.  1. 12.  7.  0.]
 [ 0.  2. 14.  5. 10. 12.  0.  0.]
 [ 0.  0.  6. 13. 10.  0.  0.  0.]]


In [41]:
# Flatten the first image into a single row of 64 values; reshape(-1, 64)
# keeps the result 2-D, with the row count inferred from the data.
print('Feature vector:\n %s' % digits.images[0].reshape(-1, 64))

Feature vector:
 [[ 0.  0.  5. 13.  9.  1.  0.  0.  0.  0. 13. 15. 10. 15.  5.  0.  0.  3.
  15.  2.  0. 11.  8.  0.  0.  4. 12.  0.  0.  8.  8.  0.  0.  5.  8.  0.
   0.  9.  8.  0.  0.  4. 11.  0.  1. 12.  7.  0.  0.  2. 14.  5. 10. 12.
   0.  0.  0.  0.  6. 13. 10.  0.  0.  0.]]
