In [1]:
import re
import string
import pandas as pd
from math import log
from functools import reduce
import numpy as np

In [2]:
measurements = [{'city': 'Dubai', 'temperature': 33.0},
                {'city': 'London', 'temperature':12.0},
               {'city': 'San Francisco', 'temperature': 18.0},
               {'city': 'India', 'temperature': 28.0}
               ]
measurements

[{'city': 'Dubai', 'temperature': 33.0},
 {'city': 'London', 'temperature': 12.0},
 {'city': 'San Francisco', 'temperature': 18.0},
 {'city': 'India', 'temperature': 28.0}]

In [3]:
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer()
vec

DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
               sparse=True)

In [4]:
vec.fit_transform(measurements).toarray()

array([[ 1.,  0.,  0.,  0., 33.],
       [ 0.,  0.,  1.,  0., 12.],
       [ 0.,  0.,  0.,  1., 18.],
       [ 0.,  1.,  0.,  0., 28.]])

In [5]:
vec.get_feature_names()

['city=Dubai',
 'city=India',
 'city=London',
 'city=San Francisco',
 'temperature']

In [6]:
vec.feature_names_

['city=Dubai',
 'city=India',
 'city=London',
 'city=San Francisco',
 'temperature']

In [7]:
vec.vocabulary_

{'city=Dubai': 0,
 'temperature': 4,
 'city=London': 2,
 'city=San Francisco': 3,
 'city=India': 1}

In [8]:
df = pd.DataFrame(measurements)
df

Unnamed: 0,city,temperature
0,Dubai,33.0
1,London,12.0
2,San Francisco,18.0
3,India,28.0


In [9]:
pd.get_dummies(df)

Unnamed: 0,temperature,city_Dubai,city_India,city_London,city_San Francisco
0,33.0,1,0,0,0
1,12.0,0,0,1,0
2,18.0,0,0,0,1
3,28.0,0,1,0,0


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
pos_windows = [
    {
        'word-2':'the',
        'pos-2': 'DT',
        'world-1': 'Cat',
        'pos-1':'NN',
        'word+1': 'on',
        'pos+1': 'pp'
    }
]

In [14]:
vec = DictVectorizer()
vec

DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
               sparse=True)

In [16]:
pos_vectorization = vec.fit_transform(pos_windows)
pos_vectorization.toarray()

array([[1., 1., 1., 1., 1., 1.]])

In [17]:
vec.get_feature_names()

['pos+1=pp', 'pos-1=NN', 'pos-2=DT', 'word+1=on', 'word-2=the', 'world-1=Cat']

In [18]:
vec.vocabulary_

{'word-2=the': 4,
 'pos-2=DT': 2,
 'world-1=Cat': 5,
 'pos-1=NN': 1,
 'word+1=on': 3,
 'pos+1=pp': 0}

In [19]:
ord('-')

45