# Examples of Session 2: Data Preprocessing

### E1. getting familiar with *pandas* library

Review the following 10-minute introduction to *pandas*:

http://pandas.pydata.org/pandas-docs/stable/10min.html

### E2. imputing missing values

In [1]:
import numpy as np
from sklearn.impute import SimpleImputer

# the following data has missing values, marked with np.nan ("Not a Number")
X = [[np.nan, 1], [7, np.nan], [6, 8]]

# initialize the imputer object, asking it to replace every missing value
# with the mean of its column (SimpleImputer always imputes column-wise).
# NOTE: sklearn.preprocessing.Imputer was deprecated in scikit-learn 0.20 and
# removed in 0.22; sklearn.impute.SimpleImputer is its replacement.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')

# fit and transform to impute missing values
# (both steps could be combined using the fit_transform function)
imp.fit(X)
print(imp.transform(X))

[[ 6.5  1. ]
 [ 7.   4.5]
 [ 6.   8. ]]


### E3. standardization

In [7]:
from sklearn import preprocessing

# un-standardized input, laid out one row per line (3 x 3 float array)
X = np.array([
    [1., -2.,  2.],
    [3.,  0.,  1.],
    [0.,  4., -2.],
])
print(X)

[[ 1. -2.  2.]
 [ 3.  0.  1.]
 [ 0.  4. -2.]]


In [8]:
# standardize: shift and rescale every column so that it ends up with
# a mean of 0 and a standard deviation of 1
X_scaled = preprocessing.scale(X)
print(X_scaled)

[[-0.26726124 -1.06904497  0.98058068]
 [ 1.33630621 -0.26726124  0.39223227]
 [-1.06904497  1.33630621 -1.37281295]]


In [9]:
# check mean: every column should average to 0
# (tiny values such as 7e-17 are floating-point round-off)
X_scaled.mean(axis=0)

array([  7.40148683e-17,   7.40148683e-17,   0.00000000e+00])

In [10]:
# check standard deviation: every column should have a standard deviation of 1
X_scaled.std(axis=0)

array([ 1.,  1.,  1.])

In [11]:
# raw data whose columns are not yet on a common scale
X = [
    [1., -2.,  2.],
    [3.,  0.,  1.],
    [0.,  4., -2.],
]

# min-max scaling: linearly rescale every column into a fixed range
# between a min and a max value (0 and 1 by default)
min_max_scaler = preprocessing.MinMaxScaler()
X_train_minmax = min_max_scaler.fit_transform(X)
X_train_minmax

array([[ 0.33333333,  0.        ,  1.        ],
       [ 1.        ,  0.33333333,  0.75      ],
       [ 0.        ,  1.        ,  0.        ]])

### E4. normalization

In [6]:
from sklearn import preprocessing

# data whose rows do not have an L2 norm of 1
X = [
    [1., -2.,  2.],
    [3.,  0.,  1.],
    [0.,  4., -2.],
]

# rescale every row (sample) independently so that its L2 norm becomes 1
X_normalized = preprocessing.normalize(X, norm='l2')
X_normalized

array([[ 0.33333333, -0.66666667,  0.66666667],
       [ 0.9486833 ,  0.        ,  0.31622777],
       [ 0.        ,  0.89442719, -0.4472136 ]])

### E5. encoding categorical features

In [1]:
from sklearn import preprocessing

# categorical data with no ordering semantics (one sample per row)
X = [
    [1, 0, 3],
    [0, 1, 0],
    [0, 0, 1],
    [1, 2, 2],
]

# one-hot encoding turns a categorical feature with m different categories
# into m binary features, exactly one of which is active for each sample
enc = preprocessing.OneHotEncoder()
enc.fit(X)

# layout of the encoded sample below:
#   feature 1 (categories 0-1) -> first 2 bits
#   feature 2 (categories 0-2) -> next 3 bits
#   feature 3 (categories 0-3) -> last 4 bits
encoded = enc.transform([[0, 1, 3]])
print(encoded.toarray())

[[ 1.  0.  0.  1.  0.  0.  0.  0.  1.]]


### E6. text feature extraction using gensim library [in class]

In [2]:
# taken from the gensim documentation
# modified for pedagogical purposes by Omid Shahmirzadi

# you need to install gensim for this example:
from gensim import corpora, models

from collections import defaultdict
from pprint import pprint

# corpus that will be turned into a bag-of-words representation
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

# lower-case and whitespace-tokenize each document, dropping common words
stoplist = set('for a of the and to in'.split())
texts = []
for document in documents:
    kept = [word for word in document.lower().split() if word not in stoplist]
    texts.append(kept)

# count how often every remaining token occurs across the whole corpus ...
frequency = defaultdict(int)
for tokens in texts:
    for token in tokens:
        frequency[token] += 1

# ... and discard the tokens that appear only once
texts = [[token for token in tokens if frequency[token] > 1] for tokens in texts]

# pretty-print the cleaned, tokenized corpus
pprint(texts)

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]


In [3]:
# build the dictionary (mapping from each unique token to an integer id)
# from the tokenized corpus
dictionary = corpora.Dictionary(texts)
print(dictionary)

Dictionary(12 unique tokens: ['system', 'human', 'user', 'survey', 'computer']...)


In [4]:
# show the token -> integer id mapping held by the dictionary
print(dictionary.token2id)

{'system': 3, 'human': 0, 'user': 4, 'survey': 6, 'computer': 2, 'eps': 8, 'response': 5, 'trees': 9, 'time': 7, 'interface': 1, 'minors': 11, 'graph': 10}


In [5]:
# convert a new document to its bag-of-words representation:
# a list of (token id, count) pairs
new_doc = "Human computer interaction"
new_vec = dictionary.doc2bow(new_doc.lower().split())
print(new_vec)  # the word "interaction" does not appear in the dictionary and is ignored

[(0, 1), (2, 1)]


In [6]:
# convert all documents to their bag-of-words representation,
# i.e. (token id, term frequency) pairs
corpus = [dictionary.doc2bow(text) for text in texts]
for c in corpus:
    print(c)

[(0, 1), (1, 1), (2, 1)]
[(2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]
[(1, 1), (3, 1), (4, 1), (8, 1)]
[(0, 1), (3, 2), (8, 1)]
[(4, 1), (5, 1), (7, 1)]
[(9, 1)]
[(9, 1), (10, 1)]
[(9, 1), (10, 1), (11, 1)]
[(6, 1), (10, 1), (11, 1)]


In [7]:
# convert the above corpus to its corresponding tf-idf representation
# (L2 normalization is done implicitly, so each document vector has unit length)
tfidf = models.TfidfModel(corpus)
for c in corpus:
    print(tfidf[c])

[(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)]
[(2, 0.44424552527467465), (3, 0.3244870206138554), (4, 0.3244870206138554), (5, 0.44424552527467465), (6, 0.44424552527467465), (7, 0.44424552527467465)]
[(1, 0.5710059809418182), (3, 0.4170757362022777), (4, 0.4170757362022777), (8, 0.5710059809418182)]
[(0, 0.49182558987264147), (3, 0.7184811607083769), (8, 0.49182558987264147)]
[(4, 0.45889394536615247), (5, 0.6282580468670046), (7, 0.6282580468670046)]
[(9, 1.0)]
[(9, 0.7071067811865475), (10, 0.7071067811865475)]
[(9, 0.5080429008916749), (10, 0.5080429008916749), (11, 0.695546419520037)]
[(6, 0.6282580468670046), (10, 0.45889394536615247), (11, 0.6282580468670046)]
