### Data transformation and representation
- Numerical data: 
    - normalization
    - standardization
- Categorical data: 
    - one-hot encoding
    - ordinal encoding
- Text: 
    - tokenization 
    - stemming and lemmatization 
    - CountVectorizer/TfidfVectorizer
    - Embedding

### Numerical Data

#### normalization with MinMax scaler: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html

In [1]:
import sklearn
from sklearn.preprocessing import MinMaxScaler

In [2]:
data = [[-1, 2], 
        [-0.5, 6], 
        [0, 10], 
        [1, 18]]

In [3]:
scaler = MinMaxScaler(feature_range=(0,1)) # feature_range=(0,1) by default
scaler.fit(data)

In [4]:
scaler.data_min_, scaler.data_max_

(array([-1.,  2.]), array([ 1., 18.]))

In [5]:
# transform data from original range to [0,1]
scaler.transform(data)

array([[0.  , 0.  ],
       [0.25, 0.25],
       [0.5 , 0.5 ],
       [1.  , 1.  ]])

In [6]:
scaler.transform([[2, 2]])

array([[1.5, 0. ]])

#### StandardScaler: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
- **self-practice**

In [7]:
from sklearn.preprocessing import StandardScaler

# Create and fit StandardScaler
std_scaler = StandardScaler()
std_scaler.fit(data)

# Transform data 
print("Transformed data:")
print(std_scaler.transform(data))

print("\nTransform new data point:")
print(std_scaler.transform([[2, 2]]))


Transformed data:
[[-1.18321596 -1.18321596]
 [-0.50709255 -0.50709255]
 [ 0.16903085  0.16903085]
 [ 1.52127766  1.52127766]]

Transform new data point:
[[ 2.87352447 -1.18321596]]


### Categorical data

#### One-hot encoder: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

In [8]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [9]:
df_gender = pd.DataFrame([['Male'], ['Female'], ['Female']],columns=['gender'])
df_gender

Unnamed: 0,gender
0,Male
1,Female
2,Female


In [10]:
oh_enc = OneHotEncoder(handle_unknown='ignore')

In [11]:
oh_enc.fit(df_gender[['gender']])

In [12]:
oh_enc.transform(df_gender[['gender']]).toarray()

array([[0., 1.],
       [1., 0.],
       [1., 0.]])

In [13]:
oh_enc.categories_

[array(['Female', 'Male'], dtype=object)]

In [14]:
oh_enc.get_feature_names_out(['gender'])

array(['gender_Female', 'gender_Male'], dtype=object)

In [15]:
pd.DataFrame(oh_enc.transform(df_gender[['gender']]).toarray().astype(int),columns=oh_enc.categories_)

Unnamed: 0,Female,Male
0,0,1
1,1,0
2,1,0


In [16]:
oh_enc.transform([['Female'], ['Male']]).toarray()



array([[1., 0.],
       [0., 1.]])

In [17]:
oh_enc.inverse_transform([[0, 1], [1, 0]])

array([['Male'],
       ['Female']], dtype=object)

#### Ordinal encoder: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html
- **self-practice**

In [18]:
from sklearn.preprocessing import OrdinalEncoder

# Create sample data with ordinal categories
df_education = pd.DataFrame({
    'education': ['High School', 'Bachelor', 'Master', 'PhD']
})

# Create and fit ordinal encoder
ord_enc = OrdinalEncoder()
ord_enc.fit(df_education[['education']])

# Transform the data
encoded_education = ord_enc.transform(df_education[['education']])
print("Encoded education levels:")
print(encoded_education)

# Inverse transform to get original categories
print("\nOriginal categories:")
print(ord_enc.inverse_transform(encoded_education))


Encoded education levels:
[[1.]
 [0.]
 [2.]
 [3.]]

Original categories:
[['High School']
 ['Bachelor']
 ['Master']
 ['PhD']]


### Text data
- Natural Language Toolkit (nltk): https://www.nltk.org/

#### tokenization

In [19]:
import nltk
from nltk import sent_tokenize
from nltk import word_tokenize

In [29]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')


[nltk_data] Downloading package punkt to /Users/rezek_zhu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rezek_zhu/nltk_data...


True

In [23]:
nltk.sent_tokenize("At eight o'clock on Thursday morning. Arthur didn't feel very good.")

["At eight o'clock on Thursday morning.", "Arthur didn't feel very good."]

In [24]:
nltk.word_tokenize("At eight o'clock on Thursday morning. Arthur didn't feel very good.")

['At',
 'eight',
 "o'clock",
 'on',
 'Thursday',
 'morning',
 '.',
 'Arthur',
 'did',
 "n't",
 'feel',
 'very',
 'good',
 '.']

#### stemming/lemmatization

In [26]:
# stemming: rule-based methods, chop off inflections
from nltk.stem import PorterStemmer
ps = PorterStemmer()
word = ("leaves") 
ps.stem(word)

'leav'

In [32]:
# lemmatization: lexical-based knowledge, get the base form
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize("leaves")

'leaf'

#### CountVectorizer: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

In [33]:
from sklearn.feature_extraction.text import CountVectorizer

In [34]:
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

In [35]:
vectorizer = CountVectorizer() 
# self-practice: explore different parameter settings for CountVectorizer: 
#  E.g., lowercase, stop_words, ngram_range(), max_df, min_df, binary, use_idf, smooth_idf

In [36]:
X = vectorizer.fit_transform(corpus)
# explore other parameter settings: lowercase, stop_words, ngram_range(), max_df, min_df, binary
X.shape

(4, 9)

In [37]:
vectorizer.get_feature_names_out()

array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third',
       'this'], dtype=object)

In [38]:
pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

Unnamed: 0,and,document,first,is,one,second,the,third,this
0,0,1,1,1,0,0,1,0,1
1,0,2,0,1,0,1,1,0,1
2,1,0,0,1,1,0,1,1,1
3,0,1,1,1,0,0,1,0,1


#### TfidfVectorizer:  https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
- **self-practice**

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the corpus
X_tfidf = tfidf_vectorizer.fit_transform(corpus)

# Get feature names and create DataFrame
pd.DataFrame(X_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())


Unnamed: 0,and,document,first,is,one,second,the,third,this
0,0.0,0.469791,0.580286,0.384085,0.0,0.0,0.384085,0.0,0.384085
1,0.0,0.687624,0.0,0.281089,0.0,0.538648,0.281089,0.0,0.281089
2,0.511849,0.0,0.0,0.267104,0.511849,0.0,0.267104,0.511849,0.267104
3,0.0,0.469791,0.580286,0.384085,0.0,0.0,0.384085,0.0,0.384085


#### Word embedding: word2vec

In [40]:
import numpy as np

In [42]:
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')

[=-------------------------------------------------] 3.1% 51.8/1662.8MB downloaded

KeyboardInterrupt: 

In [54]:
np.round(wv['king'] - wv['man'],2)

array([-0.2 , -0.1 , -0.03,  0.22, -0.12,  0.01,  0.31, -0.21, -0.09,
        0.36, -0.27, -0.05, -0.09,  0.13, -0.07, -0.21,  0.13, -0.05,
        0.02,  0.23,  0.27,  0.11,  0.03,  0.41,  0.04,  0.01, -0.17,
       -0.2 , -0.03,  0.01, -0.06, -0.07,  0.19,  0.42, -0.23, -0.17,
       -0.21,  0.13, -0.  , -0.15,  0.11, -0.19,  0.07,  0.15,  0.12,
        0.11, -0.03,  0.02, -0.  , -0.02, -0.03,  0.03, -0.32,  0.2 ,
       -0.24,  0.09, -0.07, -0.01, -0.04, -0.01,  0.01,  0.14,  0.05,
        0.15,  0.01, -0.18, -0.07, -0.02, -0.25,  0.31,  0.04, -0.09,
        0.1 ,  0.05,  0.09, -0.11, -0.12,  0.11,  0.15, -0.08, -0.16,
        0.18, -0.11,  0.17,  0.28, -0.06, -0.01,  0.03, -0.02,  0.09,
        0.38, -0.11, -0.19, -0.14, -0.09,  0.06,  0.11, -0.07,  0.04,
       -0.08,  0.17,  0.21, -0.13, -0.24, -0.51,  0.31, -0.64, -0.01,
        0.14,  0.13,  0.24,  0.22,  0.01, -0.12,  0.33, -0.23,  0.08,
       -0.18,  0.12,  0.07,  0.12,  0.13,  0.06, -0.06,  0.19, -0.02,
        0.05, -0.02,

In [55]:
np.round(wv['queen'] - wv['woman'], 2)

array([-0.24, -0.07,  0.03,  0.23,  0.01,  0.02,  0.04, -0.24, -0.31,
        0.01,  0.02, -0.04,  0.17,  0.12,  0.16,  0.02,  0.26,  0.01,
        0.  ,  0.12,  0.22,  0.18, -0.18,  0.29,  0.06,  0.18, -0.14,
       -0.17, -0.16,  0.04, -0.3 , -0.12,  0.14,  0.32, -0.24, -0.25,
       -0.21,  0.31,  0.  , -0.02,  0.26, -0.39,  0.11, -0.  ,  0.05,
       -0.02,  0.07,  0.11,  0.19,  0.12, -0.07,  0.1 , -0.  , -0.07,
       -0.09,  0.01, -0.25, -0.1 , -0.09, -0.11, -0.16,  0.1 , -0.  ,
        0.03,  0.13, -0.3 , -0.11, -0.03, -0.38,  0.19, -0.02, -0.  ,
        0.03,  0.02, -0.08, -0.1 ,  0.11,  0.06, -0.06, -0.21, -0.31,
        0.12,  0.1 ,  0.04,  0.52, -0.11, -0.09,  0.06,  0.08,  0.14,
        0.59, -0.03, -0.03, -0.04,  0.07,  0.03,  0.01, -0.08, -0.03,
        0.03,  0.15,  0.13,  0.13, -0.03, -0.62,  0.22, -0.65,  0.11,
       -0.12,  0.04,  0.26,  0.1 , -0.03,  0.07,  0.36,  0.09,  0.25,
       -0.16,  0.12, -0.07,  0.04, -0.14,  0.19, -0.19,  0.24, -0.14,
       -0.14,  0.13,

In [56]:
sklearn.metrics.pairwise.cosine_similarity([wv['king'] - wv['man']], [wv['queen'] - wv['woman']])

array([[0.7580351]], dtype=float32)