# Data Vectorization

## Loading the packages

In [None]:
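# Verify which Python environment the notebook is running in
# !pyenv version
# Uncomment to install the packages this notebook imports
# (%pip targets the kernel's own environment, unlike !pip)
# %pip install pandas
# %pip install numpy
# %pip install matplotlib
# %pip install seaborn
# %pip install nltk
# %pip install gensim
# %pip install scikit-learn
#%pip install --upgrade pip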

In [1]:
# Import packages
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

## Loading the raw CSV files into dataframes

In [2]:
# Read the two unzipped winemag CSV files into separate dataframes.
# The folder is named once so that relocating the data means editing a single line.
RAW_DATA_DIR = "~/code/sabrinaauger/wino/data/raw_data"
wine1_df = pd.read_csv(f"{RAW_DATA_DIR}/winemag-data-130k-v2.csv")
wine2_df = pd.read_csv(f"{RAW_DATA_DIR}/winemag-data_first150k.csv")

In [3]:
# Work on a copy: the cells below add columns to df, and a plain alias would
# write those columns into wine1_df as well, so the raw frame could no longer
# be trusted after a partial re-run.
df = wine1_df.copy()
df.head(2)

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos


## Tokenizing and Vectorizing the descriptions

In [4]:
# Importing
import pandas as pd
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# Split every description into word and punctuation tokens with NLTK and
# store the token lists next to the original text.
tokenized_descriptions = df['description'].apply(word_tokenize)
df['description_tokenized'] = tokenized_descriptions
df.head(2)

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,description_tokenized
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,"[Aromas, include, tropical, fruit, ,, broom, ,..."
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,"[This, is, ripe, and, fruity, ,, a, wine, that..."


In [5]:
# Fit a gensim Word2Vec model on the tokenized descriptions
# (algorithm background: https://code.google.com/archive/p/word2vec/).
model = Word2Vec(
    sentences=df['description_tokenized'],
    vector_size=100,  # length of each word embedding
    window=5,
    min_count=1,  # keep every token, including those seen only once
    workers=4,
)


In [7]:
# Collapse one tokenized description into a single fixed-length vector by
# averaging the embeddings of its tokens.
def get_sentence_vector(sentence, model):
    """Return the mean word embedding of the tokens in ``sentence``.

    Parameters
    ----------
    sentence : iterable of str
        Tokens of one description. Tokens that are not in the model's
        vocabulary are skipped.
    model : gensim.models.Word2Vec
        Fitted model; only ``model.wv`` and ``model.vector_size`` are read.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``model.vector_size``. It is all zeros when none
        of the tokens is in the vocabulary (including an empty ``sentence``).
    """
    known_vectors = [model.wv[word] for word in sentence if word in model.wv.key_to_index]

    # Return an array here too (not a Python list) so every row of the
    # resulting column has the same type and shape and can be stacked later.
    if not known_vectors:
        return np.zeros(model.vector_size, dtype=np.float32)

    # Element-wise mean over the token vectors.
    return np.mean(known_vectors, axis=0)

# One averaged embedding per wine, held in a single 'description' column.
sentence_vectors = df['description_tokenized'].apply(
    lambda tokens: get_sentence_vector(tokens, model)
)
description_vectorized_df = pd.DataFrame({'description': sentence_vectors})

description_vectorized_df.head(3)

Unnamed: 0,description
0,"[-0.18870287, -0.7581146, -0.49562988, -0.0366..."
1,"[0.48500478, -0.85762995, -0.1668026, 1.003598..."
2,"[0.11007728, -0.7666281, -0.5415931, 0.4127745..."


In [None]:
#wine1_df.head(2)


## Grouping the other features by type so each group can be vectorized accordingly

In [8]:
# Assign each remaining column to a group; every group is encoded by its own
# section below. 'description' is not listed because it is embedded separately.
# Columns available: country, description, designation, points, price, province,
# region_1, region_2, taster_name, taster_twitter_handle, title, variety, winery
y = [
    'designation',
]  # presumably the prediction target - confirm
numerical_features_list = [
    'points',
    'price',
]
categorical_features_list = [
    'country',
    'province',
    'region_1',
    'region_2',
    'variety',
    'winery',
]
text_features_list = [
    'taster_name',
    'taster_twitter_handle',
    'title',
]


## Vectorizing numerical features

In [9]:
# Vectorizing numeric features
from sklearn.preprocessing import StandardScaler
# Standardize the numeric columns with scikit-learn's StandardScaler.
scaler = StandardScaler()
numeric_scaled = scaler.fit_transform(df[numerical_features_list])

# Column labels and row index are taken from the inputs instead of being typed
# again by hand, so this frame follows numerical_features_list if it changes and
# stays row-aligned with df when the feature blocks are merged.
# Missing prices are not imputed here and are still NaN after scaling.
numerical_features_vectorized_df = pd.DataFrame(
    numeric_scaled,
    columns=numerical_features_list,
    index=df.index,
)
numerical_features_vectorized_df.head(2)

Unnamed: 0,points_scaled,price_scaled
0,-0.476076,
1,-0.476076,-0.496401


## Vectorizing categorical features

In [10]:
# Vectorizing categorical features
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the categorical columns.
encoder = OneHotEncoder()
# Keep the encoder's sparse output. The result has one column per distinct
# country, province, region, variety and winery, and almost every cell is 0,
# so a dense array would cost rows x columns floats for no benefit.
# NOTE: category_encoded is therefore a SciPy sparse matrix; call
# category_encoded.toarray() explicitly wherever a dense array is required.
category_encoded = encoder.fit_transform(df[categorical_features_list])

# Sparse-backed dataframe, labelled with the encoder's "<column>_<value>" names
# and row-aligned with df.
categorical_features_vectorized_df = pd.DataFrame.sparse.from_spmatrix(
    category_encoded,
    index=df.index,
    columns=encoder.get_feature_names_out(categorical_features_list),
)
categorical_features_vectorized_df.head(2)


Unnamed: 0,country_Argentina,country_Armenia,country_Australia,country_Austria,country_Bosnia and Herzegovina,country_Brazil,country_Bulgaria,country_Canada,country_Chile,country_China,...,winery_Écluse,winery_Élevée Winegrowers,winery_Éric & Jöel Durand,winery_Ïl Macchione,winery_Ñandú,winery_Órale,winery_Öko,winery_Ökonomierat Rebholz,winery_àMaurice,winery_Štoka
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Vectorizing text features

In [14]:
# Display the columns that go through the TF-IDF step below.
text_features_list

['taster_name', 'taster_twitter_handle', 'title']

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build one TF-IDF block per text column, then join the blocks side by side.
text_feature_frames = []
# Fitted vectorizer for each column, kept so the same vocabulary can be applied
# to new data later.
text_vectorizers = {}

for feature in text_features_list:
    # Use a fresh vectorizer per column: refitting a shared instance would throw
    # away the vocabulary learned for the previous column.
    vectorizer = TfidfVectorizer()
    # Missing values become empty strings on a temporary Series, so df itself is
    # not modified and the cell can be re-run safely.
    feature_vectorized = vectorizer.fit_transform(df[feature].fillna(''))
    # The vectorizer's output names are the bare tokens; prefix them with the
    # column name so a token that occurs in several columns yields distinct labels.
    column_names = [f"{feature}_{token}" for token in vectorizer.get_feature_names_out()]
    # Stay sparse: a dense rows x vocabulary array (especially for 'title') is
    # mostly zeros and can exhaust memory.
    text_feature_frames.append(
        pd.DataFrame.sparse.from_spmatrix(
            feature_vectorized,
            index=df.index,
            columns=column_names,
        )
    )
    text_vectorizers[feature] = vectorizer

# Concatenate once after the loop instead of growing a dataframe per iteration.
text_features_vectorized_df = (
    pd.concat(text_feature_frames, axis=1) if text_feature_frames else pd.DataFrame()
)

text_features_vectorized_df.head(2)


: 

## Merging everything into a new, fully vectorized dataframe