In [1]:
# =========================================================================
# EXAMPLE FROM -- https://towardsdatascience.com/a-beginners-guide-to-word-embedding-with-gensim-word2vec-model-5970fa56cc92
import pandas as pd
import numpy as np
# https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html
from gensim.models import Word2Vec
# https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html
from sklearn.manifold import TSNE
import matplotlib as mpl
mpl.use("TkAgg")
from matplotlib import pyplot as plt
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
# Read the car listings from the CSV file and preview the first five rows.
cars_df = pd.read_csv(filepath_or_buffer="./data/cars/data.csv")
print(cars_df.head(n=5))

  Make       Model  Year             Engine Fuel Type  Engine HP  \
0  BMW  1 Series M  2011  premium unleaded (required)      335.0   
1  BMW    1 Series  2011  premium unleaded (required)      300.0   
2  BMW    1 Series  2011  premium unleaded (required)      300.0   
3  BMW    1 Series  2011  premium unleaded (required)      230.0   
4  BMW    1 Series  2011  premium unleaded (required)      230.0   

   Engine Cylinders Transmission Type     Driven_Wheels  Number of Doors  \
0               6.0            MANUAL  rear wheel drive              2.0   
1               6.0            MANUAL  rear wheel drive              2.0   
2               6.0            MANUAL  rear wheel drive              2.0   
3               6.0            MANUAL  rear wheel drive              2.0   
4               6.0            MANUAL  rear wheel drive              2.0   

                         Market Category Vehicle Size Vehicle Style  \
0  Factory Tuner,Luxury,High-Performance      Compact         C

In [3]:
# preprocessing
# Add a single "<Make> <Model>" label per row to the raw frame.
cars_df['Maker_Model'] = cars_df['Make'] + " " + cars_df['Model']

# Keep only the descriptive columns that will act as "words" for each car.
feature_columns = [
    'Engine Fuel Type',
    'Transmission Type',
    'Driven_Wheels',
    'Market Category',
    'Vehicle Size',
    'Vehicle Style',
    'Maker_Model',
]
df1 = cars_df[feature_columns]

# Turn every value into text and glue each row into one comma-separated string.
df2 = df1.astype(str).apply(','.join, axis=1)

# Wrap the joined strings in a one-column dataframe.
df_clean = df2.to_frame(name='clean')

# gensim expects a list of token lists: split each row string back on commas.
# Values that themselves contain commas (e.g. 'Market Category') therefore
# contribute several tokens.
sent = df_clean['clean'].str.split(',').tolist()

# Preview the first two "sentences" of the corpus.
print(sent[:2])

[['premium unleaded (required)', 'MANUAL', 'rear wheel drive', 'Factory Tuner', 'Luxury', 'High-Performance', 'Compact', 'Coupe', 'BMW 1 Series M'], ['premium unleaded (required)', 'MANUAL', 'rear wheel drive', 'Luxury', 'Performance', 'Compact', 'Convertible', 'BMW 1 Series']]


In [4]:
# Train a gensim Word2Vec model on the custom corpus built above.
# sg picks the training algorithm: 0 = CBOW (the default), 1 = skip-gram.
# NOTE(review): `size` is the gensim 3.x keyword for the embedding dimension;
# gensim >= 4.0 reportedly renames it to `vector_size` - confirm against the
# installed version before upgrading.
model = Word2Vec(
    sent,
    sg=1,
    size=50,
    window=3,
    min_count=1,
    workers=3,
)

In [5]:
# Look the embeddings up through model.wv (the KeyedVectors object): indexing
# the Word2Vec model itself is deprecated in gensim 3.x and no longer works in
# gensim 4, whereas model.wv[...] is valid in both.
print(model.wv['Toyota Camry'])
print(model.wv['BMW 1 Series M'])

[-1.29781663e-02 -1.02103718e-01  1.80753286e-03  1.51702585e-02
  2.88857728e-01  1.88896850e-01  3.33204679e-02 -3.05914968e-01
  9.96626839e-02 -1.98564708e-01  2.88734157e-02 -6.58230949e-03
 -7.07473606e-02  2.66958147e-01 -9.41700935e-02  2.12716847e-03
  9.76487324e-02 -1.13655008e-01  8.78906474e-02 -2.19511688e-01
  7.51455128e-02 -1.96342304e-01  1.54801518e-01  8.25967342e-02
  2.77260020e-02  1.44463271e-01  6.89573810e-02 -2.35848054e-01
 -2.16182787e-04 -8.44315737e-02 -4.31028642e-02  5.13898842e-02
  8.93378630e-02  8.22694302e-02  5.23096509e-02  5.94185367e-02
  1.91950440e-01 -3.78317125e-02  1.16712198e-01  1.89329356e-01
  2.39873677e-01 -7.46561289e-02 -4.00127377e-03  1.44083649e-01
  7.43657351e-02  8.89398530e-02 -6.53699487e-02  7.09899962e-02
 -1.05344616e-01  1.45446556e-02]
[ 0.00290547 -0.01530276 -0.00397836  0.0088531   0.04609529  0.03686055
  0.01683452 -0.04237878  0.00708837 -0.03303989  0.01541901  0.00848634
 -0.02950245  0.04444623 -0.0005655  -0.

In [6]:
# Similarity queries go through model.wv: the model-level shortcuts
# (model.similarity / model.most_similar) are deprecated in gensim 3.x and
# removed in gensim 4.
print(model.wv.similarity('Porsche 718 Cayman', 'Nissan Van'))
print(model.wv.similarity('Porsche 718 Cayman', 'Mercedes-Benz SLK-Class'))
# Ask for exactly five neighbours instead of slicing the default ten.
print(model.wv.most_similar('Mercedes-Benz SLK-Class', topn=5))

0.87314415
0.9877105
[('Toyota MR2 Spyder', 0.9953170418739319), ('Mercedes-Benz SL-Class', 0.9949602484703064), ('Honda S2000', 0.9938086271286011), ('Chrysler Crossfire', 0.9937037825584412), ('BMW 1 Series', 0.993675947189331)]


In [7]:
def cosine_distance(model, word, target_list, num):
    """Rank the entries of ``target_list`` by cosine similarity to ``word``.

    Despite the name, the score is a cosine *similarity* (larger means
    closer), and the result is sorted from most to least similar.

    Parameters
    ----------
    model
        A trained gensim ``Word2Vec`` model, or any object that maps a token
        to its vector (``KeyedVectors``, a plain ``dict`` ...). When the
        object has a ``wv`` attribute the vectors are read from it, because
        indexing the model object directly is deprecated in gensim 3.x and
        unsupported in gensim 4.
    word
        Token every candidate is compared against.
    target_list
        Iterable of candidate tokens. ``word`` itself is skipped and repeated
        candidates are reported once.
    num
        Maximum number of results to return.

    Returns
    -------
    list of (token, similarity) tuples, in descending order of similarity.

    Raises
    ------
    Whatever the vector lookup raises for an unknown token (``KeyError`` for
    a ``dict``).
    """
    # Accept both a full model (vectors live in .wv) and a bare vector store.
    vectors = getattr(model, "wv", model)

    a = vectors[word]
    norm_a = np.linalg.norm(a)  # loop-invariant, so computed only once

    cosine_dict = {}
    for item in target_list:
        if item != word:
            b = vectors[item]
            cosine_dict[item] = np.dot(a, b) / (norm_a * np.linalg.norm(b))

    # Highest similarity first.
    ranked = sorted(cosine_dict.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:num]

In [8]:
# Candidate labels: every distinct "Make Model" string in the dataset.
Maker_Model = list(cars_df['Maker_Model'].unique())

# Rank the candidates against the Mercedes-Benz SLK-Class and keep the top five.
cd = cosine_distance(
    model=model,
    word='Mercedes-Benz SLK-Class',
    target_list=Maker_Model,
    num=5,
)
print(cd)

[('Toyota MR2 Spyder', 0.99531704), ('Mercedes-Benz SL-Class', 0.9949602), ('Honda S2000', 0.99380875), ('Chrysler Crossfire', 0.9937039), ('BMW 1 Series', 0.993676)]


In [9]:
def display_closestwords_tsnescatterplot(model, word, size):
    """Scatter-plot ``word`` and its nearest neighbours after a 2-D t-SNE projection.

    Parameters
    ----------
    model
        A trained gensim ``Word2Vec`` model or a ``KeyedVectors``-like object
        offering ``similar_by_word`` and ``[]`` lookup. When the object has a
        ``wv`` attribute the vectors are read from it, because the model-level
        shortcuts are deprecated in gensim 3.x and unsupported in gensim 4.
    word
        Token whose neighbourhood is plotted; it is the first labelled point.
    size
        Expected dimensionality of the embedding vectors.

    Raises
    ------
    ValueError
        If the vectors do not have ``size`` components.
    """
    # Accept both a full model (vectors live in .wv) and a bare vector store.
    vectors = getattr(model, "wv", model)

    # Collect the query word followed by its neighbours; build the matrix once
    # instead of re-allocating it with np.append on every iteration.
    word_labels = [word]
    rows = [vectors[word]]
    for neighbour, _score in vectors.similar_by_word(word):
        word_labels.append(neighbour)
        rows.append(vectors[neighbour])
    arr = np.asarray(rows, dtype='f')
    if arr.shape[1] != size:
        raise ValueError(
            "embedding vectors have %d components, expected size=%d"
            % (arr.shape[1], size)
        )

    # t-SNE requires perplexity < number of samples. The library default
    # (30) is larger than the handful of points plotted here, which recent
    # scikit-learn releases reject, so cap it.
    perplexity = min(30.0, len(word_labels) - 1)
    tsne = TSNE(n_components=2, random_state=0, perplexity=perplexity)
    # Kept from the original implementation: switches numpy's global print
    # options to non-scientific notation as a side effect.
    np.set_printoptions(suppress=True)
    Y = tsne.fit_transform(arr)
    x_coords = Y[:, 0]
    y_coords = Y[:, 1]

    ax = plt.gca()  # draw on the current axes, as the pyplot calls did
    ax.scatter(x_coords, y_coords)
    for label, x, y in zip(word_labels, x_coords, y_coords):
        ax.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')

    # Pad the limits outwards on BOTH sides; adding the pad to the minimum
    # (as before) pushed the lower limit past the extreme points.
    pad = 0.00005
    ax.set_xlim(x_coords.min() - pad, x_coords.max() + pad)
    ax.set_ylim(y_coords.min() - pad, y_coords.max() + pad)
    plt.show()

# Plot the neighbourhood of the Porsche 718 Cayman; size matches the
# 50-dimensional embeddings trained above.
display_closestwords_tsnescatterplot(
    model=model,
    word='Porsche 718 Cayman',
    size=50,
)