## 1. Example: Vehicle Information

### Install Dependencies and Libraries

In [1]:
#!pip install --user trafilatura #Trafilatura is a Python package and command-line tool designed to gather text on the Web. 
#It includes discovery, extraction and text processing components.

#!pip install --user tqdm #progress bar

#!pip install --user plotly 
#graphing library makes interactive, publication-quality graphs

#!pip install --user datapane # app development platform which gives you everything you need to build internal data analytics products

#!pip install --user scikit-learn

#!pip install --user gensim 
#Python library for topic modelling, document indexing and similarity retrieval with large corpora

In [2]:
# Text manipulation libraries
import re
import string
import nltk
from nltk.corpus import stopwords
# nltk.download('stopwords') <-- we run this command to download the stopwords in the project
# nltk.download('punkt') <-- essential for tokenization

In [3]:
def preprocess_text(text: str, remove_stopwords: bool, language: str = "italian") -> list:
    """Clean ``text`` and return it as a list of lowercase tokens.

    Steps applied, in order:
    - remove links (anything starting with ``http``)
    - replace every run of non-letter characters (digits, punctuation,
      whitespace) with a single space
    - tokenize
    - optionally drop stopwords
    - convert to lowercase

    Arguments:
        text (str): text to clean
        remove_stopwords (bool): whether to remove stopwords
        language (str): name of the NLTK stopword list to use. Defaults to
            "italian", the list this function has always used.
            NOTE(review): the corpora in this notebook look English -
            confirm whether callers should pass "english".
    Returns:
        list: cleaned, lowercase tokens. An empty list when nothing is left.
            When ``remove_stopwords`` is False the tokens come from a plain
            whitespace split, so NLTK and its 'punkt' data are not needed
            on that path.
    """
    # remove links
    text = re.sub(r"http\S+", "", text)
    # remove numbers and special characters; only letters and single spaces remain
    text = re.sub("[^A-Za-z]+", " ", text)

    if not remove_stopwords:
        # Previously this path fell off the end of the function and returned None.
        return text.lower().split()

    # Build the stopword set once; looking it up per token re-reads the whole list.
    stop_words = set(stopwords.words(language))
    tokens = nltk.word_tokenize(text)
    return [w.lower().strip() for w in tokens if w.lower() not in stop_words]

In [4]:
import pandas as pd

In [5]:
# Load the vehicle dataset from the CSV file in the working directory.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# saved index - consider index_col=0 (TODO confirm against the file).
df = pd.read_csv('vehicle_data.csv')
# Preview the first five rows.
df.head()

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


### Data Preprocessing

#### Gensim word2vec requires a 'list of lists' format for training: each document is one list, and that list contains the document's tokens

In [6]:
# a. Create a new 'Maker_Model' column: the make and the model joined by a single space
#    (e.g. 'BMW' + ' ' + '1 Series'). This adds the column to `df` in place.
df['Maker_Model']= df['Make']+ " " + df['Model']

In [7]:
# b. Build the 'list of lists' corpus for gensim: one token list per vehicle row, made
#    from the categorical features plus the Maker_Model label.
feature_columns = [
    'Engine Fuel Type', 'Transmission Type', 'Driven_Wheels',
    'Market Category', 'Vehicle Size', 'Vehicle Style', 'Maker_Model',
]
# Keep only the selected features.
df1 = df.loc[:, feature_columns]
# Stringify every value, then glue each row into one comma-separated string.
# NOTE(review): missing values become the literal token 'nan' here - confirm that is intended.
df2 = df1.astype(str).apply(','.join, axis=1)
# Store the joined strings in a single-column dataframe.
df_clean = df2.to_frame(name='clean')
# Split on commas to get the token lists; values that contain commas themselves
# (e.g. 'Luxury,Performance') become several tokens.
sent = df_clean['clean'].str.split(',').tolist()
# Show the first two token lists as an example of the corpus format.
sent[0:2]

[['premium unleaded (required)',
  'MANUAL',
  'rear wheel drive',
  'Factory Tuner',
  'Luxury',
  'High-Performance',
  'Compact',
  'Coupe',
  'BMW 1 Series M'],
 ['premium unleaded (required)',
  'MANUAL',
  'rear wheel drive',
  'Luxury',
  'Performance',
  'Compact',
  'Convertible',
  'BMW 1 Series']]

### Gensim word2vec Model Training

In [8]:
from gensim.models import Word2Vec

In [9]:
# Train a skip-gram (sg=1) Word2Vec model on the vehicle corpus: 50-dimensional vectors,
# a context window of 3, and min_count=1 so that no token is dropped for being rare.
# NOTE(review): with workers=3 the learned vectors may differ between runs - confirm
# against the gensim documentation if exact reproducibility matters.
model = Word2Vec(sent, min_count=1, vector_size= 50, workers=3, window=3, sg=1)

vector_size: The number of dimensions of the embeddings; the default is 100. (Older gensim releases called this parameter `size`.)

window: The maximum distance between a target word and words around the target word. The default window is 5.

min_count: The minimum count of words to consider when training the model; words with occurrence less than this count will be ignored. The default for min_count is 5.

workers: The number of worker threads used during training; the default is 3.

sg: The training algorithm, either CBOW(0) or skip gram(1). The default training algorithm is CBOW.

In [10]:
# The embedding of a token can be read straight from the trained model by
# indexing model.wv with that token.
model.wv['BMW 1 Series']

array([-0.06447092,  0.03368214,  0.08907529, -0.00988199, -0.05456775,
       -0.29894567, -0.09596105,  0.31284606, -0.03423645, -0.16342698,
        0.16276276,  0.01470755, -0.0644301 , -0.02556136,  0.01488294,
        0.19470763,  0.18321532,  0.10991536,  0.01668741, -0.37514982,
       -0.03045461,  0.02985105,  0.24281603, -0.0188671 ,  0.18125793,
        0.03453473, -0.23241971,  0.3380392 ,  0.06438169, -0.17859639,
       -0.07001738,  0.12877147, -0.02074502,  0.17828555,  0.03972047,
       -0.07479623,  0.14473233,  0.02136263,  0.1473942 ,  0.16656908,
        0.07943633,  0.03851018, -0.25410762,  0.07089072,  0.21949013,
        0.02474001, -0.12547262, -0.03874231,  0.05473589,  0.09995588],
      dtype=float32)

In [11]:
# Embedding learned for the 'Toyota Camry' token.
model.wv['Toyota Camry']

array([ 0.02073421,  0.1498057 ,  0.01030488, -0.12890576, -0.06331267,
       -0.21266186,  0.00908569,  0.2509053 , -0.1551299 , -0.0699002 ,
       -0.01457829,  0.01222638,  0.0484973 , -0.06282432, -0.05556518,
        0.16408917,  0.14385964,  0.311455  , -0.13700299, -0.2993097 ,
       -0.01819991, -0.04954226,  0.26744488,  0.10026886,  0.1775645 ,
       -0.02798099, -0.02493814,  0.35560793, -0.05148494,  0.01249023,
        0.02705731,  0.03829135,  0.0582683 , -0.08130286,  0.07058342,
       -0.15130651,  0.19558574,  0.02692135,  0.05195242,  0.04589207,
        0.08190273, -0.10725226, -0.19352484,  0.08257486,  0.3428256 ,
        0.07830502, -0.02105589, -0.13325578,  0.02691586,  0.00856031],
      dtype=float32)

### Compute Similarities

In [12]:
# Similarity score between the embeddings of two different makes.
model.wv.similarity('BMW 1 Series','Toyota Camry')

0.74666965

In [13]:
# Similarity score between two closely related models of the same make.
model.wv.similarity('BMW 1 Series','BMW 1 Series M')

0.8981883

In [14]:
# Tokens whose embeddings are closest to 'BMW 1 Series', returned as (token, score) pairs.
model.wv.most_similar('BMW 1 Series')

[('Ferrari F430', 0.9848706722259521),
 ('Nissan 350Z', 0.9819698929786682),
 ('BMW Z4', 0.981057345867157),
 ('Mazda MX-5 Miata', 0.9805260300636292),
 ('Ferrari 575M', 0.9801815152168274),
 ('Pontiac Solstice', 0.9801721572875977),
 ('Lamborghini Murcielago', 0.9786002039909363),
 ('Mazda RX-8', 0.9761828780174255),
 ('Audi R8', 0.9754549860954285),
 ('Lotus Elise', 0.9746772646903992)]

In [15]:
import numpy as np
from numpy import dot
from numpy.linalg import norm

In [16]:
def cosine_distance (model, word,target_list , num) :
    """Rank the entries of ``target_list`` by cosine similarity to ``word``.

    Despite the name, the scores are cosine *similarities* (higher means more
    similar), and the result is sorted from most to least similar.

    Arguments:
        model: object exposing word vectors through ``model.wv[key]`` and
            supporting ``key in model.wv``.
        word: key whose vector is the reference. It must exist in
            ``model.wv``; a missing key raises ``KeyError``.
        target_list: iterable of candidate keys. ``word`` itself is excluded,
            duplicates are collapsed, and candidates absent from ``model.wv``
            are skipped instead of aborting the whole ranking.
        num: maximum number of results to return.
    Returns:
        list of ``(key, similarity)`` tuples, best match first, at most
        ``num`` long.
    """
    a = model.wv[word]
    # The reference norm does not change between candidates, so compute it once.
    norm_a = norm(a)
    cosine_dict = {}
    for item in target_list:
        # Skip the query itself and any candidate that has no embedding.
        if item == word or item not in model.wv:
            continue
        b = model.wv[item]
        cosine_dict[item] = dot(a, b) / (norm_a * norm(b))
    # Descending order: most similar first.
    dist_sort = sorted(cosine_dict.items(), key=lambda pair: pair[1], reverse=True)
    return dist_sort[0:num]

In [17]:
# Candidate list: every distinct Maker_Model value in the dataset.
Maker_Model = list(df.Maker_Model.unique()) 
# Show the 5 models most similar to 'Mercedes-Benz SLK-Class', ranked by cosine
# similarity (higher score = more similar).
cosine_distance (model,'Mercedes-Benz SLK-Class',Maker_Model,5)

[('Audi S3', 0.99044716),
 ('Pontiac Solstice', 0.9887829),
 ('Subaru BRZ', 0.9868054),
 ('Mercedes-Benz CLK-Class', 0.98652834),
 ('Aston Martin DB7', 0.9856698)]

## 2. Example Reading a Wikipedia Page

In [18]:
import urllib
import bs4 as bs

In [19]:
# `import urllib` on its own does not guarantee that the `request` submodule is loaded,
# so import it explicitly before use.
import urllib.request

# Download the raw HTML of the Wikipedia article. The `with` block closes the HTTP
# response as soon as the body has been read.
with urllib.request.urlopen('https://en.wikipedia.org/wiki/Fourth_Industrial_Revolution') as response:
    source = response.read()

In [20]:
# Parse the downloaded HTML into a navigable tree.
soup = bs.BeautifulSoup(source,'lxml') # 'lxml' selects the parser BeautifulSoup uses

#### Preprocess

In [21]:
# Concatenate the text of every <p> element into one string.
text = "".join(paragraph.text for paragraph in soup.find_all('p'))

# Clean-up, applied in this exact order:
# 1. replace bracketed reference markers such as [12] with a space
text = re.sub(r'\[[0-9]*\]', ' ', text)
# 2. collapse whitespace runs, then lowercase
text = re.sub(r'\s+', ' ', text).lower()
# 3. replace the listed symbols with a space (commas and full stops are kept)
text = re.sub(r'[@#$&%\*\(\)\>\<\?\'\":;\]\[-]', ' ', text)
# 4. replace every digit with a space
text = re.sub(r'\d', ' ', text)
# 5. collapse the whitespace introduced by the steps above
text = re.sub(r'\s+', ' ', text)

In [22]:
# Display the cleaned article text (this prints the whole string).
text

' south asia middle east europe north america the fourth industrial revolution, ir, or industry . , conceptualises rapid change to technology, industries, and societal patterns and processes in the st century due to increasing interconnectivity and smart automation. the term was popularised in by klaus schwab, the world economic forum founder and executive chairman, and has since been used in numerous economic, political, and scientific articles in reference to the current era of emerging high technology. schwab asserts that the changes seen are more than just improvements to efficiency, but express a significant shift in industrial capitalism. a part of this phase of industrial change is the joining of technologies like artificial intelligence, gene editing, to advanced robotics that blur the lines between the physical, digital, and biological worlds. throughout this, fundamental shifts are taking place in how the global production and supply network operates through ongoing automatio

In [23]:
# Build the stopword set once. Calling stopwords.words('english') inside the loop
# rebuilt the list for every single word and made each membership test a linear scan.
english_stopwords = set(stopwords.words('english'))

# Split the text into sentences, tokenize each sentence into words, and drop
# stopwords. The result is the list-of-token-lists format that Word2Vec expects.
sentences = [
    [word for word in nltk.word_tokenize(sentence) if word not in english_stopwords]
    for sentence in nltk.sent_tokenize(text)
]

In [24]:
# Number of sentences (token lists) in the corpus.
len(sentences)

163

#### Training the model

In [25]:
# Train Word2Vec on the article sentences with default settings, except min_count=1
# so that rare words are kept.
# NOTE: this rebinds `model`, replacing the vehicle model trained earlier.
model = Word2Vec(sentences, min_count =1)

In [26]:
# Look up the embedding learned for the word 'industry' and display it.
vectors = model.wv['industry']
vectors

array([-5.7364377e-04,  3.9552059e-03, -6.4074197e-03, -1.0413430e-03,
        7.7163349e-03,  6.4174593e-03, -2.8000311e-03,  4.5726472e-03,
       -8.5873343e-03,  5.7762219e-03, -4.9198954e-03, -4.2199288e-03,
        8.7718936e-03,  1.4403339e-03,  7.9389811e-03, -6.6882372e-03,
        5.6783138e-03,  9.1640223e-03, -8.5356310e-03, -6.5607023e-03,
       -6.3396916e-03, -4.3035890e-03, -3.1860634e-03, -8.7836357e-03,
        7.4041323e-03, -4.7310246e-03,  7.7854809e-03,  4.8420439e-03,
       -7.1041025e-03,  4.1699591e-03,  6.4404714e-03, -7.7084759e-03,
       -7.5340648e-03, -3.2839067e-03, -8.6726109e-03, -8.5105142e-04,
       -1.9935665e-04,  2.2338305e-03,  6.5691728e-04, -2.2325167e-03,
       -5.5763316e-03,  1.5293345e-03, -5.8348686e-04,  7.0068724e-03,
        4.5626103e-03,  4.2772144e-03,  9.0620707e-04, -2.5130769e-03,
       -3.8148181e-03, -5.4834189e-04,  1.9737973e-03, -3.2327722e-03,
       -7.1876701e-03, -7.8278696e-03, -9.9443626e-03, -5.2941791e-03,
      

In [27]:
# Dimensionality of the embedding.
len(vectors)

100

In [28]:
# Nearest neighbours of 'revolution' in the model trained on the single article.
model.wv.most_similar('revolution')

[('wireless', 0.3168924152851105),
 ('also', 0.29391491413116455),
 ('mochon', 0.2913605272769928),
 ('economy', 0.27968281507492065),
 ('leaf', 0.2555803060531616),
 ('throughout', 0.25071874260902405),
 ('new', 0.250529021024704),
 ('affect', 0.24781370162963867),
 ('cyber', 0.24382221698760986),
 ('driven', 0.2421315610408783)]

### Improve the model, explore some of the pre-trained models

In [29]:
from gensim.models import KeyedVectors

In [30]:
# File name of the pre-trained vectors, given relative to the working directory.
filename = "GoogleNews-vectors-negative300.bin"

In [31]:
# Load the pre-trained vectors. This takes a while and needs plenty of RAM.
# NOTE: this rebinds `model` to a KeyedVectors object, so later cells call
# model.most_similar(...) directly rather than model.wv.most_similar(...).
model = KeyedVectors.load_word2vec_format(filename, binary=True) # binary=True because the file is in binary format

In [32]:
# Nearest neighbours of 'revolution' in the pre-trained vectors.
model.most_similar('revolution')

[('revolutions', 0.7037398219108582),
 ('revolutionary', 0.6257899403572083),
 ('Revolution', 0.5847131013870239),
 ('revolutionaries', 0.5730563998222351),
 ('revolt', 0.5643020272254944),
 ('technological_revolutions', 0.547039270401001),
 ('counterrevolution', 0.5448992252349854),
 ('uprisings', 0.5448099970817566),
 ('bloodless_revolution', 0.5400627851486206),
 ('uprising', 0.531105637550354)]

In [33]:
# NOTE: only the last expression of a cell is displayed, so the result of this
# first call is computed but never shown.
model.most_similar('king')
# Vector arithmetic: add the 'king' and 'woman' vectors, subtract the 'man' vector,
# and list the words closest to the result.
model.most_similar (positive = ['king', 'woman'], negative = ['man'])

[('queen', 0.7118193507194519),
 ('monarch', 0.6189674735069275),
 ('princess', 0.5902431011199951),
 ('crown_prince', 0.5499460697174072),
 ('prince', 0.5377321243286133),
 ('kings', 0.5236844420433044),
 ('Queen_Consort', 0.5235945582389832),
 ('queens', 0.518113374710083),
 ('sultan', 0.5098593831062317),
 ('monarchy', 0.5087411999702454)]