In [57]:
#Example Vehicle Information

In [58]:
#Install Dependencies and Libraries

In [59]:
#!pip install --user trafilatura #Trafilatura is a Python package and command-line tool designed to gather text
#It includes discovery, extraction and text processing components.

#!pip install --user tqdm #progress bar

#!pip install --user plotly
#graphing libraray makes interactive, publication-quality graphs

#!pip install --user datapane #app development platform which gives you everything you need to build internal data

#!pip install --user scikit-learn

!pip install --user gensim
#Python library for topic modelling, document indexing and similarity retrieval with large corpora



In [60]:
#Text manipulation libraries
import re
import string
import nltk
import gensim
from nltk.corpus import stopwords
nltk.download('stopwords') #we run this command to download the stopwords in the project
nltk.download('punkt') #essential for tokenization

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/xiaoqin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/xiaoqin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [61]:
def preprocess_text(text: str, remove_stopwords:bool) -> str:
    """Function that cleans the input text by going to:
    - remove links
    - remove special characters
    - remove numbers
    - remove stopwords
    - convert to lowercase
    - remove excessive white spaces
    Arguments:
        text (str): text to clean
        remove_stopwords (bool): whether to remove stopwords
    Returns:
        str: cleaned text
    """
    #remove links
    text=re.sub(r"http\S+","",text)
    #remove numbers and special characters
    text=re.sub("[^A-Za-z]+"," ",text)
    #remove stopwords
    if remove_stopwords:
      #1.create tokens
      tokens=nltk.word_tokenize(text)
      #2.check if it's a stopword
      tokens=[w.lower().strip() for w in tokens if not w.lower() in stopwords.words("italian")]
      #return a list of cleaned tokens
      return tokens

In [62]:
import pandas as pd

In [63]:
df=pd.read_csv('vehicle_data.csv')
df.head()

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [64]:
#Data Preprocessing

In [65]:
#Gensim word2vec requires that a format of "list of lists" for training where every document is contained in a list
#and every list contains lists of tokens of that document

In [66]:
#a. create a new column for Make Model
df['Maker_Model']=df['Make']+" "+df['Model']

In [67]:
#b. Generate a format of ' list of lists' for each Make Model with the following features: Engine Fuel Type,
#Transmission Type, Driven_Wheels,Market Category, Vehicle Size, Vehicle Style.
#Select features from original dataset to form a new dataframe.
df1=df[['Engine Fuel Type','Transmission Type','Driven_Wheels','Market Category','Vehicle Size', 'Vehicle Style', 'Maker_Model']]
#For each row, combine all the columns into one column
df2=df1.apply(lambda x:','.join(x.astype(str)),axis=1)
#Store them in a pandas dataframe
df_clean=pd.DataFrame({"clean":df2})
#Create the list of list format of the custom corpus for gensim modeling
sent=[row.split(',') for row in df_clean['clean']]
#show the example of list of list format of the custom corpus for gensim modeling
sent[0:2]

[['premium unleaded (required)',
  'MANUAL',
  'rear wheel drive',
  'Factory Tuner',
  'Luxury',
  'High-Performance',
  'Compact',
  'Coupe',
  'BMW 1 Series M'],
 ['premium unleaded (required)',
  'MANUAL',
  'rear wheel drive',
  'Luxury',
  'Performance',
  'Compact',
  'Convertible',
  'BMW 1 Series']]

In [68]:
#Gensim word2vec Model Training

In [69]:
from gensim.models import Word2Vec

In [70]:
model=Word2Vec(sent,min_count=1,size = 50, workers=3, window=3, sg=1)
#size:The number of dimensions of the embeddings and the default is 100.
#window:The maximum distance between a target word and words around the target word. The default window is 5.
#min_count:The minimum count of words to consider when traning the model;
#words with occurrence less than this count will be ignored. the default of min_count is 5.
#workers:The number of partitions during training and the default workers is 3.
#sg:The training algorithm, either CBOW(0) or skip gram(1). The default training algorithm is CBOW.

In [71]:
#we can obtain the word embedding directly from the training model as following
model.wv['BMW 1 Series']

array([ 0.09216227,  0.19068903, -0.02150614,  0.09567162, -0.0666446 ,
        0.17018315, -0.03472868, -0.29696077,  0.05088   ,  0.10508802,
        0.16257487,  0.02611272, -0.05485084, -0.08982798, -0.09903333,
        0.13983737,  0.06965902,  0.16133045, -0.00907501, -0.18733238,
       -0.18858166,  0.12406412,  0.38015574, -0.02359857,  0.17559499,
        0.11710597,  0.12795523,  0.14144185,  0.01887907,  0.23545916,
       -0.16702896,  0.10742594, -0.0272628 , -0.07660717, -0.02991791,
       -0.04109486, -0.08410483, -0.04563307,  0.13849543, -0.04512772,
        0.04072719, -0.12713599,  0.00270061,  0.11681359, -0.33426094,
       -0.05680935, -0.02060897, -0.01495106,  0.29567072, -0.0851198 ],
      dtype=float32)

In [72]:
model.wv['Toyota Camry']

array([ 0.0229073 ,  0.13333777, -0.07669047, -0.09007536, -0.17804268,
        0.08373772, -0.20580278, -0.29854497,  0.05160572,  0.07090168,
        0.05320791, -0.02274852, -0.14925429, -0.08120891, -0.057316  ,
        0.11297707,  0.07359096,  0.16323012,  0.06860267, -0.14224744,
       -0.20807597,  0.05458144,  0.32116053, -0.07240757,  0.18299225,
        0.13171963,  0.08442681,  0.10034982,  0.11134163,  0.24588837,
       -0.14786996,  0.16141383,  0.086664  , -0.10797783, -0.01614798,
       -0.0042346 ,  0.00478714,  0.11073566,  0.0505663 , -0.10082871,
        0.01883443, -0.25195897,  0.01977758,  0.01190953, -0.12528488,
        0.00498638,  0.01199733, -0.09440795,  0.21619925, -0.08693031],
      dtype=float32)

In [73]:
#Compute Similarities

In [74]:
model.wv.similarity('BMW 1 Series','Toyota Camry')

0.83560675

In [75]:
model.wv.similarity('BMW 1 Series','BMW 1 Series M')

0.95279706

In [76]:
model.wv.most_similar('BMW 1 Series')

[('Lamborghini Murcielago', 0.9956990480422974),
 ('Ferrari 575M', 0.9956826567649841),
 ('BMW 2 Series', 0.9953921437263489),
 ('Infiniti G Coupe', 0.9952536225318909),
 ('Cadillac ATS Coupe', 0.9949115514755249),
 ('Ferrari 458 Italia', 0.9946121573448181),
 ('Aston Martin V12 Vantage', 0.9943667054176331),
 ('Lamborghini Aventador', 0.9942981600761414),
 ('Lotus Elise', 0.994263231754303),
 ('Maserati GranTurismo Convertible', 0.9940467476844788)]

In [77]:
import numpy as np
from numpy import dot
from numpy.linalg import norm

In [78]:
def cosine_distance(model,word, target_list, num):
    cosine_dict={}
    word_list=[]
    a=model.wv[word]
    for item in target_list:
      if item !=word:
         b=model.wv[item]
         cos_sim=dot(a,b)/(norm(a)*norm(b))
         cosine_dict[item]=cos_sim
    dist_sort=sorted(cosine_dict.items(),key=lambda dist:dist[1],reverse=True) #in Desending order
    for item in dist_sort:
        word_list.append((item[0],item[1]))
    return word_list[0:num]


In [79]:
Maker_Model=list(df.Maker_Model.unique())
#Show the most similar Mercedes-Benz SLK-Class by cosine distance
cosine_distance(model,'Mercedes-Benz SLK-Class', Maker_Model,5)

[('Audi RS 5', 0.9956105),
 ('Maserati Coupe', 0.9951812),
 ('Audi S7', 0.9941357),
 ('BMW ALPINA B7', 0.99374914),
 ('Mercedes-Benz SL-Class', 0.99326646)]

In [80]:
Maker_Model=list(df.Maker_Model.unique())
#Show the most similar BMW 1 Series by cosine distance
cosine_distance(model,'BMW 1 Series', Maker_Model,5)
#The result same as above

[('Lamborghini Murcielago', 0.9956991),
 ('Ferrari 575M', 0.99568254),
 ('BMW 2 Series', 0.9953921),
 ('Infiniti G Coupe', 0.9952537),
 ('Cadillac ATS Coupe', 0.9949115)]

In [81]:
#2.Example Reading a Wikipedia Page

In [82]:
import urllib
import bs4 as bs

In [83]:
source=urllib.request.urlopen('https://en.wikipedia.org/wiki/Fourth_Industrial_Revolution').read()#we have the source

In [84]:
soup=bs.BeautifulSoup(source,'lxml')#we have to parse the source and lxml parser

In [85]:
#Preprocess

In [86]:
text=""
for paragraph in soup.find_all('p'):
    text+=paragraph.text
#preprocess
text=re.sub(r'\[[0-9]*\]',' ',text)
text=re.sub(r'\s+',' ',text)
text=text.lower()
#text=re.sub(r'\W',' ',text)
text=re.sub(r'[@#$&%\*\(\)\>\<\?\'\":;\]\[-]',' ',text)
text=re.sub(r'\d',' ',text)
text=re.sub(r'\s+',' ',text)

In [87]:
text

' south asia middle east europe north america the fourth industrial revolution, ir, or industry . , conceptualises rapid change to technology, industries, and societal patterns and processes in the st century due to increasing interconnectivity and smart automation. the term was popularised in by klaus schwab, the world economic forum founder and executive chairman, and has since been used in numerous economic, political, and scientific articles in reference to the current era of emerging high technology. schwab asserts that the changes seen are more than just improvements to efficiency, but express a significant shift in industrial capitalism. a part of this phase of industrial change is the joining of technologies like artificial intelligence, gene editing, to advanced robotics that blur the lines between the physical, digital, and biological worlds. throughout this, fundamental shifts are taking place in how the global production and supply network operates through ongoing automatio

In [88]:
sentences=nltk.sent_tokenize(text)
sentences=[nltk.word_tokenize(sentence) for sentence in sentences]
for i in range(len(sentences)):
  sentences[i]=[word for word in sentences[i] if word not in stopwords.words('english')]

In [89]:
len(sentences)

163

In [90]:
#Training the model

In [91]:
model=Word2Vec(sentences, min_count=1)

In [92]:
#get vectors out of words
vectors=model.wv['industry']
vectors

array([-0.00212534, -0.00243353, -0.00168151,  0.002954  ,  0.00454779,
        0.00247998, -0.00121604, -0.00439633, -0.00103521,  0.00187999,
       -0.00433913,  0.001448  ,  0.00362037,  0.00239897,  0.00343704,
        0.00263873, -0.00444228,  0.00450617, -0.00431435,  0.00315072,
        0.0028417 ,  0.003653  , -0.00173196, -0.00286052, -0.00182602,
        0.00014457, -0.00195935, -0.00167845,  0.00057841, -0.00308413,
        0.00024734, -0.0022454 , -0.00344766, -0.00036408,  0.00529569,
       -0.00279005, -0.00266025, -0.0003851 , -0.00215562, -0.00029259,
       -0.00321924, -0.00139095,  0.00385901, -0.00341499, -0.00148793,
       -0.00326575, -0.00461053, -0.00350891,  0.00037935, -0.00106673,
       -0.00365787, -0.00079188,  0.0011684 , -0.0049711 ,  0.00245471,
        0.00126932, -0.00032543,  0.0036443 ,  0.00225003,  0.00283821,
        0.00102421,  0.00547315, -0.00331758,  0.00290242,  0.00083721,
       -0.00418548,  0.00373045, -0.00496957, -0.00287534, -0.00

In [93]:
len(vectors)

100

In [94]:
model.wv.most_similar('revolution')

[('implementation', 0.3268631100654602),
 ('major', 0.32656213641166687),
 (',', 0.3114866614341736),
 ('mochon', 0.30380934476852417),
 ('industrial', 0.2921789884567261),
 ('megatrends', 0.28594398498535156),
 ('products–helps', 0.2703874409198761),
 ('end', 0.2645798921585083),
 ('several', 0.24971511960029602),
 ('printing', 0.24756386876106262)]

In [95]:
#Improve the model, explore some of the pre-trained models

In [96]:
from gensim.models import KeyedVectors

In [97]:
filename='GoogleNews-vectors-negative300.bin'

In [98]:
#Get the model- it is going to take a while with good RAM
model=KeyedVectors.load_word2vec_format(filename,binary=True) #bc it is binary file



In [99]:
model.most_similar('revolution')

[('revolutions', 0.7037397623062134),
 ('revolutionary', 0.6257899403572083),
 ('Revolution', 0.5847131013870239),
 ('revolutionaries', 0.5730563402175903),
 ('revolt', 0.5643018484115601),
 ('technological_revolutions', 0.5470391511917114),
 ('counterrevolution', 0.5448992252349854),
 ('uprisings', 0.5448099970817566),
 ('bloodless_revolution', 0.5400627851486206),
 ('uprising', 0.5311056971549988)]

In [100]:
model.most_similar('king')
#let's add different vectors and negation vectors, and find out
model.most_similar(positive=['king','woman'],negative=['man'])

[('queen', 0.7118192315101624),
 ('monarch', 0.6189674735069275),
 ('princess', 0.5902431607246399),
 ('crown_prince', 0.5499460697174072),
 ('prince', 0.5377321243286133),
 ('kings', 0.5236844420433044),
 ('Queen_Consort', 0.5235946178436279),
 ('queens', 0.5181134343147278),
 ('sultan', 0.5098593235015869),
 ('monarchy', 0.5087411403656006)]