In [1]:
import pandas as pd
import numpy as np
import nltk
import re
import string
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
from nltk.corpus import stopwords

In [2]:
# Download NLTK stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sakshi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [4]:
df = pd.read_csv("cars.csv")  # Ensure the file is in the working directory

print(df.head())


  Make       Model  Year             Engine Fuel Type  Engine HP  \
0  BMW  1 Series M  2011  premium unleaded (required)      335.0   
1  BMW    1 Series  2011  premium unleaded (required)      300.0   
2  BMW    1 Series  2011  premium unleaded (required)      300.0   
3  BMW    1 Series  2011  premium unleaded (required)      230.0   
4  BMW    1 Series  2011  premium unleaded (required)      230.0   

   Engine Cylinders Transmission Type     Driven_Wheels  Number of Doors  \
0               6.0            MANUAL  rear wheel drive              2.0   
1               6.0            MANUAL  rear wheel drive              2.0   
2               6.0            MANUAL  rear wheel drive              2.0   
3               6.0            MANUAL  rear wheel drive              2.0   
4               6.0            MANUAL  rear wheel drive              2.0   

                         Market Category Vehicle Size Vehicle Style  \
0  Factory Tuner,Luxury,High-Performance      Compact         C

In [6]:
def preprocess_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(f"[{string.punctuation}]", "", text)  # Remove punctuation
    text = re.sub(r"\d+", "", text)  # Use raw string for regex
    text = " ".join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    return text

In [7]:
# Apply preprocessing
df["clean_text"] = df["Market Category"].astype(str).apply(preprocess_text)

In [8]:
# Bag-of-Words (Count Vectorizer)
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(df["clean_text"])

In [9]:
# Convert to DataFrame
bow_df = pd.DataFrame(X_bow.toarray(), columns=vectorizer.get_feature_names_out())
print("Bag-of-Words (BoW) Matrix:")
print(bow_df.head())

Bag-of-Words (BoW) Matrix:
   crossover  crossoverdiesel  crossoverexoticluxuryhighperformance  \
0          0                0                                     0   
1          0                0                                     0   
2          0                0                                     0   
3          0                0                                     0   
4          0                0                                     0   

   crossoverexoticluxuryperformance  crossoverfactory  crossoverflex  \
0                                 0                 0              0   
1                                 0                 0              0   
2                                 0                 0              0   
3                                 0                 0              0   
4                                 0                 0              0   

   crossoverhatchback  crossoverhatchbackfactory  crossoverhatchbackluxury  \
0                   0              

In [10]:
# Normalized Count (TF)
X_bow_normalized = X_bow.toarray() / np.linalg.norm(X_bow.toarray(), axis=1, keepdims=True)
normalized_bow_df = pd.DataFrame(X_bow_normalized, columns=vectorizer.get_feature_names_out())
print("Normalized BoW:")
print(normalized_bow_df.head())

Normalized BoW:
   crossover  crossoverdiesel  crossoverexoticluxuryhighperformance  \
0        0.0              0.0                                   0.0   
1        0.0              0.0                                   0.0   
2        0.0              0.0                                   0.0   
3        0.0              0.0                                   0.0   
4        0.0              0.0                                   0.0   

   crossoverexoticluxuryperformance  crossoverfactory  crossoverflex  \
0                               0.0               0.0            0.0   
1                               0.0               0.0            0.0   
2                               0.0               0.0            0.0   
3                               0.0               0.0            0.0   
4                               0.0               0.0            0.0   

   crossoverhatchback  crossoverhatchbackfactory  crossoverhatchbackluxury  \
0                 0.0                        0

In [11]:
# TF-IDF Representation
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(df["clean_text"])
tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
print("TF-IDF Matrix:")
print(tfidf_df.head())

TF-IDF Matrix:
   crossover  crossoverdiesel  crossoverexoticluxuryhighperformance  \
0        0.0              0.0                                   0.0   
1        0.0              0.0                                   0.0   
2        0.0              0.0                                   0.0   
3        0.0              0.0                                   0.0   
4        0.0              0.0                                   0.0   

   crossoverexoticluxuryperformance  crossoverfactory  crossoverflex  \
0                               0.0               0.0            0.0   
1                               0.0               0.0            0.0   
2                               0.0               0.0            0.0   
3                               0.0               0.0            0.0   
4                               0.0               0.0            0.0   

   crossoverhatchback  crossoverhatchbackfactory  crossoverhatchbackluxury  \
0                 0.0                        0.

In [12]:
# Word2Vec Model Training
sentences = [text.split() for text in df["clean_text"]]
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=2, workers=4)

In [13]:
# Example: Word embedding for a sample word
word = "tunerluxury"  
if word in word2vec_model.wv:
    print(f"Word2Vec embedding for '{word}':", word2vec_model.wv[word])
else:
    print(f"'{word}' not found in Word2Vec vocabulary.")

Word2Vec embedding for 'tunerluxury': [ 9.0507627e-04  6.4770305e-03 -6.4871842e-03  6.9467081e-03
  1.0097923e-02  3.5949335e-03  3.5540434e-03  4.6282620e-03
 -2.0499863e-03 -8.2737878e-03  1.6538417e-03  5.1584817e-04
 -6.6134292e-03  1.9699475e-03  5.8008851e-03 -7.3629986e-03
  2.2750068e-03  7.3652314e-03 -8.8595133e-03  2.8714687e-03
 -8.6921090e-03 -7.6437937e-03  7.1683950e-03 -8.0605652e-03
 -1.3348481e-03 -7.7957073e-03 -7.7968929e-03 -7.1245488e-03
  1.3781819e-04  1.3669492e-03 -7.7214064e-03 -8.7740179e-03
 -3.0558482e-03 -1.9984511e-03 -6.4129625e-03 -3.8469916e-03
  3.7309532e-03  3.0830307e-03  6.0999831e-03  3.9166873e-03
 -1.1364750e-03  2.4373489e-03  8.8678058e-03  2.1793256e-03
  2.1238124e-03  7.0815551e-04 -9.0326313e-03  4.9239760e-03
  5.0107930e-03  2.6646428e-04  9.0237912e-03 -6.0921446e-03
  3.2548709e-03 -7.5186361e-03 -9.2814034e-03  1.9541003e-03
  5.0730356e-03  5.8909254e-03 -8.8809226e-03  4.3091187e-03
  2.6083004e-03  4.0412378e-03  7.2034309e-06  

In [14]:
# Save Word2Vec Model
word2vec_model.save("word2vec_car_model.model")
print("Word2Vec model saved successfully.")

Word2Vec model saved successfully.
