In [1]:
import pandas as pd

# Read the raw vehicle table from disk.
df = pd.read_csv('data.csv')  # Replace with your actual CSV file path

# Keep only the descriptive (string-valued) columns and blank out missing
# entries so that every cell can take part in the string join below.
text_data = df[
    [
        'Make',
        'Model',
        'Engine Fuel Type',
        'Transmission Type',
        'Driven_Wheels',
        'Market Category',
        'Vehicle Size',
        'Vehicle Style',
    ]
].fillna('')

# One space-separated "sentence" per row, built from all selected columns
# in their listed order.
text_data['combined'] = text_data.apply(' '.join, axis=1)


In [6]:
# Preview the first five rows, including the new 'combined' column.
text_data.head(n=5)

Unnamed: 0,Make,Model,Engine Fuel Type,Transmission Type,Driven_Wheels,Market Category,Vehicle Size,Vehicle Style,combined
0,BMW,1 Series M,premium unleaded (required),MANUAL,rear wheel drive,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,BMW 1 Series M premium unleaded (required) MAN...
1,BMW,1 Series,premium unleaded (required),MANUAL,rear wheel drive,"Luxury,Performance",Compact,Convertible,BMW 1 Series premium unleaded (required) MANUA...
2,BMW,1 Series,premium unleaded (required),MANUAL,rear wheel drive,"Luxury,High-Performance",Compact,Coupe,BMW 1 Series premium unleaded (required) MANUA...
3,BMW,1 Series,premium unleaded (required),MANUAL,rear wheel drive,"Luxury,Performance",Compact,Coupe,BMW 1 Series premium unleaded (required) MANUA...
4,BMW,1 Series,premium unleaded (required),MANUAL,rear wheel drive,Luxury,Compact,Convertible,BMW 1 Series premium unleaded (required) MANUA...


In [8]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words model with the vectorizer's default settings.
vectorizer = CountVectorizer()

# Learn the vocabulary from the combined text and build the
# document-term count matrix in the same call.
bow_counts = vectorizer.fit_transform(raw_documents=text_data['combined'])

# Report (number of rows, number of vocabulary terms).
print("Shape of BoW matrix:", bow_counts.shape)



Shape of BoW matrix: (11914, 844)


In [3]:
from sklearn.feature_extraction.text import TfidfTransformer

# Re-scale the raw counts without inverse-document-frequency weighting
# (use_idf=False); every other option is left at the transformer's default.
tfidf_transformer = TfidfTransformer(use_idf=False)
tfidf_transformer.fit(bow_counts)
bow_normalized = tfidf_transformer.transform(bow_counts)

# The normalised matrix should have the same shape as the count matrix.
print("Shape of Normalized BoW matrix:", bow_normalized.shape)


Shape of Normalized BoW matrix: (11914, 844)


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF model with the vectorizer's default settings.
tfidf_vectorizer = TfidfVectorizer()

# Learn the vocabulary and IDF weights from the combined text and produce
# the weighted document-term matrix in one step.
tfidf_matrix = tfidf_vectorizer.fit_transform(raw_documents=text_data['combined'])

# Report (number of rows, number of vocabulary terms).
print("Shape of TF-IDF matrix:", tfidf_matrix.shape)


Shape of TF-IDF matrix: (11914, 844)


In [5]:
#!pip install gensim

from gensim.models import Word2Vec

# Tokenise each combined sentence on whitespace. Casing is left untouched,
# so the token 'BMW' looked up below is the same string that appears in the data.
sentences = text_data['combined'].str.split()

# Train Word2Vec model.
# Word2Vec training is stochastic: without a fixed seed the embedding printed
# below changes on every kernel restart. A fixed `seed` together with a single
# worker thread removes the thread-scheduling jitter.
# NOTE(review): gensim's documentation additionally mentions PYTHONHASHSEED
# for fully deterministic runs — confirm whether that matters for this setup.
w2v_model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=1,  # keep every token, even ones that occur only once
    workers=1,
    seed=42,
)

# Example: Get vector for the word 'BMW'
vector_bmw = w2v_model.wv['BMW']
print("Embedding for BMW:", vector_bmw)


Embedding for BMW: [-0.18225496  0.6020876   0.39647877  1.1751533   0.64410603 -0.5874551
 -0.22516513  0.4741485  -0.5840205  -0.11880846  0.434283   -1.8218771
 -0.06110358  0.5310612   0.43532264  0.16007024 -1.3422482   0.20247546
  0.16355193 -0.2447205  -0.43463904  0.6121245  -0.3841615   0.27130064
 -0.2490312   0.21892323 -0.16148993 -0.6271491  -0.50012255 -1.3022652
  0.3611992  -0.86586803  0.5438447   0.96169156 -0.15098502  0.01142188
  0.37367788  0.25914764 -0.12819438 -0.7945345   0.045445    0.22409493
  0.41601828  0.05856746  0.16332445  0.20867242 -1.3441039  -0.78727955
  0.4517859  -0.10749671 -1.0498703  -0.10321072  0.5435131   0.5235063
  0.85756373  0.19272748  0.39396343 -0.00354763  0.45613804  0.285287
 -0.21719052  1.1160896   0.2138951  -0.666685   -0.41761795 -0.54979557
 -0.36089253 -0.0919225   0.01066202  0.46685982 -0.63128513  0.1400746
  0.33512884  0.28315637 -0.37324867  0.35900033 -0.41798657 -1.0942667
 -0.37515077 -0.4542813   0.19512643 -0.

In [None]:
# NOTE(review): no cell visible here defines a variable named `text`; on a
# fresh kernel this bare expression would presumably raise NameError.
# TODO confirm the intent (e.g. an unfinished `text_data...` expression) or remove this cell.
text