<a href="https://colab.research.google.com/github/royarkaofficial/DataOps-Lab10-31th-Dec-2024-Feature-Engineering/blob/main/Copy_of_Lab10_Feature_Engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
#Load the dataset into the dataframe

df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/CSV Files/sample_car_sales_data.csv')
df.head()

Unnamed: 0,Price,Horsepower,Brand,Model,Date_of_Sale,Description
0,23437,342,Honda,Accord,2023-03-13,Luxury model with premium features
1,21723,264,Toyota,Prius,2023-06-14,"Low mileage, excellent condition"
2,44113,351,Honda,Civic,2024-03-04,"Low mileage, excellent condition"
3,48343,268,Honda,CR-V,2024-10-12,"Clean interior, no accidents"
4,28310,172,Audi,A3,2024-03-24,"Low mileage, excellent condition"


**Creating the new features**

In [4]:
# Method 1: Interaction term (here a ratio: price paid per unit of horsepower)
# Element-wise division of the 'Price' column by the 'Horsepower' column.
# NOTE(review): assumes both columns are numeric and Horsepower is never 0;
# a zero divisor would produce inf/NaN instead of raising - confirm against the data.
df['price_per_horsepower'] = df['Price'] / df['Horsepower']
# Last expression of the cell, so the first five rows are rendered as the output.
df.head()

Unnamed: 0,Price,Horsepower,Brand,Model,Date_of_Sale,Description,price_per_horsepower
0,23437,342,Honda,Accord,2023-03-13,Luxury model with premium features,68.52924
1,21723,264,Toyota,Prius,2023-06-14,"Low mileage, excellent condition",82.284091
2,44113,351,Honda,Civic,2024-03-04,"Low mileage, excellent condition",125.678063
3,48343,268,Honda,CR-V,2024-10-12,"Clean interior, no accidents",180.384328
4,28310,172,Audi,A3,2024-03-24,"Low mileage, excellent condition",164.593023


In [5]:
from sklearn.preprocessing import PolynomialFeatures

In [10]:
# Method 2: Polynomial Features

# List of numerical columns for which polynomial features will be created
num_col = ["Price", "Horsepower"]

# The `num_col` variable contains the names of the numerical columns.
# These are the columns for which we will generate polynomial features in the next steps.
num_col

['Price', 'Horsepower']

In [11]:
# Initialize the PolynomialFeatures transformer
# degree=2: This generates polynomial features up to the second degree (e.g., x, x^2, x1*x2, etc.)
# include_bias=False: This excludes the bias (constant) term from the transformed feature set
poly = PolynomialFeatures(degree=2, include_bias=False)

# Comment on the transformation:
# When we apply this transformer on data:
# - For each feature (x), it generates the original feature (x) and also its square (x^2).
# - It also generates the interaction terms between features (e.g., x1 * x2) for degree > 1.
# Thus, with degree=2 and two input columns x and y, the output columns are, in this order:
# - Original features: x (first column), y (second column)
# - Degree-2 terms: x^2, x*y, y^2
#   (the interaction term x*y sits BETWEEN the two squared terms, not after them)

# Example:
# Input: X = [[2, 3], [4, 5]]
# First row:  [2, 3, 2^2, 2*3, 3^2] => [2, 3, 4, 6, 9]
# Second row: [4, 5, 4^2, 4*5, 5^2] => [4, 5, 16, 20, 25]

In [12]:
# Apply PolynomialFeatures transformation to the selected numerical columns from the DataFrame (df)
# `num_col` contains the list of columns that we want to transform (e.g., 'Price', 'Horsepower')

# `fit_transform()` method:
# - `fit()` calculates the necessary transformations (such as generating polynomial features for degree 2).
# - `transform()` applies the transformation to the data in df[num_col], creating new polynomial features.

# poly_feature will now contain the original features along with the newly generated polynomial features.

poly_feature = poly.fit_transform(df[num_col])

# Display the transformed features
# poly_feature now contains the original columns and their polynomial transformations (e.g., squared terms, interaction terms)
poly_feature

array([[2.34370000e+04, 3.42000000e+02, 5.49292969e+08, 8.01545400e+06,
        1.16964000e+05],
       [2.17230000e+04, 2.64000000e+02, 4.71888729e+08, 5.73487200e+06,
        6.96960000e+04],
       [4.41130000e+04, 3.51000000e+02, 1.94595677e+09, 1.54836630e+07,
        1.23201000e+05],
       [4.83430000e+04, 2.68000000e+02, 2.33704565e+09, 1.29559240e+07,
        7.18240000e+04],
       [2.83100000e+04, 1.72000000e+02, 8.01456100e+08, 4.86932000e+06,
        2.95840000e+04],
       [3.61440000e+04, 2.75000000e+02, 1.30638874e+09, 9.93960000e+06,
        7.56250000e+04],
       [2.36650000e+04, 2.14000000e+02, 5.60032225e+08, 5.06431000e+06,
        4.57960000e+04],
       [4.53380000e+04, 2.51000000e+02, 2.05553424e+09, 1.13798380e+07,
        6.30010000e+04],
       [2.81200000e+04, 2.87000000e+02, 7.90734400e+08, 8.07044000e+06,
        8.23690000e+04],
       [2.39400000e+04, 3.62000000e+02, 5.73123600e+08, 8.66628000e+06,
        1.31044000e+05],
       [3.17570000e+04, 2.7200

In [14]:
# Step 1: Convert the polynomial features into a DataFrame
# `poly_feature` contains the transformed polynomial features (including original features, squared terms, and interaction terms).
# `poly.get_feature_names_out(num_col)` generates the names for each feature in the transformed data, like 'Price', 'Horsepower', 'Price^2', 'Price Horsepower', 'Horsepower^2'

df_poly = pd.DataFrame(poly_feature, columns = poly.get_feature_names_out(num_col))

# Step 2: Display the first few rows of the new DataFrame with polynomial features
df_poly.head()

Unnamed: 0,Price,Horsepower,Price^2,Price Horsepower,Horsepower^2
0,23437.0,342.0,549293000.0,8015454.0,116964.0
1,21723.0,264.0,471888700.0,5734872.0,69696.0
2,44113.0,351.0,1945957000.0,15483663.0,123201.0
3,48343.0,268.0,2337046000.0,12955924.0,71824.0
4,28310.0,172.0,801456100.0,4869320.0,29584.0


**Categorical values**

In [15]:
#Method 1 : Label encoding
from sklearn.preprocessing import LabelEncoder

In [16]:
import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/CSV Files/sample_car_sales_data.csv')
df.head()

Unnamed: 0,Price,Horsepower,Brand,Model,Date_of_Sale,Description
0,23437,342,Honda,Accord,2023-03-13,Luxury model with premium features
1,21723,264,Toyota,Prius,2023-06-14,"Low mileage, excellent condition"
2,44113,351,Honda,Civic,2024-03-04,"Low mileage, excellent condition"
3,48343,268,Honda,CR-V,2024-10-12,"Clean interior, no accidents"
4,28310,172,Audi,A3,2024-03-24,"Low mileage, excellent condition"


In [18]:
# Step 1: Initialize the LabelEncoder
# LabelEncoder converts categorical labels (e.g., strings) into integer codes 0..n_classes-1,
# assigned in the sorted order of the distinct labels.
# NOTE(review): scikit-learn documents LabelEncoder for target labels (y); for input features
# OrdinalEncoder / OneHotEncoder are the documented choices. The integer codes also imply an
# ordering between brands that has no real meaning - keep that in mind for linear models.
label_encoder = LabelEncoder()

# Step 2: Apply LabelEncoder to the 'Brand' column
# `fit_transform()` learns the unique values in the 'Brand' column and encodes them as integers
# Each unique category in 'Brand' will be assigned a different integer (starting from 0)
df["Brand_encoded"] = label_encoder.fit_transform(df["Brand"])

# Step 3: Display the DataFrame with the newly encoded 'Brand' column
# Only the last expression of a cell is rendered, so a bare `df.head()` placed before this
# line produced no output and has been removed.
df

Unnamed: 0,Price,Horsepower,Brand,Model,Date_of_Sale,Description,Brand_encoded
0,23437,342,Honda,Accord,2023-03-13,Luxury model with premium features,3
1,21723,264,Toyota,Prius,2023-06-14,"Low mileage, excellent condition",4
2,44113,351,Honda,Civic,2024-03-04,"Low mileage, excellent condition",3
3,48343,268,Honda,CR-V,2024-10-12,"Clean interior, no accidents",3
4,28310,172,Audi,A3,2024-03-24,"Low mileage, excellent condition",0
...,...,...,...,...,...,...,...
145,20079,308,Audi,Q5,2022-07-22,"Sport package, fully loaded",0
146,23492,390,Honda,CR-V,2024-06-29,Luxury model with premium features,3
147,22063,119,BMW,X3,2021-12-24,"Low mileage, excellent condition",1
148,32704,273,BMW,X5,2022-07-22,"Sport package, fully loaded",1


In [19]:
df.drop(columns=["Brand"],inplace = True)
df.head()

Unnamed: 0,Price,Horsepower,Model,Date_of_Sale,Description,Brand_encoded
0,23437,342,Accord,2023-03-13,Luxury model with premium features,3
1,21723,264,Prius,2023-06-14,"Low mileage, excellent condition",4
2,44113,351,Civic,2024-03-04,"Low mileage, excellent condition",3
3,48343,268,CR-V,2024-10-12,"Clean interior, no accidents",3
4,28310,172,A3,2024-03-24,"Low mileage, excellent condition",0


In [20]:
#Method 2 : One Hot encoding
from sklearn.preprocessing import OneHotEncoder

In [21]:
# Step 1: Initialize the OneHotEncoder
# `sparse_output=False`: This ensures that the output is a dense array (regular NumPy array).
# If set to True, the output would be a sparse matrix, which is more memory efficient, especially for large datasets with many zeros.
OHE = OneHotEncoder(sparse_output=False)

# Step 2: Apply OneHotEncoder to the 'Model' column
# `fit_transform()` learns the unique values in the 'Model' column and encodes them as a one-hot encoded matrix.
# Each unique value in 'Model' is transformed into a separate column, with a 1 indicating the presence of that value in a particular row and 0 otherwise.
Mode_Encoded = OHE.fit_transform(df[["Model"]])

# Step 3: Display the one-hot encoded result (dense array)
# This will print the one-hot encoded matrix (dense array), where each row corresponds to an entry in the 'Model' column and each column corresponds to a unique model.
print(Mode_Encoded)


[[0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [1. 0. 0. ... 0. 0. 0.]]


In [22]:
# Step 1: Convert the one-hot encoded array into a DataFrame
# `Mode_Encoded` contains the one-hot encoded data, and `OHE.get_feature_names_out(["Model"])` provides the column names for the new DataFrame
# These column names are the unique categories from the 'Model' column.
df_Mode_encoded = pd.DataFrame(Mode_Encoded, columns = OHE.get_feature_names_out(["Model"]))

# Step 2: Display the first few rows of the newly created one-hot encoded DataFrame
# `df_Mode_encoded.head()` will show the first 5 rows of the DataFrame with one-hot encoded features.
df_Mode_encoded.head()


Unnamed: 0,Model_A3,Model_A4,Model_Accord,Model_CR-V,Model_Camry,Model_Civic,Model_Corolla,Model_Explorer,Model_Fiesta,Model_Fit,Model_Focus,Model_M3,Model_Mustang,Model_Prius,Model_Q5,Model_RAV4,Model_TT,Model_X3,Model_X5,Model_i8
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
df_Mode_encoded

Unnamed: 0,Model_A3,Model_A4,Model_Accord,Model_CR-V,Model_Camry,Model_Civic,Model_Corolla,Model_Explorer,Model_Fiesta,Model_Fit,Model_Focus,Model_M3,Model_Mustang,Model_Prius,Model_Q5,Model_RAV4,Model_TT,Model_X3,Model_X5,Model_i8
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
146,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
148,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [24]:
df.drop(columns=["Model"],inplace = True)
df.head()

Unnamed: 0,Price,Horsepower,Date_of_Sale,Description,Brand_encoded
0,23437,342,2023-03-13,Luxury model with premium features,3
1,21723,264,2023-06-14,"Low mileage, excellent condition",4
2,44113,351,2024-03-04,"Low mileage, excellent condition",3
3,48343,268,2024-10-12,"Clean interior, no accidents",3
4,28310,172,2024-03-24,"Low mileage, excellent condition",0


In [25]:
df = pd.concat([df,df_Mode_encoded],axis=1)
df.head()

Unnamed: 0,Price,Horsepower,Date_of_Sale,Description,Brand_encoded,Model_A3,Model_A4,Model_Accord,Model_CR-V,Model_Camry,...,Model_Focus,Model_M3,Model_Mustang,Model_Prius,Model_Q5,Model_RAV4,Model_TT,Model_X3,Model_X5,Model_i8
0,23437,342,2023-03-13,Luxury model with premium features,3,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,21723,264,2023-06-14,"Low mileage, excellent condition",4,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,44113,351,2024-03-04,"Low mileage, excellent condition",3,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,48343,268,2024-10-12,"Clean interior, no accidents",3,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,28310,172,2024-03-24,"Low mileage, excellent condition",0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
# Persist the encoded frame. index=False keeps the default RangeIndex out of the file;
# otherwise it is written as an extra unnamed first column that comes back as
# 'Unnamed: 0' when the CSV is read again.
df.to_csv("Encoded_car_sales_data.csv", index=False)

**Handling Temporal Features (Date- and Time-based Info)**

In [28]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/CSV Files/sample_car_sales_data.csv')
df.head()

Unnamed: 0,Price,Horsepower,Brand,Model,Date_of_Sale,Description
0,23437,342,Honda,Accord,2023-03-13,Luxury model with premium features
1,21723,264,Toyota,Prius,2023-06-14,"Low mileage, excellent condition"
2,44113,351,Honda,Civic,2024-03-04,"Low mileage, excellent condition"
3,48343,268,Honda,CR-V,2024-10-12,"Clean interior, no accidents"
4,28310,172,Audi,A3,2024-03-24,"Low mileage, excellent condition"


In [29]:
print(df["Date_of_Sale"].dtypes)

object


In [31]:
# Step 1: Convert the 'Date_of_Sale' column to datetime format
# The pd.to_datetime() function converts the column into a datetime object, which allows for easier manipulation of dates.
df["Date_of_Sale"] = pd.to_datetime(df["Date_of_Sale"])

# Step 2: Print the data type of the 'Date_of_Sale' column
# This will display the datatype of the column after conversion. It should show 'datetime64[ns]', indicating that the column is now in datetime format.
print(df["Date_of_Sale"].dtypes)

datetime64[ns]


In [32]:
# Break 'Date_of_Sale' (already converted to datetime) into calendar components.
# The `.dt` accessor exposes the year / month / day of every row as integer Series.
sale_date = df["Date_of_Sale"].dt

# New column name -> values, listed in the order the columns are appended to `df`.
date_parts = {
    "Sales_Year": sale_date.year,
    "Sales_Month": sale_date.month,
    "Sales_Day": sale_date.day,
}
for column_name, values in date_parts.items():
    df[column_name] = values

# Show the first few rows to confirm the three new columns.
df.head()


Unnamed: 0,Price,Horsepower,Brand,Model,Date_of_Sale,Description,Sales_Year,Sales_Month,Sales_Day
0,23437,342,Honda,Accord,2023-03-13,Luxury model with premium features,2023,3,13
1,21723,264,Toyota,Prius,2023-06-14,"Low mileage, excellent condition",2023,6,14
2,44113,351,Honda,Civic,2024-03-04,"Low mileage, excellent condition",2024,3,4
3,48343,268,Honda,CR-V,2024-10-12,"Clean interior, no accidents",2024,10,12
4,28310,172,Audi,A3,2024-03-24,"Low mileage, excellent condition",2024,3,24


In [33]:
print(df["Sales_Year"].dtypes)

int32


In [34]:
df.drop(columns=["Date_of_Sale"],inplace = True)
df.head()

Unnamed: 0,Price,Horsepower,Brand,Model,Description,Sales_Year,Sales_Month,Sales_Day
0,23437,342,Honda,Accord,Luxury model with premium features,2023,3,13
1,21723,264,Toyota,Prius,"Low mileage, excellent condition",2023,6,14
2,44113,351,Honda,Civic,"Low mileage, excellent condition",2024,3,4
3,48343,268,Honda,CR-V,"Clean interior, no accidents",2024,10,12
4,28310,172,Audi,A3,"Low mileage, excellent condition",2024,3,24


**TF-IDF**

In [55]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [56]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/CSV Files/sample_car_sales_data.csv')
df.head()

Unnamed: 0,Price,Horsepower,Brand,Model,Date_of_Sale,Description
0,23437,342,Honda,Accord,2023-03-13,Luxury model with premium features
1,21723,264,Toyota,Prius,2023-06-14,"Low mileage, excellent condition"
2,44113,351,Honda,Civic,2024-03-04,"Low mileage, excellent condition"
3,48343,268,Honda,CR-V,2024-10-12,"Clean interior, no accidents"
4,28310,172,Audi,A3,2024-03-24,"Low mileage, excellent condition"


In [57]:
# Step 1: Initialize the TfidfVectorizer
# The TfidfVectorizer is used to convert a collection of text documents (like a description column) into a matrix of TF-IDF features.
# TF-IDF stands for Term Frequency - Inverse Document Frequency and is a statistic that helps identify important words in a document relative to the entire corpus.
tfidf = TfidfVectorizer()

# Step 2: Apply the TfidfVectorizer to the 'Description' column
# The 'fit_transform()' method learns the vocabulary of the 'Description' column and transforms the text into numerical features.
# Each document (row) in the 'Description' column is transformed into a sparse matrix where each column corresponds to a unique word in the corpus.
desc_tfidf = tfidf.fit_transform(df["Description"])

# 'desc_tfidf' is a sparse matrix containing the TF-IDF features, which can be converted into a dense matrix or DataFrame for further analysis.

In [51]:
# Step 1: Convert the sparse matrix to a DataFrame
# The 'desc_tfidf' is a sparse matrix, and we use 'toarray()' to convert it into a dense matrix (a NumPy array).
# The resulting array is then used to create a pandas DataFrame where each row corresponds to a document (a description) and each column to a unique word.
df_tfidf = pd.DataFrame(desc_tfidf.toarray(), columns = tfidf.get_feature_names_out())

# Step 2: Display the first few rows of the DataFrame to check the result
# This will show the first 5 rows of the DataFrame, which contains the TF-IDF values for each word in the descriptions.
df_tfidf.head()

Unnamed: 0,accidents,clean,condition,excellent,features,fully,interior,loaded,low,luxury,...,mileage,model,no,one,owner,package,premium,sport,well,with
0,0.0,0.0,0.0,0.0,0.447214,0.0,0.0,0.0,0.0,0.447214,...,0.0,0.447214,0.0,0.0,0.0,0.0,0.447214,0.0,0.0,0.447214
1,0.0,0.0,0.5,0.5,0.0,0.0,0.0,0.0,0.5,0.0,...,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.5,0.5,0.0,0.0,0.0,0.0,0.5,0.0,...,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.5,0.5,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,...,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.5,0.5,0.0,0.0,0.0,0.0,0.5,0.0,...,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [58]:
df.drop(columns=["Description"],inplace = True)

In [59]:
df = pd.concat([df,df_tfidf],axis=1)
df.head()

Unnamed: 0,Price,Horsepower,Brand,Model,Date_of_Sale,accidents,clean,condition,excellent,features,...,mileage,model,no,one,owner,package,premium,sport,well,with
0,23437,342,Honda,Accord,2023-03-13,0.0,0.0,0.0,0.0,0.447214,...,0.0,0.447214,0.0,0.0,0.0,0.0,0.447214,0.0,0.0,0.447214
1,21723,264,Toyota,Prius,2023-06-14,0.0,0.0,0.5,0.5,0.0,...,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,44113,351,Honda,Civic,2024-03-04,0.0,0.0,0.5,0.5,0.0,...,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,48343,268,Honda,CR-V,2024-10-12,0.5,0.5,0.0,0.0,0.0,...,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,28310,172,Audi,A3,2024-03-24,0.0,0.0,0.5,0.5,0.0,...,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Stop words Removal**

In [60]:
import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/CSV Files/sample_car_sales_data.csv')
df.head()

Unnamed: 0,Price,Horsepower,Brand,Model,Date_of_Sale,Description
0,23437,342,Honda,Accord,2023-03-13,Luxury model with premium features
1,21723,264,Toyota,Prius,2023-06-14,"Low mileage, excellent condition"
2,44113,351,Honda,Civic,2024-03-04,"Low mileage, excellent condition"
3,48343,268,Honda,CR-V,2024-10-12,"Clean interior, no accidents"
4,28310,172,Audi,A3,2024-03-24,"Low mileage, excellent condition"


In [62]:
# Step 1: Importing necessary libraries from NLTK
# NLTK (Natural Language Toolkit) is a popular library for text processing and natural language processing (NLP).
# The `stopwords` module provides a list of common words (like "and", "the", etc.) that are usually removed from text for NLP tasks.
# The `word_tokenize` module is used to split a text into individual words (tokens).

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [65]:
# Step 1: Downloading NLTK datasets for text processing

# Download the 'punkt' tokenizer models for splitting text into words or sentences.
# This is necessary for the word_tokenize and sent_tokenize functions in NLTK.
nltk.download('punkt')

# Download the 'punkt_tab' resource (the Punkt tokenizer data in tabular, non-pickled form).
# NOTE(review): newer NLTK releases load 'punkt_tab' rather than the pickled 'punkt' models
# when word_tokenize runs, so on those versions this download is required, not optional -
# confirm against the NLTK version installed in this runtime.
nltk.download('punkt_tab')

# Download the 'stopwords' corpus, which contains a list of common words in various languages (like "the", "and", "is").
# These are usually removed from text as they don't carry significant meaning in analysis.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [66]:
# Function to remove stopwords from a text
_ENGLISH_STOPWORDS = None  # lazily-built cache of NLTK's English stop-word list


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, reading the corpus only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def remove_stopwords(text, stop_words=None, tokenizer=None):
    """Return `text` with stop words removed, tokens re-joined by single spaces.

    Parameters
    ----------
    text : str
        The text to filter. Non-string values (e.g. the NaN pandas uses for a
        missing cell) yield an empty string instead of crashing the tokenizer.
    stop_words : collection of str, optional
        Lower-case words to drop. Defaults to NLTK's English stop-word list,
        which is loaded once and cached. Pass a set for fast lookups.
    tokenizer : callable, optional
        Function mapping a string to a list of tokens. Defaults to NLTK's
        `word_tokenize`.

    Returns
    -------
    str
        The surviving tokens joined with ' '. Tokens keep their original
        casing; only the stop-word comparison is case-insensitive. Punctuation
        tokens produced by the tokenizer are kept (e.g. "mileage , excellent").
    """
    if not isinstance(text, str):
        return ''

    # Resolve the defaults lazily so the NLTK resources are only touched when needed.
    if stop_words is None:
        # Previously the stop-word list was re-read and scanned linearly for every
        # single token; the cached frozenset gives one load and O(1) membership tests.
        stop_words = _english_stopwords()
    if tokenizer is None:
        tokenizer = word_tokenize

    # Step 1: tokenize; Step 2: drop stop words; Step 3: join back into one string.
    kept_tokens = [token for token in tokenizer(text) if token.lower() not in stop_words]
    return ' '.join(kept_tokens)

In [69]:
# Apply the 'remove_stopwords' function to the 'Description' column and store the result in a new column 'filtered_description'
df["filtered_description"] = df["Description"].apply(remove_stopwords)

# Display the first few rows of the DataFrame to check the new 'filtered_description' column
df.head()

Unnamed: 0,Price,Horsepower,Brand,Model,Date_of_Sale,Description,filtered_description
0,23437,342,Honda,Accord,2023-03-13,Luxury model with premium features,Luxury model premium features
1,21723,264,Toyota,Prius,2023-06-14,"Low mileage, excellent condition","Low mileage , excellent condition"
2,44113,351,Honda,Civic,2024-03-04,"Low mileage, excellent condition","Low mileage , excellent condition"
3,48343,268,Honda,CR-V,2024-10-12,"Clean interior, no accidents","Clean interior , accidents"
4,28310,172,Audi,A3,2024-03-24,"Low mileage, excellent condition","Low mileage , excellent condition"


**Bag of Words(BOW)**

In [68]:
import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/CSV Files/sample_car_sales_data.csv')
df.head()

Unnamed: 0,Price,Horsepower,Brand,Model,Date_of_Sale,Description
0,23437,342,Honda,Accord,2023-03-13,Luxury model with premium features
1,21723,264,Toyota,Prius,2023-06-14,"Low mileage, excellent condition"
2,44113,351,Honda,Civic,2024-03-04,"Low mileage, excellent condition"
3,48343,268,Honda,CR-V,2024-10-12,"Clean interior, no accidents"
4,28310,172,Audi,A3,2024-03-24,"Low mileage, excellent condition"


In [70]:
from sklearn.feature_extraction.text import CountVectorizer

In [71]:
# Initialize the CountVectorizer object
bow = CountVectorizer()

# Apply the CountVectorizer on the 'Description' column to create a BoW representation
df_bow = bow.fit_transform(df["Description"])

# Print the resulting Bag of Words matrix (a sparse matrix format)
print(df_bow)

  (0, 9)	1
  (0, 12)	1
  (0, 20)	1
  (0, 17)	1
  (0, 4)	1
  (1, 8)	1
  (1, 11)	1
  (1, 3)	1
  (1, 2)	1
  (2, 8)	1
  (2, 11)	1
  (2, 3)	1
  (2, 2)	1
  (3, 1)	1
  (3, 6)	1
  (3, 13)	1
  (3, 0)	1
  (4, 8)	1
  (4, 11)	1
  (4, 3)	1
  (4, 2)	1
  (5, 14)	1
  (5, 15)	1
  (5, 19)	1
  (5, 10)	1
  :	:
  (144, 14)	1
  (144, 15)	1
  (144, 19)	1
  (144, 10)	1
  (145, 18)	1
  (145, 16)	1
  (145, 5)	1
  (145, 7)	1
  (146, 9)	1
  (146, 12)	1
  (146, 20)	1
  (146, 17)	1
  (146, 4)	1
  (147, 8)	1
  (147, 11)	1
  (147, 3)	1
  (147, 2)	1
  (148, 18)	1
  (148, 16)	1
  (148, 5)	1
  (148, 7)	1
  (149, 8)	1
  (149, 11)	1
  (149, 3)	1
  (149, 2)	1


In [72]:
# Convert the sparse matrix to a dense array and create a DataFrame
# `toarray()` densifies the counts; `bow.get_feature_names_out()` supplies one column name per vocabulary word.
# NOTE: this rebinds `df_bow` from the sparse matrix produced by `bow.fit_transform(...)` to a
# DataFrame, so the cell is not idempotent - re-running it without first re-running the
# fit_transform cell fails, because a DataFrame has no `toarray()` method.
df_bow = pd.DataFrame(df_bow.toarray(), columns = bow.get_feature_names_out())

# Display the first few rows of the resulting DataFrame
df_bow.head()

Unnamed: 0,accidents,clean,condition,excellent,features,fully,interior,loaded,low,luxury,...,mileage,model,no,one,owner,package,premium,sport,well,with
0,0,0,0,0,1,0,0,0,0,1,...,0,1,0,0,0,0,1,0,0,1
1,0,0,1,1,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
2,0,0,1,1,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
3,1,1,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,0,0,1,1,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0


In [73]:
df.drop(columns=["Description"],inplace = True)

In [74]:
df = pd.concat([df,df_bow],axis=1)
df.head()

Unnamed: 0,Price,Horsepower,Brand,Model,Date_of_Sale,filtered_description,accidents,clean,condition,excellent,...,mileage,model,no,one,owner,package,premium,sport,well,with
0,23437,342,Honda,Accord,2023-03-13,Luxury model premium features,0,0,0,0,...,0,1,0,0,0,0,1,0,0,1
1,21723,264,Toyota,Prius,2023-06-14,"Low mileage , excellent condition",0,0,1,1,...,1,0,0,0,0,0,0,0,0,0
2,44113,351,Honda,Civic,2024-03-04,"Low mileage , excellent condition",0,0,1,1,...,1,0,0,0,0,0,0,0,0,0
3,48343,268,Honda,CR-V,2024-10-12,"Clean interior , accidents",1,1,0,0,...,0,0,1,0,0,0,0,0,0,0
4,28310,172,Audi,A3,2024-03-24,"Low mileage , excellent condition",0,0,1,1,...,1,0,0,0,0,0,0,0,0,0


Data Normalization, Standardization, Scaling

In [75]:
#Min-Max Scaler
import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/CSV Files/sample_car_sales_data.csv')
df.head()

Unnamed: 0,Price,Horsepower,Brand,Model,Date_of_Sale,Description
0,23437,342,Honda,Accord,2023-03-13,Luxury model with premium features
1,21723,264,Toyota,Prius,2023-06-14,"Low mileage, excellent condition"
2,44113,351,Honda,Civic,2024-03-04,"Low mileage, excellent condition"
3,48343,268,Honda,CR-V,2024-10-12,"Clean interior, no accidents"
4,28310,172,Audi,A3,2024-03-24,"Low mileage, excellent condition"


In [76]:
from sklearn.preprocessing import MinMaxScaler

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Apply the MinMaxScaler to normalize the 'Horsepower' column and create a new column 'Normalized_Horsepower'
df["Normalized_Horsepower"] = scaler.fit_transform(df[["Horsepower"]])

# Display the first few rows of the DataFrame
df.head()

# Formula for MinMax Scaling
# x_scaled = (X - X.min(axis = 0)) / (X.max(axis = 0) - X.min(axis = 0))

Unnamed: 0,Price,Horsepower,Brand,Model,Date_of_Sale,Description,Normalized_Horsepower
0,23437,342,Honda,Accord,2023-03-13,Luxury model with premium features,0.805369
1,21723,264,Toyota,Prius,2023-06-14,"Low mileage, excellent condition",0.543624
2,44113,351,Honda,Civic,2024-03-04,"Low mileage, excellent condition",0.83557
3,48343,268,Honda,CR-V,2024-10-12,"Clean interior, no accidents",0.557047
4,28310,172,Audi,A3,2024-03-24,"Low mileage, excellent condition",0.234899


In [77]:
from sklearn.preprocessing import StandardScaler

# Initialize the StandardScaler
scaler = StandardScaler()

# Apply the StandardScaler to standardize the 'Horsepower' column and create a new column 'Standardized_Horsepower'
df["Standardized_Horsepower"] = scaler.fit_transform(df[["Horsepower"]])

# Display the first few rows of the DataFrame
df.head()

# Formula for Standardization (Z-score Normalization)
# z = (x - u) / s
# z = standardized value
# x = original value
# u = mean of training samples
# s = standard deviation

Unnamed: 0,Price,Horsepower,Brand,Model,Date_of_Sale,Description,Normalized_Horsepower,Standardized_Horsepower
0,23437,342,Honda,Accord,2023-03-13,Luxury model with premium features,0.805369,0.995471
1,21723,264,Toyota,Prius,2023-06-14,"Low mileage, excellent condition",0.543624,0.112185
2,44113,351,Honda,Civic,2024-03-04,"Low mileage, excellent condition",0.83557,1.097389
3,48343,268,Honda,CR-V,2024-10-12,"Clean interior, no accidents",0.557047,0.157482
4,28310,172,Audi,A3,2024-03-24,"Low mileage, excellent condition",0.234899,-0.92964


In [78]:
from sklearn.preprocessing import RobustScaler

# Initialize the RobustScaler
scaler = RobustScaler()

# Apply the RobustScaler to scale the 'Horsepower' column and create a new column 'Scaled_Horsepower'
df["Scaled_Horsepower"] = scaler.fit_transform(df[["Horsepower"]])

# Display the first few rows of the DataFrame
df.head()

# Formula for Robust Scaling
# X_scaled = (X - Q2) / IQR
# Q1 = 25th percentile, Q2 = 50th percentile (median), Q3 = 75th percentile
# IQR = Q3 - Q1


Unnamed: 0,Price,Horsepower,Brand,Model,Date_of_Sale,Description,Normalized_Horsepower,Standardized_Horsepower,Scaled_Horsepower
0,23437,342,Honda,Accord,2023-03-13,Luxury model with premium features,0.805369,0.995471,0.523169
1,21723,264,Toyota,Prius,2023-06-14,"Low mileage, excellent condition",0.543624,0.112185,0.056801
2,44113,351,Honda,Civic,2024-03-04,"Low mileage, excellent condition",0.83557,1.097389,0.576981
3,48343,268,Honda,CR-V,2024-10-12,"Clean interior, no accidents",0.557047,0.157482,0.080717
4,28310,172,Audi,A3,2024-03-24,"Low mileage, excellent condition",0.234899,-0.92964,-0.493274


Train Test Split

In [79]:
# Import necessary libraries
import pandas as pd
from sklearn.datasets import load_iris

# Load the Iris dataset from sklearn
iris = load_iris()

# Convert the dataset into a pandas DataFrame
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)

# Add the target (labels) column to the DataFrame
df["target"] = iris.target

# Display the first few rows of the DataFrame
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [80]:
# Import necessary libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Separate features (X) and target (y)
X = df.drop(columns=["target"])  # Drop the target column to get the feature columns
y = df["target"]  # The target column contains the species

# Split the dataset into training and testing sets (80:20 split)
# random_state fixes the shuffle so the same rows land in each subset on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Decision Tree Classifier
# The tree permutes candidate features at every split, so it is seeded too; without
# random_state the fitted tree (and the reported accuracy) can change between runs.
clf = DecisionTreeClassifier(random_state=42)

# Train the model on the training data
clf.fit(X_train, y_train)

# Predict on the test data
y_pred = clf.predict(X_test)

# Evaluate the model accuracy (fraction of test rows predicted correctly)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 1.0


In [81]:
# Import necessary libraries
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
import numpy as np

# Initialize the Decision Tree Classifier
# Seeded so that the per-fold scores are reproducible; an unseeded tree breaks ties
# between equally good splits at random and can score differently on each run.
clf = DecisionTreeClassifier(random_state=42)

# Perform cross-validation on the model using 5-fold cross-validation
# Each fold is held out once for scoring while the model is fitted on the other four.
scores = cross_val_score(clf, X, y, cv=5)

# Print the individual cross-validation scores for each fold
print("Cross-validation scores", scores)

# Print the mean of the cross-validation scores (average accuracy across all folds)
print("Mean accuracy", np.mean(scores))

Cross-validation scores [0.96666667 0.96666667 0.9        0.96666667 1.        ]
Mean accuracy 0.9600000000000002


Stratified sampling

In [82]:
# Import necessary libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Separate features (X) and target (y)
X = df.drop(columns=["target"])  # Drop the target column to get the feature columns
y = df["target"]  # The target column containing the species

# Split the dataset into training and testing sets using stratified sampling
# stratify=y keeps the class proportions of y the same in the training and the test subset.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Initialize the Decision Tree Classifier
# Seeded so the comparison with the non-stratified split reflects the sampling strategy,
# not run-to-run randomness inside the tree.
clf = DecisionTreeClassifier(random_state=42)

# Train the model on the training data
clf.fit(X_train, y_train)

# Predict on the test data
y_pred = clf.predict(X_test)

# Evaluate the model accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")


Accuracy: 0.9333333333333333


In [84]:
# Make the target imbalanced by collapsing three classes into two:
# every label equal to 2 (virginica in the standard Iris encoding) is rewritten to 0,
# and all other labels (0 and 1) are kept unchanged. Class 0 therefore now holds the
# original class-0 samples plus the former class-2 samples, while class 1 is untouched.
# np.where returns a new NumPy array; the original `y` is not modified.
y_imbalanced = np.where(y==2 , 0 , y)

In [85]:
# Check the class distribution of the relabelled target.
# np.bincount counts how many times each non-negative integer value occurs, so
# position i of the result is the number of samples labelled i.
np.bincount(y_imbalanced)

array([100,  50])

In [86]:
# Hold-out evaluation on the imbalanced target (80% training, 20% testing).
#   X            : feature columns (target column excluded)
#   y_imbalanced : relabelled target with an uneven class distribution
#   test_size    : 0.2 -> 20% of the rows are held out for testing
#   random_state : fixed seed so the split is repeatable
# NOTE(review): this split is not stratified, so the class ratio in the test portion
# may differ from the ratio in y_imbalanced; pass stratify=y_imbalanced if the ratio
# should be preserved - confirm which behaviour the exercise intends.
X_train, X_test, y_train, y_test = train_test_split(X, y_imbalanced, test_size=0.2, random_state=42)

# Seed the estimator as well: scikit-learn's tree builder is randomized (see its
# `random_state` parameter), so an unseeded tree can report a different accuracy
# on every run even with a fixed split.
clf = DecisionTreeClassifier(random_state=42)

# Fit on the training features and their (imbalanced) labels.
clf.fit(X_train, y_train)

# Predict labels for the held-out features.
y_pred = clf.predict(X_test)

# Fraction of held-out rows whose predicted label matches the true label.
accuracy = accuracy_score(y_test, y_pred)

# Plain string instead of a placeholder-less f-string; the printed text is unchanged.
print("Accuracy : ", accuracy)

Accuracy :  1.0


Train Validation Test Split

In [87]:
# Train / validation / test workflow on a freshly loaded copy of the Iris data.
from sklearn.model_selection import train_test_split  # splitting into train, validation and test sets
from sklearn.datasets import load_iris  # the Iris dataset
from sklearn.tree import DecisionTreeClassifier  # the model being trained
from sklearn.metrics import accuracy_score  # evaluation metric used by the following cells

# Reload the raw arrays: `x` holds the feature matrix, `y` the integer class labels.
iris = load_iris()
x = iris.data
y = iris.target

# Step 1: set aside 20% of all samples as the final test set.
# The remaining 80% (X_train_val / y_train_val) is shared by training and validation.
X_train_val, X_test, y_train_val, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Step 2: split that 80% into 75% training and 25% validation.
# 25% of 80% is 20% of the full data, so the overall proportions are 60/20/20.
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)

# Seed the estimator so validation/test scores are repeatable; scikit-learn's tree
# builder is randomized (see its `random_state` parameter), so an unseeded tree can
# give different scores on every run even with fixed splits.
clf = DecisionTreeClassifier(random_state=42)

# Fit on the training portion only; validation and test data stay unseen.
clf.fit(X_train, y_train)

In [88]:
# Validation check: score the already-fitted tree on the validation split.

# Labels the model predicts for the validation features
y_val_pred = clf.predict(X_val)

# Share of validation samples whose predicted label equals the true label
accuracy_val = accuracy_score(y_true=y_val, y_pred=y_val_pred)

# Report the validation score
print("Validation accuracy ", accuracy_val)

Validation accuracy  0.9


In [89]:
# Final evaluation: score the fitted tree on the 20% test split held out at the start.

# Labels the model predicts for the test features
y_test_pred = clf.predict(X_test)

# Share of test samples whose predicted label equals the true label
accuracy_test = accuracy_score(y_true=y_test, y_pred=y_test_pred)

# Report the test score
print("Testing accuracy ", accuracy_test)

Testing accuracy  0.9333333333333333
