<a href="https://colab.research.google.com/github/royarkaofficial/DataOps-Lab8-31th-Dec-2024-Data-Preprocessing-and-Data-Extraction-Features/blob/main/Copy_of_Lab9_Data_Preprocessing_and_Text_feature_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder

# Read the raw CSV into a DataFrame.
# NOTE: this is an absolute Colab-style location; point it elsewhere when the
# file lives in a different directory.
file_path = "/content/sample_data/dataset_for_encoding_and_tfidf.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# Step 1: One-Hot Encoding for 'Category' Column
# sparse_output=False asks the encoder for a dense array, which can be wrapped
# in a DataFrame directly.
one_hot_encoder = OneHotEncoder(sparse_output=False)
encoded_categories = one_hot_encoder.fit_transform(df[['Category']])
category_columns = one_hot_encoder.get_feature_names_out(['Category'])
# Reuse df's index: pd.concat(axis=1) aligns rows by index label, so a fresh
# 0..n-1 index would misalign (and introduce NaN rows) if df's index were ever
# something other than the default RangeIndex.
category_df = pd.DataFrame(
    encoded_categories,
    columns=category_columns,
    index=df.index,
)

# Add one-hot encoded columns to the original dataset
df = pd.concat([df, category_df], axis=1)

# Step 2: Parse 'Date' once, then derive the Day, Month and Year columns from it.
parsed_dates = pd.to_datetime(df['Date'])
df['Date'] = parsed_dates  # keep the parsed datetime values in place of the raw ones
for column_name, component in (('Day', 'day'), ('Month', 'month'), ('Year', 'year')):
    # Each component is read from the .dt accessor of the parsed Series.
    df[column_name] = getattr(parsed_dates.dt, component)

# Step 3: Apply TFIDF Vectorizer on 'TextData' Column
# max_features=100 caps the vocabulary size to avoid high dimensionality.
tfidf_vectorizer = TfidfVectorizer(max_features=100)
# Missing text is replaced by an empty string before vectorizing: the
# vectorizer cannot process NaN entries, and an empty document simply yields
# an all-zero feature row.
tfidf_features = tfidf_vectorizer.fit_transform(df['TextData'].fillna(''))
# Reuse df's index so the column-wise concat below lines rows up by label
# instead of relying on df having a default 0..n-1 index.
tfidf_df = pd.DataFrame(
    tfidf_features.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
    index=df.index,
)

# Merge TFIDF features into the original dataset
df = pd.concat([df, tfidf_df], axis=1)

# Remove the raw 'Category', 'Date' and 'TextData' columns now that features
# have been derived from them; rebinding df avoids in-place mutation.
df = df.drop(columns=['Category', 'Date', 'TextData'])

# Write the processed frame to CSV without the index column.
processed_file_path = "processed_dataset.csv"
df.to_csv(path_or_buf=processed_file_path, index=False)

# Report where the output was written.
print(f"Preprocessing complete. Processed dataset saved at: {processed_file_path}")


Preprocessing complete. Processed dataset saved at: processed_dataset.csv
