In [6]:
# data_preprocessing.ipynb

# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler



In [None]:

# Read the raw property listings CSV into the working DataFrame `df`.
df = pd.read_csv(
    filepath_or_buffer='../data/raw/property_data1.csv',
)


In [None]:

# Data Preprocessing
# Replace missing text with empty strings so the vectorizer only ever sees str values.
df['description'] = df['description'].fillna('')
df['location'] = df['location'].fillna('')

# Target variable. `price` is what is being predicted, so it must NOT also appear
# among the features: an earlier version appended a scaled copy of it to X, which
# leaks the answer into the model inputs.
y = df['price']

# Train-test split.
# Split BEFORE fitting any transformer so the TF-IDF vocabulary and IDF weights are
# learned from training rows only. test_size and random_state are unchanged.
desc_train, desc_test, loc_train, loc_test, y_train, y_test = train_test_split(
    df['description'], df['location'], y, test_size=0.2, random_state=42
)


def _tfidf_features(train_text, test_text, prefix, max_features=1000):
    """Fit a TF-IDF vectorizer on `train_text` and transform both partitions.

    Parameters
    ----------
    train_text, test_text : pandas.Series of str
        Text for the training and test rows respectively.
    prefix : str
        Prepended to every term to form the column name (``f'{prefix}_{term}'``),
        which keeps columns from different text fields distinct after concatenation.
    max_features : int, default 1000
        Passed through to ``TfidfVectorizer``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Dense TF-IDF features for the training and test rows. Both frames share the
        same columns and keep the index of their input Series.
    """
    # A fresh vectorizer per text field: reusing one instance and refitting it
    # would throw away the vocabulary learned for the previous field.
    vectorizer = TfidfVectorizer(max_features=max_features)
    train_matrix = vectorizer.fit_transform(train_text)
    test_matrix = vectorizer.transform(test_text)

    # vocabulary_ maps term -> column index; sorting by that index yields the
    # terms in column order.
    terms = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)
    columns = [f'{prefix}_{term}' for term in terms]

    train_frame = pd.DataFrame(train_matrix.toarray(), columns=columns, index=train_text.index)
    test_frame = pd.DataFrame(test_matrix.toarray(), columns=columns, index=test_text.index)
    return train_frame, test_frame


# Vectorize the text features (description and location) independently.
X_desc_train, X_desc_test = _tfidf_features(desc_train, desc_test, 'desc')
X_loc_train, X_loc_test = _tfidf_features(loc_train, loc_test, 'loc')

# Combine all features (text only; the target is deliberately excluded).
X_train = pd.concat([X_desc_train, X_loc_train], axis=1)
X_test = pd.concat([X_desc_test, X_loc_test], axis=1)

# Sanity checks on the assembled matrices.
assert len(X_train) == len(y_train) and len(X_test) == len(y_test)
assert X_train.columns.is_unique, "feature column names must be unique"
assert list(X_train.columns) == list(X_test.columns)

# Save preprocessed data
X_train.to_csv('../data/processed/X_train.csv', index=False)
X_test.to_csv('../data/processed/X_test.csv', index=False)
y_train.to_csv('../data/processed/y_train.csv', index=False)
y_test.to_csv('../data/processed/y_test.csv', index=False)