# Feature Engineering on DK Housing Prices Dataset

## Identification of Attribute Types

In [ ]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Read the housing-price CSV (path is relative to the working directory)
csv_path = 'dk_housing_prices.csv'
df = pd.read_csv(csv_path)

# Preview the leading rows to sanity-check the load
df.head()

## Handling Missing Data

In [ ]:
# Count the null entries in every column, then show only the columns
# that actually contain at least one null
missing_data = df.isna().sum()
missing_data.loc[missing_data.gt(0)]

## Normalization of Numeric Attributes

In [ ]:
# Normalize numeric attributes (z-score: subtract the column mean, divide by
# the column standard deviation). The result overwrites the columns in `df`.
#
# NOTE(review): every numeric column is scaled in place, which includes
# `purchase_price` and any numeric identifier-like columns (e.g. `zip_code`,
# `no_rooms` if they are numeric). Later cells that group by or average these
# columns therefore operate on z-scores, not original units -- confirm this
# is intended.
numeric_cols = df.select_dtypes(include=[np.number]).columns
col_mean = df[numeric_cols].mean()
# A constant column has std == 0, which would turn the whole column into NaN
# (0 / 0). Dividing by 1 instead maps such a column to all zeros.
col_std = df[numeric_cols].std().replace(0, 1)
df[numeric_cols] = (df[numeric_cols] - col_mean) / col_std
df.head()

## Analysis of the 5 Most Expensive Cities and Cities with Maximum Unique Zip Codes

In [ ]:
# Average purchase price per city, ranked from highest to lowest;
# the first five entries are the most expensive cities
mean_price_by_city = df.groupby('city')['purchase_price'].mean()
expensive_cities = mean_price_by_city.sort_values(ascending=False).head(5)
expensive_cities

In [ ]:
# Number of distinct zip codes observed for each city, ranked descending;
# keep the five cities with the most distinct zip codes
zip_count_by_city = df.groupby('city')['zip_code'].nunique()
unique_zip_cities = zip_count_by_city.sort_values(ascending=False).head(5)
unique_zip_cities

## Proximity Measures

In [ ]:
# house_type vs sqm: summarised as the mean sqm within each house type
by_house_type = df.groupby('house_type')
house_type_sqm = by_house_type['sqm'].mean()
house_type_sqm

In [ ]:
# house_type vs purchase_price: summarised as the mean purchase price
# within each house type
by_house_type = df.groupby('house_type')
house_type_price = by_house_type['purchase_price'].mean()
house_type_price

In [ ]:
# no_rooms vs purchase_price: summarised as the mean purchase price
# for each distinct no_rooms value
by_room_count = df.groupby('no_rooms')
rooms_price = by_room_count['purchase_price'].mean()
rooms_price

## Feature Selection

In [ ]:
# Feature selection using filter methods
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression

# Define independent variables and target variable.
# The univariate scorers need a purely numeric, NaN-free matrix, so string
# columns (e.g. `city`, `house_type`) are left out and incomplete rows are
# dropped. Categorical columns would have to be encoded first to take part
# in the selection.
model_df = df.select_dtypes(include=[np.number]).dropna()
X = model_df.drop(columns=['purchase_price'])
y = model_df['purchase_price']

# Apply SelectKBest with f_regression.
# k is capped at the number of available features: SelectKBest rejects an
# integer k larger than the feature count.
kbest_f = SelectKBest(score_func=f_regression, k=min(5, X.shape[1]))
kbest_f.fit(X, y)
features_f = X.columns[kbest_f.get_support()]
features_f

In [ ]:
# Apply SelectKBest with mutual_info_regression
from functools import partial

# mutual_info_regression needs numeric, NaN-free input. Filter here as well
# so this cell works regardless of how X / y were prepared earlier.
X_mi = X.select_dtypes(include=[np.number])
complete_rows = X_mi.notna().all(axis=1) & y.notna()
X_mi = X_mi[complete_rows]
y_mi = y[complete_rows]

# The estimator is randomised; fixing random_state makes the selected
# feature set reproducible between runs. k is capped at the feature count.
mi_score = partial(mutual_info_regression, random_state=0)
kbest_mi = SelectKBest(score_func=mi_score, k=min(5, X_mi.shape[1]))
kbest_mi.fit(X_mi, y_mi)
features_mi = X_mi.columns[kbest_mi.get_support()]
features_mi

In [ ]:
# Apply correlation method
# Correlation is computed on numeric columns only: with string columns
# present, DataFrame.corr() raises on pandas >= 2.0.
correlation = df.select_dtypes(include=[np.number]).corr()
correlation_target = correlation['purchase_price'].abs()
# Keep features whose absolute correlation with the target exceeds 0.5.
# The target itself (self-correlation 1.0) is not a feature, so it is removed.
relevant_features = correlation_target[correlation_target > 0.5].drop(
    'purchase_price', errors='ignore'
)
relevant_features

## Correlation Plot

In [ ]:
# Plot correlation between independent features and target variable
# Only numeric columns enter the matrix: with string columns present,
# DataFrame.corr() raises on pandas >= 2.0.
numeric_corr = df.select_dtypes(include=[np.number]).corr()
plt.figure(figsize=(12, 8))
sns.heatmap(numeric_corr, annot=True, cmap='coolwarm')
plt.title('Correlation Plot')
plt.show()

## Visualizations and Other Plots for Analysis and Data Preprocessing

In [ ]:
# Histogram of purchase_price (30 bins) with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x='purchase_price', bins=30, kde=True, ax=ax)
ax.set_title('Distribution of Purchase Prices')
ax.set_xlabel('Purchase Price')
ax.set_ylabel('Frequency')
plt.show()

In [ ]:
# One box per house type showing the spread of purchase_price
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, x='house_type', y='purchase_price', ax=ax)
ax.set_title('Boxplot of Purchase Prices by House Type')
ax.set_xlabel('House Type')
ax.set_ylabel('Purchase Price')
plt.show()

In [ ]:
# Each listing as a point: sqm on the x-axis, purchase_price on the y-axis
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='sqm', y='purchase_price', ax=ax)
ax.set_title('Scatter Plot of SQM vs Purchase Price')
ax.set_xlabel('SQM')
ax.set_ylabel('Purchase Price')
plt.show()