In [8]:
# Load the dataset
# Assuming you have a CSV file named 'house_prices.csv' with the data
data = pd.read_csv('/kaggle/input/housing-prices-dataset/Housing.csv')

# Display first few rows of the dataset
print(data.head())

# Check for missing values
print(data.isnull().sum())


      price  area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0  13300000  7420         4          2        3      yes        no       no   
1  12250000  8960         4          4        4      yes        no       no   
2  12250000  9960         3          2        2      yes        no      yes   
3  12215000  7500         4          2        2      yes        no      yes   
4  11410000  7420         4          1        2      yes       yes      yes   

  hotwaterheating airconditioning  parking prefarea furnishingstatus  
0              no             yes        2      yes        furnished  
1              no             yes        3       no        furnished  
2              no              no        2      yes   semi-furnished  
3              no             yes        3      yes        furnished  
4              no             yes        2       no        furnished  
price               0
area                0
bedrooms            0
bathrooms           0
stories    

In [9]:
# Separate features and target variable
X = data.drop(columns='price')
y = data['price']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define categorical and numerical features
categorical_features = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning']
numerical_features = ['area', 'bedrooms', 'bathrooms', 'stories']

# Create transformers for numerical and categorical data
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(drop='first')

# Use ColumnTransformer to apply the transformations to the respective columns
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features)
])

# Define the model pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])

# Train the model
pipeline.fit(X_train, y_train)

In [10]:
# Transform the data
X_transformed = preprocessor.fit_transform(X)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=42)

# Create a multiple linear regression model
model = LinearRegression()

# Train the model using the training data
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate mean squared error and R-squared
mse = mean_squared_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r_squared}")

Mean Squared Error: 1997147293758.5425
R-squared: 0.6048828690833066


In [14]:
# Import necessary libraries
import pandas as pd
import numpy as np
from scipy import stats
from statsmodels.stats.weightstats import ztest as ztest
from scipy.stats import ttest_ind
from statsmodels.stats.anova import AnovaRM
from scipy.stats import f_oneway


# Load the dataset
data = pd.read_csv('/kaggle/input/housing-prices-dataset/Housing.csv')

# Z-Test
# Compare mean prices for houses connected to main road vs. not connected
mainroad_groups = data.groupby('mainroad')['price']
z_stat, z_p_value = ztest(mainroad_groups.get_group('yes'), mainroad_groups.get_group('no'))
print(f"Z-Test: z-statistic = {z_stat}, p-value = {z_p_value}")

# T-Test
# Compare mean prices for houses with guestroom vs. without guestroom
guestroom_groups = data.groupby('guestroom')['price']
t_stat, t_p_value = ttest_ind(guestroom_groups.get_group('yes'), guestroom_groups.get_group('no'))
print(f"T-Test: t-statistic = {t_stat}, p-value = {t_p_value}")

# Group the data by 'stories' and get the list of prices for each group
grouped_prices = data.groupby('stories')['price'].apply(list)

# Perform one-way ANOVA
f_stat, p_value = f_oneway(*grouped_prices)

print(f"ANOVA: F-statistic = {f_stat}, p-value = {p_value}")


Z-Test: z-statistic = 7.245125201307269, p-value = 4.320399162131221e-13
T-Test: t-statistic = 6.158593658077527, p-value = 1.4291136385580494e-09
ANOVA: F-statistic = 41.78230944700161, p-value = 2.6832459713031025e-24
