In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
import xgboost as xgb
import matplotlib.pyplot as plt

# Read the raw listings from disk and echo them for a quick visual check
df = pd.read_csv('Pune house data.csv')
print(df)

# Data preprocessing: discard the 'society' column before any further steps
df = df.drop(columns=['society'])

# Square feet per one unit of each alternative area unit. Keys are matched as
# a case-insensitive suffix of the raw value (e.g. "34.46Sq. Meter").
_SQFT_PER_UNIT = {
    'sq. meter': 10.7639,
    'sq. yards': 9.0,
    'acres': 43560.0,
    'cents': 435.6,
    'guntha': 1089.0,
    'grounds': 2400.0,
    'perch': 272.25,
}


# Function to convert different units of measurement to square feet
def convert_to_sqft(x):
    """Return the area described by *x* as a float number of square feet.

    Accepted forms, tried in this order:

    * anything ``float()`` already understands (``1056``, ``"1056"``);
    * a range ``"low - high"``, which is replaced by its midpoint;
    * a number followed by one of the unit names in ``_SQFT_PER_UNIT``.

    Anything else (including ``None`` and unparseable text) yields ``np.nan``
    so the caller can drop or impute the row.
    """
    try:
        return float(x)
    except (TypeError, ValueError):
        # TypeError covers None and other non-numeric objects.
        pass

    if not isinstance(x, str):
        return np.nan

    text = x.strip().lower()
    try:
        low, dash, high = text.partition('-')
        if dash:
            # A plain negative number was already handled by float() above,
            # so a dash here separates the two ends of a range.
            return (float(low) + float(high)) / 2

        for unit, factor in _SQFT_PER_UNIT.items():
            if text.endswith(unit):
                return float(text[:-len(unit)]) * factor
    except ValueError:
        # The numeric part of a range or unit value was not a number.
        pass

    return np.nan

# Apply the conversion function to 'total_sqft' column
df['total_sqft'] = df['total_sqft'].apply(convert_to_sqft)

# Drop rows with NaN values in 'total_sqft' column
df.dropna(subset=['total_sqft'], inplace=True)

numeric_cols = ['bath', 'balcony']
categorical_cols = ['area_type', 'availability', 'size', 'site_location']

# Decide train/test membership (by row position) BEFORE fitting any imputer,
# so that the fill values are learned from training rows only and no
# statistic of the held-out rows leaks into preprocessing.
train_rows, test_rows = train_test_split(
    np.arange(len(df)), test_size=0.2, random_state=42
)

# Impute missing values in numerical columns using the training-set mean
num_imputer = SimpleImputer(strategy='mean')
num_imputer.fit(df.iloc[train_rows][numeric_cols])
df[numeric_cols] = num_imputer.transform(df[numeric_cols])

# Impute missing values in categorical columns using the most frequent
# training-set category
cat_imputer = SimpleImputer(strategy='most_frequent')
cat_imputer.fit(df.iloc[train_rows][categorical_cols])
df[categorical_cols] = cat_imputer.transform(df[categorical_cols])

# One-hot encoding for categorical columns. drop_first=True removes the first
# level of each column, so that level is represented by all-zero dummies.
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Split the data into features (X) and target variable (y)
X = df.drop('price', axis=1)
y = df['price']

# Materialise the training and testing sets from the positions chosen above
X_train, X_test = X.iloc[train_rows], X.iloc[test_rows]
y_train, y_test = y.iloc[train_rows], y.iloc[test_rows]

# Candidate regressors, keyed by the label used when reporting results
models = dict([
    ('Linear Regression', LinearRegression()),
    ('Ridge Regression', Ridge()),
    ('Lasso Regression', Lasso()),
    ('Random Forest', RandomForestRegressor()),
    ('Gradient Boosting', GradientBoostingRegressor()),
    ('XGBoost', xgb.XGBRegressor()),
])

# Fit every candidate on the training split and score it on the test split
results = {}
for label, estimator in models.items():
    estimator.fit(X_train, y_train)
    test_pred = estimator.predict(X_test)
    results[label] = {
        'mse': mean_squared_error(y_test, test_pred),
        'r2': r2_score(y_test, test_pred),
    }

# Report the collected metrics, one section per model
for label, scores in results.items():
    print(
        f'{label}:',
        f'Mean Squared Error: {scores["mse"]}',
        f'R-squared: {scores["r2"]}',
        '-------------------------',
        sep='\n',
    )

# Interactive section for predicting house price
def _ask_choice(title, options, prompt):
    """Show a numbered menu and return the code chosen by the user.

    Prints *title*, then one ``code: label`` line per entry of *options*,
    then keeps prompting until the entry is an integer that is a key of
    *options*. Re-prompting avoids both a crash on non-numeric text and a
    silently ignored out-of-range code.
    """
    print(title)
    for code, option in options.items():
        print(f'{code}: {option}')
    while True:
        try:
            choice = int(input(prompt))
        except ValueError:
            print('Invalid input. Please enter one of the listed codes.')
            continue
        if choice in options:
            return choice
        print('Invalid input. Please enter one of the listed codes.')


def _ask_float(prompt):
    """Prompt until the user enters something ``float()`` can parse."""
    while True:
        try:
            return float(input(prompt))
        except ValueError:
            print('Invalid input. Please enter a number.')


print('\nInput features to predict house price:')

# Area Type selection
area_type_options = {
    1: 'Super built-up Area',
    2: 'Plot Area',
    3: 'Built-up Area',
}
area_type_code = _ask_choice(
    'Area Type Options:',
    area_type_options,
    'Select Area Type (Enter the corresponding code): ',
)

# Availability selection
availability_options = {
    1: '19-Dec',
    2: 'Ready To Move',
}
availability_code = _ask_choice(
    'Availability Options:',
    availability_options,
    'Select Availability (Enter the corresponding code): ',
)

# Size selection
size_options = {
    1: '2 BHK',
    2: '4 Bedroom',
    3: '3 BHK',
}
size_code = _ask_choice(
    'Size Options:',
    size_options,
    'Select Size (Enter the corresponding code): ',
)

# Numeric features
total_sqft = _ask_float('Total Square Feet: ')
bath = _ask_float('Number of Bathrooms: ')
balcony = _ask_float('Number of Balconies: ')

# Location selection
location_options = {
    1: 'Alandi Road',
    2: 'Ambegaon Budruk',
    3: 'Anandnagar',
    4: 'Aundh',
    5: 'Aundh Road',
}
location_code = _ask_choice(
    'Location Options:',
    location_options,
    'Select Location (Enter the corresponding code): ',
)

# Create a DataFrame with the input features.
# The row must have exactly the columns the models were fitted on (same names,
# same order), otherwise predict() rejects it. Start from an all-zero row over
# the training feature columns and fill in only what the user supplied.
input_df = pd.DataFrame(0.0, index=[0], columns=X.columns)
input_df.loc[0, ['total_sqft', 'bath', 'balcony']] = [total_sqft, bath, balcony]

# Category labels in the data may differ from the menu labels in internal
# whitespace (e.g. doubled spaces), so dummy columns are looked up by their
# whitespace-normalised name.
normalised_columns = {' '.join(str(col).split()): col for col in X.columns}

selected_categories = {
    'area_type': area_type_options.get(area_type_code),
    'availability': availability_options.get(availability_code),
    'size': size_options.get(size_code),
    'site_location': location_options.get(location_code),
}
for prefix, label in selected_categories.items():
    if label is None:
        # Unknown menu code: leave every dummy of this feature at zero.
        continue
    dummy_column = normalised_columns.get(f'{prefix}_{label}')
    if dummy_column is not None:
        input_df.loc[0, dummy_column] = 1.0
    # No matching column means the label is the level removed by
    # drop_first=True (encoded as all zeros) or was never seen in the data;
    # in both cases the all-zero encoding is kept.

# Use the trained models to predict the house price
predictions = {
    name: model.predict(input_df)[0] for name, model in models.items()
}


# Bar chart comparing the price predicted by each model for the entered house
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(list(predictions.keys()), list(predictions.values()))
ax.set_xlabel('Models')
ax.set_ylabel('Predicted Price (in lakhs)')
ax.set_title('Predicted House Price Across Models')
# Tilt the model names so long labels do not overlap
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()


                  area_type   availability       size  society total_sqft  \
0      Super built-up  Area         19-Dec      2 BHK  Coomee        1056   
1                Plot  Area  Ready To Move  4 Bedroom  Theanmp       2600   
2            Built-up  Area  Ready To Move      3 BHK      NaN       1440   
3      Super built-up  Area  Ready To Move      3 BHK  Soiewre       1521   
4      Super built-up  Area  Ready To Move      2 BHK      NaN       1200   
...                     ...            ...        ...      ...        ...   
13315        Built-up  Area  Ready To Move  5 Bedroom  ArsiaEx       3453   
13316  Super built-up  Area  Ready To Move      4 BHK      NaN       3600   
13317        Built-up  Area  Ready To Move      2 BHK  Mahla T       1141   
13318  Super built-up  Area         18-Jun      4 BHK  SollyCl       4689   
13319  Super built-up  Area  Ready To Move      1 BHK      NaN        550   

       bath  balcony   price         site_location  
0       2.0      1.0  