In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

# Load the raw listings from the CSV in the working directory.
data = pd.read_csv('Bengaluru_House_Data.csv')

# Drop the columns that are not used further down in this script.
data = data.drop(columns=['area_type', 'availability', 'society', 'balcony'])

# Handle missing values.
# The results are assigned back to the frame on purpose: calling
# `fillna(..., inplace=True)` on `data[col]` is chained assignment, which
# pandas 2.x deprecates and which does not update `data` under copy-on-write.
data['location'] = data['location'].fillna('Sarjapur Road')
data['size'] = data['size'].fillna('2 BHK')
data['bath'] = data['bath'].fillna(data['bath'].median())

# Convert 'total_sqft' to numeric and handle ranges
def convertRange(x):
    """Convert a 'total_sqft' cell to a float.

    A string of the form ``"low - high"`` is converted to the midpoint of the
    two numbers. Anything else is parsed with ``float``.

    Args:
        x: The raw cell value; usually a string, but numbers and missing
            values (``None`` / NaN) are accepted as well.

    Returns:
        The parsed value as a float, or ``None`` when the value cannot be
        interpreted as a number or a numeric range (e.g. "34.46Sq. Meter").
    """
    if isinstance(x, str):
        bounds = x.split('-')
        if len(bounds) == 2:
            try:
                return (float(bounds[0]) + float(bounds[1])) / 2
            except ValueError:
                # Not a numeric range after all (e.g. "-5" or "1e-3");
                # fall through and try to parse the whole string instead.
                pass
    try:
        return float(x)
    except (TypeError, ValueError):
        return None

# Parse 'total_sqft' cell by cell; ranges become their midpoint and values
# that cannot be parsed come back as missing.
data['total_sqft'] = data['total_sqft'].apply(convertRange)

# Create a new column 'bhk' from the first whitespace-separated token of 'size'
data['bhk'] = data['size'].str.split().str.get(0).astype(int)

# Check if 'balcony' column exists before filling missing values.
# The result is assigned back rather than using a chained in-place fillna,
# which is deprecated in pandas 2.x and ineffective under copy-on-write.
if 'balcony' in data.columns:
    data['balcony'] = data['balcony'].fillna(data['balcony'].median())

# Keep only listings with at least 300 sqft per bedroom. The explicit copy
# makes the filtered frame independent of the original, so adding a column
# below does not raise SettingWithCopyWarning.
data = data[(data['total_sqft'] / data['bhk']) >= 300].copy()
data['price_per_sqft'] = data['price'] / data['total_sqft']

# Function to remove outliers based on price_per_sqft
def remove_outliers_sqft(df):
    """Drop listings whose price_per_sqft is far from their location's mean.

    Rows are grouped by 'location'. Within each group only rows satisfying
    ``mean - std < price_per_sqft <= mean + std`` are kept, where the mean
    and standard deviation are those returned by ``np.mean`` / ``np.std``
    for that group's 'price_per_sqft'.

    Args:
        df: DataFrame with 'location' and 'price_per_sqft' columns.

    Returns:
        A new DataFrame holding the kept rows, concatenated group by group
        and re-indexed from 0. If ``df`` has no groups at all, an empty
        frame with the same columns as ``df`` is returned.
    """
    kept = []
    for _, subdf in df.groupby('location'):
        m = np.mean(subdf.price_per_sqft)
        st = np.std(subdf.price_per_sqft)
        # The lower bound is strict, so a group whose prices are all equal
        # (including a single-listing location) contributes no rows.
        in_band = (subdf.price_per_sqft > (m - st)) & (subdf.price_per_sqft <= (m + st))
        kept.append(subdf[in_band])

    if not kept:
        # pd.concat refuses an empty list; hand back an empty frame instead.
        return df.iloc[0:0].reset_index(drop=True)

    # Concatenate once at the end: growing a frame inside the loop recopies
    # every accumulated row on each iteration.
    return pd.concat(kept, ignore_index=True)

# Run the per-location price_per_sqft filter over the working frame.
data = data.pipe(remove_outliers_sqft)

# Function to remove outliers based on bhk
def bhk_outlier_remover(df):
    """Drop listings priced below the mean of the next-smaller bhk group.

    Within each 'location', a row with ``bhk == n`` is removed when its
    'price_per_sqft' is lower than the mean 'price_per_sqft' of the rows with
    ``bhk == n - 1`` in the same location, provided that smaller group has
    more than 5 rows.

    Args:
        df: DataFrame with 'location', 'bhk' and 'price_per_sqft' columns.

    Returns:
        A new DataFrame without the offending rows; the remaining rows keep
        their original index labels.
    """
    # Labels are collected in a plain list so they keep their original type;
    # appending to a float ndarray would coerce integer labels to float.
    to_drop = []
    for _, location_df in df.groupby('location'):
        by_bhk = {bhk: bhk_df for bhk, bhk_df in location_df.groupby('bhk')}
        # Per-bhk (mean price_per_sqft, row count) for this location.
        stats = {
            bhk: (np.mean(bhk_df.price_per_sqft), bhk_df.shape[0])
            for bhk, bhk_df in by_bhk.items()
        }

        for bhk, bhk_df in by_bhk.items():
            smaller = stats.get(bhk - 1)
            if smaller is None:
                continue
            smaller_mean, smaller_count = smaller
            # Only trust the smaller group's mean when it has enough rows.
            if smaller_count > 5:
                to_drop.extend(bhk_df[bhk_df.price_per_sqft < smaller_mean].index)
    return df.drop(to_drop, axis='index')

data = bhk_outlier_remover(data)

# Drop the helper columns that were only needed for cleaning
data.drop(columns=['size', 'price_per_sqft'], inplace=True)

# Save cleaned data (the frame's index is written as the first column)
data.to_csv("cleaned_data.csv")

# Split the data: 'price' is the target, every other column is a feature
X = data.drop(columns=['price'])
y = data['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Define column transformer
column_trans = make_column_transformer(
    # Locations not seen during fit are ignored instead of raising.
    (OneHotEncoder(handle_unknown='ignore', sparse_output=False), ['location']),
    (SimpleImputer(strategy='mean'), ['total_sqft', 'bath', 'bhk']),  # Impute missing values for numeric columns
    remainder='passthrough'
)

# Create the pipeline with Ridge regression
scaler = StandardScaler()
ridge = Ridge()
pipe = make_pipeline(column_trans, scaler, ridge)

# Fit the pipeline
pipe.fit(X_train, y_train)

# Make predictions
y_pred_ridge = pipe.predict(X_test)

# Evaluate Ridge regression
print("Ridge : ", r2_score(y_test, y_pred_ridge))

# Save the model; the context manager closes the file even if dumping
# fails, so the pickle is never left behind an open, unflushed handle.
import pickle
with open('RidgeModel.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)


Ridge :  0.8102520574062685
