In [1]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Lasso
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import StandardScaler
import pickle

In [2]:
df= pd.read_csv('final_dataset.csv')

In [3]:
df.head()
df.drop(['Unnamed: 0'], axis= 'columns', inplace= True)

In [4]:
df

Unnamed: 0,total_sqft,bath,balcony,price,bhk,area_type_le,location_le
0,1056.0,2.0,1.0,39.07,2,3,77
1,2600.0,5.0,3.0,120.00,4,2,59
2,1440.0,2.0,3.0,62.00,3,0,220
3,1521.0,3.0,1.0,95.00,3,3,155
4,1200.0,2.0,1.0,51.00,2,3,147
...,...,...,...,...,...,...,...
11849,1715.0,3.0,3.0,112.00,3,3,85
11850,3453.0,4.0,0.0,231.00,5,0,229
11851,1141.0,2.0,1.0,60.00,2,0,189
11852,4689.0,4.0,1.0,488.00,4,3,180


In [5]:
algo_list= {
    'linear_regression': {
        'model': LinearRegression(),
        'params': {
            'copy_X': [True, False],
            'fit_intercept': [True, False],
            'n_jobs': [None, 1, 2, 4],
            'positive': [False, True]
        }
    }, 
    'lasso': {
        'model': Lasso(),
        'params': {
            'alpha': [1,2],
            'selection': ['random', 'cyclic'],
        }
    }, 
    'decision_tree': {
        'model': DecisionTreeRegressor(),
        'params': {
            'criterion': ['mse', 'friedman_mse'],
            'splitter': ['best', 'random']
        }
    }
}

In [6]:
X= df.drop('price', axis='columns')
y= df.price

In [7]:
sc= StandardScaler()
Xt= sc.fit_transform(X)

In [8]:
X = pd.DataFrame(Xt, columns=X.columns)

In [9]:
X

Unnamed: 0,total_sqft,bath,balcony,bhk,area_type_le,location_le
0,-0.414285,-0.464331,-0.729968,-0.654044,0.575031,-0.973011
1,0.884418,2.301586,1.745157,1.575763,-0.284109,-1.212006
2,-0.091291,-0.464331,1.745157,0.460860,-2.002388,0.925665
3,-0.023160,0.457642,-0.729968,0.460860,0.575031,0.062631
4,-0.293162,-0.464331,-0.729968,-0.654044,0.575031,-0.043589
...,...,...,...,...,...,...
11849,0.140019,0.457642,1.745157,0.460860,0.575031,-0.866792
11850,1.601901,1.379614,-1.967530,2.690667,-2.002388,1.045163
11851,-0.342789,-0.464331,-0.729968,-0.654044,-2.002388,0.514064
11852,2.641537,1.379614,-0.729968,1.575763,0.575031,0.394567


In [10]:
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size= 0.25)

In [11]:
scores= []
cv= ShuffleSplit(n_splits=5, random_state=0, test_size=0.25)

for algo_name, config in algo_list.items():
    grids= GridSearchCV(config['model'], config['params'], cv= cv, return_train_score= False)
    grids.fit(X_train, y_train)

    scores.append({
        'model': algo_name,
        'best_score': grids.best_score_,
        'best_params': grids.best_params_
    }  
    )

10 fits failed out of a total of 20.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\rubal\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\rubal\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "c:\Users\rubal\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\rubal\AppData\Local\Programs\Python\Python311\Lib\

In [12]:
scores

[{'model': 'linear_regression',
  'best_score': 0.266672343386572,
  'best_params': {'copy_X': True,
   'fit_intercept': True,
   'n_jobs': None,
   'positive': True}},
 {'model': 'lasso',
  'best_score': 0.2744034264232048,
  'best_params': {'alpha': 2, 'selection': 'cyclic'}},
 {'model': 'decision_tree',
  'best_score': 0.23712191461562995,
  'best_params': {'criterion': 'friedman_mse', 'splitter': 'best'}}]

In [13]:
sorted_results = sorted(scores, key=lambda x: x['best_score'], reverse=True)
sorted_results[0]

{'model': 'lasso',
 'best_score': 0.2744034264232048,
 'best_params': {'alpha': 2, 'selection': 'cyclic'}}

In [14]:
mdl= LinearRegression(fit_intercept= True, n_jobs= None, positive= True)

In [15]:
mdl.fit(X_train, y_train)

In [16]:
with open('hechPredictor_model', 'wb') as model:
    pickle.dump(mdl, model)

In [17]:
y_predicted= mdl.predict(X_test)

In [18]:
y_predicted.dtype

dtype('float64')

In [19]:
y_test.dtype

dtype('float64')

In [20]:
from sklearn.metrics import mean_absolute_error

# Assuming y_predicted is the predicted values from your model
mae = mean_absolute_error(y_test, y_predicted)
print(f'Mean Absolute Error: {mae}')


Mean Absolute Error: 42.17216225896651


In [23]:
import json

column_data= {
    'data_columns': [cols.lower() for cols in X.columns]
}

with  open('columns.json', 'w') as filex:
    filex.write(json.dumps(column_data))