In [317]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler
import numpy as np
import pickle

In [318]:
df = pd.read_csv("newdata.csv")

In [319]:
# Summarise column dtypes and non-null counts to see which columns need imputing.
# (The former positional 0 was bound to `verbose` and only obscured the call.)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7738 entries, 0 to 7737
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   price             7738 non-null   int64  
 1   area              7738 non-null   int64  
 2   bedrooms          7738 non-null   int64  
 3   bathrooms         7738 non-null   int64  
 4   balcony           5166 non-null   float64
 5   status            7164 non-null   object 
 6   neworold          7738 non-null   object 
 7   parking           2612 non-null   float64
 8   furnished_status  4124 non-null   object 
 9   lift              1733 non-null   float64
 10  type_of_building  7738 non-null   object 
 11  price_sqft        7738 non-null   float64
 12  city              7738 non-null   object 
dtypes: float64(4), int64(4), object(5)
memory usage: 786.0+ KB


In [320]:
train_set,test_set = train_test_split(df,test_size=0.2,random_state=42)

In [321]:
# NOTE: DataFrame.size is the number of cells (rows x columns), not the number
# of rows, so the figures printed here are element counts for each frame.
# Use len(frame) or frame.shape[0] when the row count is what you need.
print("tarin dateset size = ",train_set.size)
print("test dateset size = ",test_set.size)
print("whole dateset size = ",df.size)

tarin dateset size =  80470
test dateset size =  20124
whole dateset size =  100594


In [322]:
train_set.columns

Index(['price', 'area', 'bedrooms', 'bathrooms', 'balcony', 'status',
       'neworold', 'parking', 'furnished_status', 'lift', 'type_of_building',
       'price_sqft', 'city'],
      dtype='object')

In [323]:
def data_transformer(train_set,test_set):
    """Fit the preprocessing pipeline on the training split and transform both splits.

    Numeric columns are median-imputed and standardised. Categorical columns are
    imputed with their most frequent value, one-hot encoded and then scaled
    without centring. The fitted ColumnTransformer is pickled to
    "transformer.pkl" in the current working directory so the same
    preprocessing can be re-applied at prediction time.

    Parameters
    ----------
    train_set, test_set : pandas.DataFrame
        Splits that contain the "price" target column plus every feature
        column named in ``num_cols`` and ``cat_cols`` below.

    Returns
    -------
    tuple
        ``(train_features, train_target, test_features, test_target)`` where the
        feature matrices are the ColumnTransformer output and the targets are
        NumPy arrays.
    """
    train_features = train_set.drop(columns=["price"])
    train_target = train_set["price"]
    test_features = test_set.drop(columns=["price"])
    test_target = test_set["price"]

    # NOTE(review): in the rows shown by df.head(), price_sqft equals price / area
    # (e.g. 5600000 / 1350 = 4148.148148). If that holds for the whole dataset the
    # feature leaks the target -- confirm and consider dropping it.
    num_cols = ['area','bedrooms','bathrooms','balcony','parking','lift','price_sqft']
    cat_cols = ['status','neworold','furnished_status','type_of_building','city']

    # Pipeline takes a list of (step_name, estimator) tuples.
    num_pipeline = Pipeline([
        ("imputer",SimpleImputer(strategy="median")),
        ("scaler",StandardScaler()),
    ])
    cat_pipeline = Pipeline([
        ("imputer",SimpleImputer(strategy="most_frequent")),
        # handle_unknown="ignore": a category that never appeared in the training
        # split (e.g. a new city) is encoded as all zeros instead of raising
        # during transform() on the test split or on new data.
        ("ohe",OneHotEncoder(handle_unknown="ignore")),
        # with_mean=False: only rescale, do not centre the one-hot columns.
        ("scaler",StandardScaler(with_mean=False)),
    ])

    preprocessor = ColumnTransformer([
        ("num_pipeline",num_pipeline,num_cols),
        ("cat_pipeline",cat_pipeline,cat_cols),
    ])

    # Fit on the training split only; the test split is transformed with the
    # statistics learned from the training data.
    transformed_train_feature = preprocessor.fit_transform(train_features)
    transformed_test_feature = preprocessor.transform(test_features)

    with open("transformer.pkl","wb") as f:
        pickle.dump(preprocessor,f)

    return transformed_train_feature,np.array(train_target),transformed_test_feature,np.array(test_target)


In [324]:
x_train,y_train,x_test,y_test = data_transformer(train_set,test_set)

In [325]:
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score

def model_trainer(x_train,y_train,x_test,y_test,random_state=42):
    """Train several regressors, print their R2 scores and pickle the best one.

    Every candidate is fitted on ``(x_train, y_train)`` and scored with
    ``r2_score`` on ``(x_test, y_test)``. The highest-scoring estimator (the
    first one in dictionary order on ties) is written to "best_model.pkl" in
    the current working directory exactly as it was fitted for scoring.

    Parameters
    ----------
    x_train, y_train : training features and targets.
    x_test, y_test : evaluation features and targets.
    random_state : int, default 42
        Seed passed to the stochastic estimators so that scores and the saved
        model are reproducible between runs.

    Returns
    -------
    None

    NOTE(review): the model is selected on the same split that is reported as
    the test score, so the printed best score is optimistic; a separate
    validation split or cross-validation would give an unbiased estimate.
    """
    models = {
        "Linear Regression":LinearRegression(),
        "Decision Tree":DecisionTreeRegressor(random_state=random_state),
        "XG boost":XGBRegressor(random_state=random_state),
        "Ada boost":AdaBoostRegressor(random_state=random_state),
        "Gradient boost":GradientBoostingRegressor(random_state=random_state),
        "Random Forest":RandomForestRegressor(random_state=random_state),
    }

    report = {}
    for model_name, model in models.items():
        model.fit(x_train,y_train)
        y_pred = model.predict(x_test)
        r2 = r2_score(y_test,y_pred)
        report[model_name]=r2
        print("r2 scor for ",model_name,"=",r2)

    # max() over the keys returns the first name holding the highest score.
    key = max(report, key=report.get)
    print(report[key])

    # The estimator is already fitted; re-fitting here would save a model that
    # differs from the one whose score was just reported.
    best_model = models[key]
    with open("best_model.pkl","wb") as f:
        pickle.dump(best_model,f)

In [326]:
model_trainer(x_train,y_train,x_test,y_test)

r2 scor for  Linear Regression = 0.9168428547413072
r2 scor for  Decision Tree = 0.9911541876320484
r2 scor for  XG boost = 0.9898467659950256
r2 scor for  Ada boost = 0.8641949861362633
r2 scor for  Gradient boost = 0.9940988423305781
r2 scor for  Random Forest = 0.9956833462227336
0.9956833462227336


In [327]:
"""def model(x_train,y_train):
    lr = LinearRegression()
    lr.fit(x_train,y_train)
    return lr"""

'def model(x_train,y_train):\n    lr = LinearRegression()\n    lr.fit(x_train,y_train)\n    return lr'

In [328]:
"""lr = model(x_train,y_train)
y_pred = lr.predict(x_test)"""

'lr = model(x_train,y_train)\ny_pred = lr.predict(x_test)'

In [329]:
"""from sklearn.metrics import r2_score


print("r2score  = ",r2_score(y_test,y_pred))

with open("linear_regression.pkl","wb") as f:
    pickle.dump(lr,f)"""


'from sklearn.metrics import r2_score\n\n\nprint("r2score  = ",r2_score(y_test,y_pred))\n\nwith open("linear_regression.pkl","wb") as f:\n    pickle.dump(lr,f)'

In [330]:
df.head(5)

Unnamed: 0,price,area,bedrooms,bathrooms,balcony,status,neworold,parking,furnished_status,lift,type_of_building,price_sqft,city
0,5600000,1350,3,3,,Under Construction,New Property,,,2.0,Flat,4148.148148,Noida
1,8800000,1490,3,3,,Ready to Move,New Property,,Semi-Furnished,2.0,Flat,5906.040268,Gurgaon
2,16500000,2385,4,5,,Ready to Move,New Property,1.0,Unfurnished,,Flat,6918.238994,Ghaziabad
3,3810000,1050,2,2,3.0,,New Property,1.0,Unfurnished,2.0,Flat,3628.571429,Greater Noida
4,6200000,1350,2,2,3.0,Ready to Move,Resale,1.0,,3.0,Flat,4592.592593,Noida


In [331]:

#col = ["area","bedrooms","bathrooms","balcony","status","neworold","parking","furnished_status","lift","type_of_building","price_sqft","city"]

#new_data = pd.DataFrame([[1200,4,2,2,"Ready to Move","New Property",1,"Furnished",1,"Flat",3800,"New Delhi"]],columns=col)


# A single hypothetical listing used to smoke-test the saved transformer/model.
# It carries every feature column and deliberately omits the "price" target.
new_data = dict(
    area=1500,
    bedrooms=4,
    bathrooms=3,
    balcony=2,
    status="Ready to Move",
    neworold="New Property",
    parking=2,
    furnished_status="Furnished",
    lift=2,
    type_of_building="Flat",
    price_sqft=4500,
    city="New Delhi",
)

# Wrap the record in a list so pandas builds a one-row frame keyed by column name.
single_record = [new_data]
new_data_df = pd.DataFrame(single_record)

In [332]:
"""with open("transformer.pkl","rb") as f:
    trans = pickle.load(f)

    
with open("best_model.pkl","rb") as f:
    loaded_model = pickle.load(f)"""

'with open("transformer.pkl","rb") as f:\n    trans = pickle.load(f)\n\n    \nwith open("best_model.pkl","rb") as f:\n    loaded_model = pickle.load(f)'

In [333]:
def predicted_data(dict):
    """Predict with the pickled transformer and model for new feature data.

    Parameters
    ----------
    dict : Mapping or pandas.DataFrame
        Either a single record given as a mapping of column name to value
        (it is wrapped into a one-row DataFrame), or a DataFrame that is passed
        to the transformer unchanged. The parameter name shadows the builtin
        ``dict``; it is kept so existing callers keep working.

    Returns
    -------
    The value returned by the loaded model's ``predict`` for the transformed input.

    Raises
    ------
    FileNotFoundError
        If "transformer.pkl" or "best_model.pkl" is missing from the current
        working directory.
    """
    # Local import: the builtin name `dict` is shadowed by the parameter, so the
    # mapping check goes through the abstract base class instead.
    from collections.abc import Mapping

    if isinstance(dict, Mapping):
        features = pd.DataFrame([dict])
    else:
        features = dict

    # SECURITY: pickle.load can execute arbitrary code. Only load artefacts that
    # were produced by this notebook or come from a trusted source.
    with open("transformer.pkl","rb") as f:
        trans = pickle.load(f)

    with open("best_model.pkl","rb") as f:
        loaded_model = pickle.load(f)

    scaled_data = trans.transform(features)
    y_pred = loaded_model.predict(scaled_data)
    return y_pred


In [334]:
predicted_data(new_data_df)

array([6799000.])

In [337]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7738 entries, 0 to 7737
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   price             7738 non-null   int64  
 1   area              7738 non-null   int64  
 2   bedrooms          7738 non-null   int64  
 3   bathrooms         7738 non-null   int64  
 4   balcony           5166 non-null   float64
 5   status            7164 non-null   object 
 6   neworold          7738 non-null   object 
 7   parking           2612 non-null   float64
 8   furnished_status  4124 non-null   object 
 9   lift              1733 non-null   float64
 10  type_of_building  7738 non-null   object 
 11  price_sqft        7738 non-null   float64
 12  city              7738 non-null   object 
dtypes: float64(4), int64(4), object(5)
memory usage: 786.0+ KB


In [346]:
df["furnished_status"].value_counts()

furnished_status
Semi-Furnished    2199
Unfurnished       1230
Furnished          695
Name: count, dtype: int64