In [1]:
import pandas as pd

In [2]:
# load train and test datasets
train_data = pd.read_csv("Data/Train.csv")
test_data = pd.read_csv("Data/Test.csv")

In [3]:
# shape of both datasets
print(f"shape of train data{train_data.shape}")
print(f"shape of test data{test_data.shape}")

shape of train data(13320, 9)
shape of test data(1480, 9)


In [4]:
# top 10 values from train and test data
print(f"top 10 values of train data \n {train_data.head()}")
print(f"top 10 values of test data \n {test_data.head()}")

top 10 values of train data 
               area_type   availability                  location       size  \
0  Super built-up  Area         19-Dec  Electronic City Phase II      2 BHK   
1            Plot  Area  Ready To Move          Chikka Tirupathi  4 Bedroom   
2        Built-up  Area  Ready To Move               Uttarahalli      3 BHK   
3  Super built-up  Area  Ready To Move        Lingadheeranahalli      3 BHK   
4  Super built-up  Area  Ready To Move                  Kothanur      2 BHK   

   society total_sqft  bath  balcony   price  
0  Coomee        1056   2.0      1.0   39.07  
1  Theanmp       2600   5.0      3.0  120.00  
2      NaN       1440   2.0      3.0   62.00  
3  Soiewre       1521   3.0      1.0   95.00  
4      NaN       1200   2.0      1.0   51.00  
top 10 values of test data 
               area_type   availability           location       size  society  \
0  Super built-up  Area  Ready To Move        Brookefield      2 BHK  Roeekbl   
1            Plot  Are

In [5]:
# check is there any nan values in any columns
print(f"check NaN values are present in data \n {train_data.isnull().sum()}")

check NaN values are present in data 
 area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64


In [6]:
# check is there any nan values in any columns of test data
print(f"check NaN values are present in data \n {test_data.isnull().sum()}")

check NaN values are present in data 
 area_type          0
availability       0
location           0
size               2
society          626
total_sqft         0
bath               7
balcony           69
price           1480
dtype: int64


In [7]:
# remove price column from both datasets
x_train = train_data.drop("price",axis=1)
y_train = train_data.price.values
x_test = test_data.drop("price", axis=1)
y_test = test_data.price.values

In [8]:
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((13320, 8), (1480, 8), (13320,), (1480,))

In [9]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 8 columns):
area_type       13320 non-null object
availability    13320 non-null object
location        13319 non-null object
size            13304 non-null object
society         7818 non-null object
total_sqft      13320 non-null object
bath            13247 non-null float64
balcony         12711 non-null float64
dtypes: float64(2), object(6)
memory usage: 832.6+ KB


In [10]:
# find categorical and numerical columns
column_names = x_train.columns
print(list(column_names))
num_cols = x_train._get_numeric_data().columns
print(list(num_cols))
cat_col = list(set(column_names) - set(num_cols))
print(cat_col)

['area_type', 'availability', 'location', 'size', 'society', 'total_sqft', 'bath', 'balcony']
['bath', 'balcony']
['area_type', 'size', 'availability', 'total_sqft', 'location', 'society']


In [11]:
full_data = pd.concat([x_train,x_test],axis=0)

In [12]:
full_data.shape

(14800, 8)

In [13]:
full_data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0


In [14]:
full_data.isnull().sum()

area_type          0
availability       0
location           1
size              18
society         6128
total_sqft         0
bath              80
balcony          678
dtype: int64

In [15]:
full_data = full_data.drop("society", axis=1)

In [16]:
# find categorical and numerical columns
column_names = full_data.columns
print(list(column_names))
num_cols = full_data._get_numeric_data().columns
print(list(num_cols))
cat_col = list(set(column_names) - set(num_cols))
print(cat_col)

['area_type', 'availability', 'location', 'size', 'total_sqft', 'bath', 'balcony']
['bath', 'balcony']
['area_type', 'size', 'availability', 'total_sqft', 'location']


In [17]:
# fill NaN values in cat columns with NONE on full_data
for i in cat_col:
    full_data[i] = full_data[i].fillna("NONE")

In [18]:
# replace NaN values in numerical columns with mean of that column in x_train data
for i in num_cols:
    mean_val = full_data[i].mean()
    print(mean_val)
    full_data[i] = full_data[i].fillna(value = mean_val)
#     x_train[i] = x_train[i].fillna(value = x_train.mean())

2.6987092391304346
1.582141339753576


In [19]:
full_data.head()

Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,1056,2.0,1.0
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,1440,2.0,3.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,1521,3.0,1.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,1200,2.0,1.0


In [20]:
full_data.isnull().sum()

area_type       0
availability    0
location        0
size            0
total_sqft      0
bath            0
balcony         0
dtype: int64

In [21]:
full_data["bath"].unique()

array([ 2.        ,  5.        ,  3.        ,  4.        ,  6.        ,
        1.        ,  9.        ,  2.69870924,  8.        ,  7.        ,
       11.        , 10.        , 14.        , 27.        , 12.        ,
       16.        , 40.        , 15.        , 13.        , 18.        ])

In [22]:
full_data.head()

Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,1056,2.0,1.0
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,1440,2.0,3.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,1521,3.0,1.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,1200,2.0,1.0


In [23]:
# apply label encoding to all categorical columns in full_data data
from sklearn.preprocessing import LabelEncoder
lab_enc = LabelEncoder()
for i in cat_col:
    lab_enc.fit(full_data[i])
    full_data[i] = lab_enc.transform(full_data[i])

In [24]:
full_data.head()

Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony
0,3,41,430,14,72,2.0,1.0
1,2,81,325,20,1330,5.0,3.0
2,0,81,1220,17,524,2.0,3.0
3,3,81,778,17,615,3.0,1.0
4,3,81,736,14,246,2.0,1.0


In [25]:
train_size = x_train.shape[0]
test_size = x_test.shape[0]

In [26]:
x_train = full_data[:train_size]
x_test = full_data[train_size:]

In [27]:
x_train.head()

Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony
0,3,41,430,14,72,2.0,1.0
1,2,81,325,20,1330,5.0,3.0
2,0,81,1220,17,524,2.0,3.0
3,3,81,778,17,615,3.0,1.0
4,3,81,736,14,246,2.0,1.0


In [28]:
x_test.head()

Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony
0,3,81,284,14,281,2.0,2.0
1,2,81,103,31,1246,9.0,2.0
2,2,27,534,20,753,5.0,2.0
3,3,81,721,17,391,3.0,1.0
4,3,81,727,14,202,2.0,1.0


In [29]:
# build linear regression model 
from sklearn.linear_model import LinearRegression
linear_re = LinearRegression()
# fit model on train data
linear_re.fit(x_train, y_train)

LinearRegression()

In [30]:
predicts = linear_re.predict(x_test)

In [31]:
y_list = list(predicts)

In [32]:
pred_df = pd.DataFrame({"price":y_list})

In [33]:
test_data_sum = pd.read_csv("Data/Test.csv")

In [34]:
test_data_sum.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,Ready To Move,Brookefield,2 BHK,Roeekbl,1225,2.0,2.0,
1,Plot Area,Ready To Move,Akshaya Nagar,9 Bedroom,,2400,9.0,2.0,
2,Plot Area,18-Apr,Hennur Road,4 Bedroom,Saandtt,1650,5.0,2.0,
3,Super built-up Area,Ready To Move,Kodichikkanahalli,3 BHK,Winerri,1322,3.0,1.0,
4,Super built-up Area,Ready To Move,Konanakunte,2 BHK,AmageSa,1161,2.0,1.0,


In [35]:
test_data_sum.update(pred_df)

In [37]:
test_data_sum.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,Ready To Move,Brookefield,2 BHK,Roeekbl,1225,2.0,2.0,59.671178
1,Plot Area,Ready To Move,Akshaya Nagar,9 Bedroom,,2400,9.0,2.0,428.050479
2,Plot Area,18-Apr,Hennur Road,4 Bedroom,Saandtt,1650,5.0,2.0,219.680061
3,Super built-up Area,Ready To Move,Kodichikkanahalli,3 BHK,Winerri,1322,3.0,1.0,109.841452
4,Super built-up Area,Ready To Move,Konanakunte,2 BHK,AmageSa,1161,2.0,1.0,53.18943


In [38]:
test_data_sum.to_csv("Test_sub_bin.csv")

In [39]:
# build ridge regression model in x_train and y_train
from sklearn.linear_model import Ridge
ridge_re = Ridge(alpha=1.0)
# fit model
ridge_re.fit(x_train, y_train)

Ridge()

In [40]:
ridge_predicts = ridge_re.predict(x_test)

In [41]:
ridge_predicts[:6]

array([ 59.67135111, 428.04048106, 219.67518078, 109.84088883,
        53.18990466, 181.08978165])

In [42]:
pred_df_ridge = pd.DataFrame({"price":list(ridge_predicts)})

In [43]:
b

NameError: name 'b' is not defined

In [44]:
submission_ridge.head()

NameError: name 'submission_ridge' is not defined

In [45]:
submission_ridge.update(pred_df_ridge)

NameError: name 'submission_ridge' is not defined

In [42]:
submission_ridge.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,Ready To Move,Brookefield,2 BHK,Roeekbl,1225,2.0,2.0,58.529372
1,Plot Area,Ready To Move,Akshaya Nagar,9 Bedroom,,2400,9.0,2.0,428.071538
2,Plot Area,18-Apr,Hennur Road,4 Bedroom,Saandtt,1650,5.0,2.0,218.364235
3,Super built-up Area,Ready To Move,Kodichikkanahalli,3 BHK,Winerri,1322,3.0,1.0,106.324235
4,Super built-up Area,Ready To Move,Konanakunte,2 BHK,AmageSa,1161,2.0,1.0,56.418646


In [43]:
submission_ridge.to_csv("submission_ridge_bin.csv")

In [46]:
# build decision tree model
from sklearn.tree import DecisionTreeRegressor
dt_reg = DecisionTreeRegressor(random_state=42)
# fit model
dt_reg.fit(x_train, y_train)

DecisionTreeRegressor(random_state=42)

In [51]:
dt_predicts = dt_reg.predict(x_test)
submission_dt = pd.read_csv("Data/Test.csv")
pred_df_dt = pd.DataFrame({"price":list(dt_predicts)})
submission_dt.update(pred_df_dt)
submission_dt.to_csv("submission_decision_tree.csv")

In [52]:
# build random forest regressor model
from sklearn.ensemble import RandomForestRegressor
random_f_reg = RandomForestRegressor(random_state=42)
# fit model
random_f_reg.fit(x_train, y_train)

RandomForestRegressor(random_state=42)

In [53]:
rfreg_predicts = random_f_reg.predict(x_test)
submission_random = pd.read_csv("Data/Test.csv")
pred_df_random = pd.DataFrame({"price":list(rfreg_predicts)})
submission_random.update(pred_df_random)
submission_random.to_csv("submission_random_forest.csv")

In [None]:
# build gradient boosting algorithm
from sklearn.ensemble import GradientBoostingRegressor
reg = GradientBoostingRegressor(random_state=0)
