In [3]:
# import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
%matplotlib inline

In [4]:
# import data
data= pd.read_csv("housing.csv") 
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
0,-122.23,37.88,41,880,129.0,322,126,8.3252,NEAR BAY,452600
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,NEAR BAY,358500
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,NEAR BAY,352100
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,NEAR BAY,341300
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,NEAR BAY,342200


In [5]:
output_data=data["median_house_value"]
features= ["total_rooms","total_bedrooms","ocean_proximity","housing_median_age"]
input_data=data[features]

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  int64  
 3   total_rooms         20640 non-null  int64  
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  int64  
 6   households          20640 non-null  int64  
 7   median_income       20640 non-null  float64
 8   ocean_proximity     20640 non-null  object 
 9   median_house_value  20640 non-null  int64  
dtypes: float64(4), int64(5), object(1)
memory usage: 1.6+ MB


In [7]:
data.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [8]:
data.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
ocean_proximity         0
median_house_value      0
dtype: int64

In [9]:
data["total_bedrooms"].unique()

array([ 129., 1106.,  190., ..., 3008., 1857., 1052.])

In [10]:
data["total_bedrooms"].value_counts()

280.0     55
331.0     51
345.0     50
393.0     49
343.0     49
          ..
2205.0     1
1448.0     1
1691.0     1
2537.0     1
2546.0     1
Name: total_bedrooms, Length: 1923, dtype: int64

In [11]:
# non value filling with 280.0 its frequently occured  
data["total_bedrooms"].mode()[0]

280.0

In [12]:
data["total_bedrooms"]=data["total_bedrooms"].fillna(data["total_bedrooms"].mode()[0])
data["total_bedrooms"]

0         129.0
1        1106.0
2         190.0
3         235.0
4         280.0
          ...  
20635     374.0
20636     150.0
20637     485.0
20638     409.0
20639     616.0
Name: total_bedrooms, Length: 20640, dtype: float64

In [13]:
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
0,-122.23,37.88,41,880,129.0,322,126,8.3252,NEAR BAY,452600
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,NEAR BAY,358500
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,NEAR BAY,352100
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,NEAR BAY,341300
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,NEAR BAY,342200


In [14]:
data.isnull().sum() # converted all none value into most frequent value

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
ocean_proximity       0
median_house_value    0
dtype: int64

In [15]:
# encode the data
catogary = ['housing_median_age','total_rooms','total_bedrooms','population','households','median_income',
            'ocean_proximity']

In [16]:
for field in catogary:
    data[field]=pd.get_dummies(data[field])
data[field].head()

0    0
1    0
2    0
3    0
4    0
Name: ocean_proximity, dtype: uint8

In [17]:
data=data.drop(["median_house_value"],axis=1)

In [18]:
X,y=data.values,output_data.values
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
y=le.fit_transform(y)
y

array([3665, 3074, 3017, ...,  508,  432,  479])

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
x_train,x_test,y_train,y_test=train_test_split(X,y,train_size=0.8,test_size=0.2,random_state=42)

In [21]:
from sklearn.linear_model import LinearRegression
lin=LinearRegression()

In [22]:
lin.fit(X,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [23]:
lin.coef_

array([-621.87535401, -587.03891755,  -48.19398298,  335.67944687,
        901.60346838, 1036.03679017,  901.60346838, -235.3501304 ,
        247.58814306])

In [24]:
lin.intercept_

-51957.737109022266

In [25]:
lin.fit(x_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [26]:
lin.coef_

array([-616.57109388, -581.83829685,  185.07454139,  329.15813901,
        904.12554635, 1034.71425085,  904.12554635, -467.79326722,
        251.92186983])

In [27]:
lin.intercept_

-51509.54085540584

In [28]:
pred= lin.predict(x_test)


In [29]:
import math 
from sklearn.metrics import mean_squared_error
print(math.sqrt(mean_squared_error(pred,y_test)))

860.9897520764083


In [30]:
from sklearn.tree import DecisionTreeClassifier
print(x_train.shape)

(16512, 9)


In [31]:
cls=DecisionTreeClassifier()

In [32]:
cls.fit(x_train,y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [33]:
y_pred=cls.predict(x_test)

In [34]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_pred,y_test)

In [35]:
accuracy

0.02882751937984496

In [None]:
from sklearn.ensemble import RandomForestClassifier
cls=RandomForestClassifier()

In [None]:
cls.fit(x_train,y_train)

In [None]:
y_pred= cls.predict(x_test)

In [None]:
accuracy =accuracy_score(y_pred,y_test)