## Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Load data in pandas frame

In [2]:
df = pd.read_csv('/content/drive/My Drive/machine learning/regression/data/housing.csv')
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [3]:
df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


## Prepaire the Data for Train the Model

In [4]:
df.ocean_proximity.unique()

array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [6]:
df.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

here we can see that data is missing

so drop or fill with median value

In [7]:
mean = df.total_bedrooms.mean()

In [8]:
mean

537.8705525375618

In [9]:
df['total_bedrooms'] = df['total_bedrooms'].fillna(mean)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20640 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [11]:
df.isnull().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

## we are using encoding for ocean_proximity

In [12]:
df1= df.loc[:,:].values

In [13]:
df1

array([[-122.23, 37.88, 41.0, ..., 8.3252, 452600.0, 'NEAR BAY'],
       [-122.22, 37.86, 21.0, ..., 8.3014, 358500.0, 'NEAR BAY'],
       [-122.24, 37.85, 52.0, ..., 7.2574, 352100.0, 'NEAR BAY'],
       ...,
       [-121.22, 39.43, 17.0, ..., 1.7, 92300.0, 'INLAND'],
       [-121.32, 39.43, 18.0, ..., 1.8672, 84700.0, 'INLAND'],
       [-121.24, 39.37, 16.0, ..., 2.3886, 89400.0, 'INLAND']],
      dtype=object)

In [14]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import  ColumnTransformer

In [15]:
ct = ColumnTransformer(transformers=[('encoder',OneHotEncoder(),[9])],remainder='passthrough')

In [16]:
df1 = ct.fit_transform(df1)

In [17]:
ct.named_transformers_

{'encoder': OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
               handle_unknown='error', sparse=True),
 'remainder': 'passthrough'}

In [18]:
ct.get_feature_names

<bound method ColumnTransformer.get_feature_names of ColumnTransformer(n_jobs=None, remainder='passthrough', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('encoder',
                                 OneHotEncoder(categories='auto', drop=None,
                                               dtype=<class 'numpy.float64'>,
                                               handle_unknown='error',
                                               sparse=True),
                                 [9])],
                  verbose=False)>

In [27]:
df1

array([[0.0, 0.0, 0.0, ..., 126.0, 8.3252, 452600.0],
       [0.0, 0.0, 0.0, ..., 1138.0, 8.3014, 358500.0],
       [0.0, 0.0, 0.0, ..., 177.0, 7.2574, 352100.0],
       ...,
       [0.0, 1.0, 0.0, ..., 433.0, 1.7, 92300.0],
       [0.0, 1.0, 0.0, ..., 349.0, 1.8672, 84700.0],
       [0.0, 1.0, 0.0, ..., 530.0, 2.3886, 89400.0]], dtype=object)

## feature or target

In [19]:
df2 = pd.DataFrame(df1)

In [20]:
df2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0,0,0,1,0,-122.23,37.88,41,880,129,322,126,8.3252,452600
1,0,0,0,1,0,-122.22,37.86,21,7099,1106,2401,1138,8.3014,358500
2,0,0,0,1,0,-122.24,37.85,52,1467,190,496,177,7.2574,352100
3,0,0,0,1,0,-122.25,37.85,52,1274,235,558,219,5.6431,341300
4,0,0,0,1,0,-122.25,37.85,52,1627,280,565,259,3.8462,342200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,0,1,0,0,0,-121.09,39.48,25,1665,374,845,330,1.5603,78100
20636,0,1,0,0,0,-121.21,39.49,18,697,150,356,114,2.5568,77100
20637,0,1,0,0,0,-121.22,39.43,17,2254,485,1007,433,1.7,92300
20638,0,1,0,0,0,-121.32,39.43,18,1860,409,741,349,1.8672,84700


In [21]:
df2 = df2.drop(columns=[5,6])

In [22]:
df2.head()

Unnamed: 0,0,1,2,3,4,7,8,9,10,11,12,13
0,0,0,0,1,0,41,880,129,322,126,8.3252,452600
1,0,0,0,1,0,21,7099,1106,2401,1138,8.3014,358500
2,0,0,0,1,0,52,1467,190,496,177,7.2574,352100
3,0,0,0,1,0,52,1274,235,558,219,5.6431,341300
4,0,0,0,1,0,52,1627,280,565,259,3.8462,342200


In [23]:
X = df2.iloc[:,:-1].values
y = df2.iloc[:,-1].values

In [24]:
X

array([[0.0, 0.0, 0.0, ..., 322.0, 126.0, 8.3252],
       [0.0, 0.0, 0.0, ..., 2401.0, 1138.0, 8.3014],
       [0.0, 0.0, 0.0, ..., 496.0, 177.0, 7.2574],
       ...,
       [0.0, 1.0, 0.0, ..., 1007.0, 433.0, 1.7],
       [0.0, 1.0, 0.0, ..., 741.0, 349.0, 1.8672],
       [0.0, 1.0, 0.0, ..., 1387.0, 530.0, 2.3886]], dtype=object)

In [25]:
y.shape

(20640,)

## Train or test split the Data

In [26]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state = 42)

In [28]:
X_train.shape

(16512, 11)

In [29]:
X_test.shape

(4128, 11)

In [30]:
y_test.shape

(4128,)

##Scale the data

In [31]:
from sklearn.preprocessing import StandardScaler
std =StandardScaler()

In [32]:
X_train = std.fit_transform(X_train)
X_test = std.transform(X_test)

## Lets train train the model 

In [33]:
from sklearn.svm import SVR

In [33]:
svm = SVR(C=1000,degree=1,kernel='linear')

In [34]:
svm.fit(X_train,y_train)
y_pred = svm.predict(X_test)

In [35]:
y_pred.shape

(4128,)

Lets compare the values y perdictt or y test

In [36]:
print(np.concatenate((y_test.reshape(-1,1),y_pred.reshape(-1,1)),axis=1))

[[47700.0 56166.98498823264]
 [45800.0 95993.18851854604]
 [500001.0 249996.20612588144]
 ...
 [500001.0 417983.4995204832]
 [72300.0 112505.38577325971]
 [151500.0 178411.97663122055]]


In [37]:
from sklearn.metrics import  mean_absolute_error,mean_squared_error,r2_score

In [38]:
print(r2_score(y_test,y_pred))
print(mean_squared_error(y_test,y_pred))
print(mean_absolute_error(y_test,y_pred))

0.6039194473499944
5190275122.991848
50196.78441809772


## now increase the C value

In [39]:
svm = SVR(C=10000,degree=1,kernel='linear')
svm.fit(X_train,y_train)
y_pred = svm.predict(X_test)

In [40]:
print(r2_score(y_test,y_pred))
print(mean_squared_error(y_test,y_pred))
print(mean_absolute_error(y_test,y_pred))

0.6037181998751091
5192912287.971228
50258.06601120212


In [41]:
svm = SVR(C=100000,degree=1,kernel='linear')
svm.fit(X_train,y_train)
y_pred = svm.predict(X_test)
print(r2_score(y_test,y_pred))
print(mean_squared_error(y_test,y_pred))
print(mean_absolute_error(y_test,y_pred))

0.6035845602128527
5194663514.103927
50267.4358952392


## we are adding degree

In [42]:
svm = SVR(C=10000,degree=3,kernel='linear')
svm.fit(X_train,y_train)
y_pred = svm.predict(X_test)
print(r2_score(y_test,y_pred))
print(mean_squared_error(y_test,y_pred))
print(mean_absolute_error(y_test,y_pred))

0.6037181998751091
5192912287.971228
50258.06601120212


##kernel

###we are adding polynomial kernels

In [43]:
svm = SVR(C=1000,degree=3,kernel='poly',)
svm.fit(X_train,y_train)
y_pred = svm.predict(X_test)
print(r2_score(y_test,y_pred))
print(mean_squared_error(y_test,y_pred))
print(mean_absolute_error(y_test,y_pred))

0.5356588062817073
6084768692.155477
52468.24272540334


In [44]:
svm = SVR(C=500,degree=3,kernel='poly',)
svm.fit(X_train,y_train)
y_pred = svm.predict(X_test)
print(r2_score(y_test,y_pred))
print(mean_squared_error(y_test,y_pred))
print(mean_absolute_error(y_test,y_pred))

0.4769560488609047
6854014895.872649
56905.540894279235


In [45]:
svm = SVR(C=100000,degree=3,kernel='poly',)
svm.fit(X_train,y_train)
y_pred = svm.predict(X_test)
print(r2_score(y_test,y_pred))
print(mean_squared_error(y_test,y_pred))
print(mean_absolute_error(y_test,y_pred))

0.5990650825348529
5253887155.366028
46674.685892345035


###kernel rbf

In [46]:
svm = SVR(C=1000,degree=3,kernel='rbf',)
svm.fit(X_train,y_train)
y_pred = svm.predict(X_test)
print(r2_score(y_test,y_pred))
print(mean_squared_error(y_test,y_pred))
print(mean_absolute_error(y_test,y_pred))

0.6114538506523486
5091543625.660919
49170.780291448515


In [47]:
svm = SVR(C=10000,degree=3,kernel='rbf',)
svm.fit(X_train,y_train)
y_pred = svm.predict(X_test)
print(r2_score(y_test,y_pred))
print(mean_squared_error(y_test,y_pred))
print(mean_absolute_error(y_test,y_pred))

0.6831717532228228
4151745791.371365
43818.11341408086


In [48]:
svm = SVR(C=50000,degree=3,kernel='rbf',)
svm.fit(X_train,y_train)
y_pred = svm.predict(X_test)
print(r2_score(y_test,y_pred))
print(mean_squared_error(y_test,y_pred))
print(mean_absolute_error(y_test,y_pred))

0.6985317403304174
3950467141.2568
42560.49311590079


In [49]:
svm = SVR(C=100000,degree=3,kernel='rbf',)
svm.fit(X_train,y_train)
y_pred = svm.predict(X_test)
print(r2_score(y_test,y_pred))
print(mean_squared_error(y_test,y_pred))
print(mean_absolute_error(y_test,y_pred))

0.7008630094173517
3919917981.8346586
42239.62082659239


In [50]:
svm = SVR(C=500000,degree=3,kernel='rbf',)
svm.fit(X_train,y_train)
y_pred = svm.predict(X_test)
print(r2_score(y_test,y_pred))
print(mean_squared_error(y_test,y_pred))
print(mean_absolute_error(y_test,y_pred))

0.7047222652504448
3869345946.9045205
41806.06546219068


In [51]:
svm = SVR(C=1000000,degree=3,kernel='rbf',)
svm.fit(X_train,y_train)
y_pred = svm.predict(X_test)
print(r2_score(y_test,y_pred))
print(mean_squared_error(y_test,y_pred))
print(mean_absolute_error(y_test,y_pred))

0.7064269940223139
3847007028.023292
41700.077507179834


here we get good score than linear

In [54]:
svm = SVR(C=100000,degree=5,kernel='rbf',)
svm.fit(X_train,y_train)
y_pred = svm.predict(X_test)
print(r2_score(y_test,y_pred))
print(mean_squared_error(y_test,y_pred))
print(mean_absolute_error(y_test,y_pred))

0.7008630094173517
3919917981.8346586
42239.62082659239
