In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

from sklearn.preprocessing import Binarizer,LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import Ridge, Lasso

pd.set_option('display.max_columns',None)

In [2]:
# Scaler instances shared by the preprocessing cells.
# Only `standardScaler` is used in the cells shown in this notebook; it is
# re-fitted on each column it is applied to, so after those cells it only
# holds the statistics of the most recent fit.
standardScaler = StandardScaler()
minmaxScaler = MinMaxScaler()
normalizer = Normalizer()

In [3]:
data=pd.read_csv('USA_Housing.csv')
data

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,Address
0,79545.458574,5.682861,7.009188,4.09,23086.800503,1.059034e+06,"208 Michael Ferry Apt. 674\nLaurabury, NE 3701..."
1,79248.642455,6.002900,6.730821,3.09,40173.072174,1.505891e+06,"188 Johnson Views Suite 079\nLake Kathleen, CA..."
2,61287.067179,5.865890,8.512727,5.13,36882.159400,1.058988e+06,"9127 Elizabeth Stravenue\nDanieltown, WI 06482..."
3,63345.240046,7.188236,5.586729,3.26,34310.242831,1.260617e+06,USS Barnett\nFPO AP 44820
4,59982.197226,5.040555,7.839388,4.23,26354.109472,6.309435e+05,USNS Raymond\nFPO AE 09386
...,...,...,...,...,...,...,...
4995,60567.944140,7.830362,6.137356,3.46,22837.361035,1.060194e+06,USNS Williams\nFPO AP 30153-7653
4996,78491.275435,6.999135,6.576763,4.02,25616.115489,1.482618e+06,"PSC 9258, Box 8489\nAPO AA 42991-3352"
4997,63390.686886,7.250591,4.805081,2.13,33266.145490,1.030730e+06,"4215 Tracy Garden Suite 076\nJoshualand, VA 01..."
4998,68001.331235,5.534388,7.130144,5.44,42625.620156,1.198657e+06,USS Wallace\nFPO AE 73316


In [4]:
data.isna().sum()

Avg. Area Income                0
Avg. Area House Age             0
Avg. Area Number of Rooms       0
Avg. Area Number of Bedrooms    0
Area Population                 0
Price                           0
Address                         0
dtype: int64

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 7 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Avg. Area Income              5000 non-null   float64
 1   Avg. Area House Age           5000 non-null   float64
 2   Avg. Area Number of Rooms     5000 non-null   float64
 3   Avg. Area Number of Bedrooms  5000 non-null   float64
 4   Area Population               5000 non-null   float64
 5   Price                         5000 non-null   float64
 6   Address                       5000 non-null   object 
dtypes: float64(6), object(1)
memory usage: 273.6+ KB


In [6]:
data['Address'].value_counts()

Address
208 Michael Ferry Apt. 674\nLaurabury, NE 37010-5101             1
314 Christopher Square Apt. 404\nLake Ronaldville, SD 42025      1
21042 Wilson Islands Suite 238\nFischerchester, MP 42425-4129    1
Unit 8831 Box 5748\nDPO AE 73012-7314                            1
481 Kaitlin Mission Apt. 309\nJodystad, IA 16947                 1
                                                                ..
054 Carter Crescent Suite 674\nGlennport, WA 11140               1
8460 Kathleen Mission Apt. 482\nPort Amytown, KY 72016           1
3737 Hartman Rue\nReneestad, ID 69250-7718                       1
3465 Latoya Well\nNelsonmouth, MI 55741-4287                     1
37778 George Ridges Apt. 509\nEast Holly, NV 29290-3595          1
Name: count, Length: 5000, dtype: int64

# Income

In [8]:
data['Avg. Area Income'].value_counts().sort_index()

Avg. Area Income
17796.631190     1
35454.714659     1
35608.986237     1
35797.323122     1
35963.330809     1
                ..
101599.670580    1
101928.858060    1
102881.120902    1
104702.724257    1
107701.748378    1
Name: count, Length: 5000, dtype: int64

In [9]:
# Manual z-score of the income column: (value - column mean) / column std.
income_zscore = data['Avg. Area Income'].pipe(
    lambda col: (col - col.mean()) / col.std()
)
# Display the scores from most negative to most positive to spot extreme rows.
income_zscore.sort_values()

39     -4.765108
3069   -3.108315
2092   -3.093840
4855   -3.076169
1459   -3.060593
          ...   
2719    3.097822
962     3.128709
3541    3.218056
1734    3.388970
693     3.670358
Name: Avg. Area Income, Length: 5000, dtype: float64

In [10]:
def z_score_sort(column):
    """Return the z-scores of ``column`` sorted in ascending order.

    Each value is centred on the column mean and divided by the column's
    ``.std()``; the original index labels are kept, so the first and last
    entries identify the rows with the most extreme values.
    """
    centre = column.mean()
    spread = column.std()
    standardized = (column - centre) / spread
    return standardized.sort_values()

In [11]:
z_score_sort(data['Avg. Area Income'])

39     -4.765108
3069   -3.108315
2092   -3.093840
4855   -3.076169
1459   -3.060593
          ...   
2719    3.097822
962     3.128709
3541    3.218056
1734    3.388970
693     3.670358
Name: Avg. Area Income, Length: 5000, dtype: float64

In [12]:
# Overwrite the raw income column with its standardized values
# (StandardScaler fitted on this single column).
# NOTE(review): the scaler is fitted on the full dataset here, before the later
# train_test_split, so statistics of the test rows leak into the training
# features. The same pattern is used for house age, rooms and population.
# Consider splitting first and fitting the scaler on the training rows only.
data['Avg. Area Income']=standardScaler.fit_transform(data[['Avg. Area Income']])
# Show the transformed column.
data[['Avg. Area Income']]

Unnamed: 0,Avg. Area Income
0,1.028660
1,1.000808
2,-0.684629
3,-0.491499
4,-0.807073
...,...
4995,-0.752109
4996,0.929740
4997,-0.487235
4998,-0.054592


# Age

In [14]:
data['Avg. Area House Age'].value_counts().sort_index()

Avg. Area House Age
2.644304    1
2.683043    1
2.797215    1
2.797619    1
2.922736    1
           ..
8.973441    1
8.991399    1
9.008900    1
9.125283    1
9.519088    1
Name: count, Length: 5000, dtype: int64

In [15]:
z_score_sort(data['Avg. Area House Age'])

1074   -3.361639
1628   -3.322567
4488   -3.207411
4565   -3.207003
2465   -3.080808
          ...   
1777    3.022038
1091    3.040152
3138    3.057803
2898    3.175189
3989    3.572388
Name: Avg. Area House Age, Length: 5000, dtype: float64

In [16]:
data['Avg. Area House Age']=standardScaler.fit_transform(data[['Avg. Area House Age']])

In [17]:
data[['Avg. Area House Age']]

Unnamed: 0,Avg. Area House Age
0,-0.296927
1,0.025902
2,-0.112303
3,1.221572
4,-0.944834
...,...
4995,1.869297
4996,1.030822
4997,1.284470
4998,-0.446694


# Rooms

In [19]:
data['Avg. Area Number of Rooms'].value_counts().sort_index()

Avg. Area Number of Rooms
3.236194     1
3.950225     1
3.950973     1
3.969632     1
4.027931     1
            ..
10.024375    1
10.144988    1
10.219902    1
10.280022    1
10.759588    1
Name: count, Length: 5000, dtype: int64

In [20]:
z_score_sort(data['Avg. Area Number of Rooms'])

496    -3.729841
2771   -3.019951
1799   -3.019207
1757   -3.000657
3922   -2.942696
          ...   
3806    3.018973
3855    3.138886
2066    3.213366
3336    3.273138
1536    3.749922
Name: Avg. Area Number of Rooms, Length: 5000, dtype: float64

In [21]:
data['Avg. Area Number of Rooms']=standardScaler.fit_transform(data[['Avg. Area Number of Rooms']])

In [22]:
data[['Avg. Area Number of Rooms']]

Unnamed: 0,Avg. Area Number of Rooms
0,0.021274
1,-0.255506
2,1.516243
3,-1.393077
4,0.846742
...,...
4995,-0.845588
4996,-0.408686
4997,-2.170269
4998,0.141541


# Bedrooms

In [24]:
data['Avg. Area Number of Bedrooms'].value_counts().sort_index()

Avg. Area Number of Bedrooms
2.00    20
2.01    16
2.02    20
2.03    14
2.04    25
        ..
6.46    11
6.47     7
6.48     8
6.49    14
6.50    14
Name: count, Length: 255, dtype: int64

In [25]:
# Inspect the sorted z-scores of the bedrooms column.
# NOTE(review): unlike income, house age, rooms and population, no cell shown in
# this notebook standardizes this column, so it reaches the models on its raw
# scale. Confirm this is intentional — Ridge/Lasso penalties depend on feature scale.
z_score_sort(data['Avg. Area Number of Bedrooms'])

3313   -1.605437
662    -1.605437
1729   -1.605437
2947   -1.605437
986    -1.605437
          ...   
4689    2.040835
4051    2.040835
4902    2.040835
1995    2.040835
3552    2.040835
Name: Avg. Area Number of Bedrooms, Length: 5000, dtype: float64

# Population

In [27]:
data['Area Population'].value_counts().sort_index()

Area Population
172.610686      1
3285.450538     1
3883.448164     1
4114.489353     1
5727.485885     1
               ..
68311.695822    1
69553.988327    1
69575.449464    1
69592.040236    1
69621.713378    1
Name: count, Length: 5000, dtype: int64

In [28]:
z_score_sort(data['Area Population'])

314    -3.626050
1530   -3.312434
2756   -3.252187
2534   -3.228910
4491   -3.066402
          ...   
3991    3.238899
353     3.364059
228     3.366221
1595    3.367893
4803    3.370882
Name: Area Population, Length: 5000, dtype: float64

In [29]:
data['Area Population']=standardScaler.fit_transform(data[['Area Population']])

# Price

In [31]:
#target

# Address

In [33]:
data['Address'].value_counts()

Address
208 Michael Ferry Apt. 674\nLaurabury, NE 37010-5101             1
314 Christopher Square Apt. 404\nLake Ronaldville, SD 42025      1
21042 Wilson Islands Suite 238\nFischerchester, MP 42425-4129    1
Unit 8831 Box 5748\nDPO AE 73012-7314                            1
481 Kaitlin Mission Apt. 309\nJodystad, IA 16947                 1
                                                                ..
054 Carter Crescent Suite 674\nGlennport, WA 11140               1
8460 Kathleen Mission Apt. 482\nPort Amytown, KY 72016           1
3737 Hartman Rue\nReneestad, ID 69250-7718                       1
3465 Latoya Well\nNelsonmouth, MI 55741-4287                     1
37778 George Ridges Apt. 509\nEast Holly, NV 29290-3595          1
Name: count, Length: 5000, dtype: int64

In [34]:
import re  # no longer needed by this cell; kept in case other cells rely on it

# Pull out the two-letter state / military-post code that sits directly before
# a five-digit ZIP code. `Series.str.extract` applies the pattern to every
# address at once and returns the capture group of the first match.
# An address that does not match becomes NaN here (and is therefore labelled
# 'Unknown' by the region cell) instead of raising AttributeError from
# `re.search(...).group(1)` on a `None` result.
data['state'] = data['Address'].str.extract(r"([A-Z]{2})\s\d{5}", expand=False)

data['state']

0       NE
1       CA
2       WI
3       AP
4       AE
        ..
4995    AP
4996    AA
4997    VA
4998    AE
4999    NV
Name: state, Length: 5000, dtype: object

In [35]:
data['state'].value_counts()

state
AA    177
AP    170
AE    167
GU     91
OR     91
     ... 
KS     67
WI     67
IL     64
MD     62
VI     55
Name: count, Length: 62, dtype: int64

In [36]:
data['state'].unique()

array(['NE', 'CA', 'WI', 'AP', 'AE', 'KS', 'CO', 'TN', 'AA', 'NM', 'PW',
       'AR', 'HI', 'ME', 'IN', 'MI', 'DE', 'AZ', 'MA', 'MN', 'AL', 'NY',
       'NV', 'VA', 'ID', 'OK', 'NH', 'MO', 'WV', 'WY', 'MH', 'UT', 'SD',
       'CT', 'AK', 'WA', 'RI', 'NJ', 'KY', 'NC', 'IA', 'VT', 'FM', 'ND',
       'LA', 'MP', 'OR', 'TX', 'DC', 'PR', 'MT', 'AS', 'OH', 'MS', 'IL',
       'VI', 'GA', 'PA', 'MD', 'SC', 'GU', 'FL'], dtype=object)

In [1]:
#data['state'].tolist()

In [38]:
# Region label -> the state / territory / military codes assigned to it.
_REGION_MEMBERS = {
    'Northeast': ['ME', 'NH', 'VT', 'MA', 'RI', 'CT', 'NY', 'NJ', 'PA'],
    'Midwest': ['OH', 'MI', 'IN', 'IL', 'WI', 'MO', 'ND', 'SD', 'NE', 'KS', 'MN', 'IA'],
    'South': ['DE', 'MD', 'VA', 'WV', 'KY', 'NC', 'SC', 'GA', 'FL', 'AL', 'TN', 'MS', 'AR', 'LA', 'TX', 'OK'],
    'West': ['MT', 'WY', 'CO', 'NM', 'AZ', 'UT', 'NV', 'ID', 'WA', 'OR', 'CA', 'AK', 'HI'],
    'US Territories': ['DC', 'PR', 'GU', 'VI', 'AS', 'MP', 'FM', 'PW', 'MH', 'AP', 'AE', 'AA'],
}

# Invert to a per-code lookup; no code appears under more than one region.
_CODE_TO_REGION = {
    code: label
    for label, codes in _REGION_MEMBERS.items()
    for code in codes
}

# One region label per row, in row order; codes not listed above map to 'Unknown'.
region = [_CODE_TO_REGION.get(state, 'Unknown') for state in data['state'].tolist()]
        

In [39]:
data['region']=region

In [40]:
data.region.value_counts()

region
South             1237
US Territories    1184
West               994
Midwest            901
Northeast          684
Name: count, dtype: int64

In [41]:
data

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,Address,state,region
0,1.028660,-0.296927,0.021274,4.09,-1.317599,1.059034e+06,"208 Michael Ferry Apt. 674\nLaurabury, NE 3701...",NE,Midwest
1,1.000808,0.025902,-0.255506,3.09,0.403999,1.505891e+06,"188 Johnson Views Suite 079\nLake Kathleen, CA...",CA,West
2,-0.684629,-0.112303,1.516243,5.13,0.072410,1.058988e+06,"9127 Elizabeth Stravenue\nDanieltown, WI 06482...",WI,Midwest
3,-0.491499,1.221572,-1.393077,3.26,-0.186734,1.260617e+06,USS Barnett\nFPO AP 44820,AP,US Territories
4,-0.807073,-0.944834,0.846742,4.23,-0.988387,6.309435e+05,USNS Raymond\nFPO AE 09386,AE,US Territories
...,...,...,...,...,...,...,...,...,...
4995,-0.752109,1.869297,-0.845588,3.46,-1.342732,1.060194e+06,USNS Williams\nFPO AP 30153-7653,AP,US Territories
4996,0.929740,1.030822,-0.408686,4.02,-1.062747,1.482618e+06,"PSC 9258, Box 8489\nAPO AA 42991-3352",AA,US Territories
4997,-0.487235,1.284470,-2.170269,2.13,-0.291937,1.030730e+06,"4215 Tracy Garden Suite 076\nJoshualand, VA 01...",VA,South
4998,-0.054592,-0.446694,0.141541,5.44,0.651116,1.198657e+06,USS Wallace\nFPO AE 73316,AE,US Territories


In [42]:
data=data.drop(['Address','state'],axis=1)

In [43]:
data

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,region
0,1.028660,-0.296927,0.021274,4.09,-1.317599,1.059034e+06,Midwest
1,1.000808,0.025902,-0.255506,3.09,0.403999,1.505891e+06,West
2,-0.684629,-0.112303,1.516243,5.13,0.072410,1.058988e+06,Midwest
3,-0.491499,1.221572,-1.393077,3.26,-0.186734,1.260617e+06,US Territories
4,-0.807073,-0.944834,0.846742,4.23,-0.988387,6.309435e+05,US Territories
...,...,...,...,...,...,...,...
4995,-0.752109,1.869297,-0.845588,3.46,-1.342732,1.060194e+06,US Territories
4996,0.929740,1.030822,-0.408686,4.02,-1.062747,1.482618e+06,US Territories
4997,-0.487235,1.284470,-2.170269,2.13,-0.291937,1.030730e+06,South
4998,-0.054592,-0.446694,0.141541,5.44,0.651116,1.198657e+06,US Territories


In [44]:
# One-hot encode the region labels as 0/1 integers. `drop_first=True` omits the
# first category ('Midwest' in the output displayed by the next cell), which
# becomes the all-zero baseline row.
region_onehot = pd.get_dummies(data['region'], drop_first=True, dtype='int')

In [45]:
data=pd.concat([data,region_onehot],axis=1)
data

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,region,Northeast,South,US Territories,West
0,1.028660,-0.296927,0.021274,4.09,-1.317599,1.059034e+06,Midwest,0,0,0,0
1,1.000808,0.025902,-0.255506,3.09,0.403999,1.505891e+06,West,0,0,0,1
2,-0.684629,-0.112303,1.516243,5.13,0.072410,1.058988e+06,Midwest,0,0,0,0
3,-0.491499,1.221572,-1.393077,3.26,-0.186734,1.260617e+06,US Territories,0,0,1,0
4,-0.807073,-0.944834,0.846742,4.23,-0.988387,6.309435e+05,US Territories,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
4995,-0.752109,1.869297,-0.845588,3.46,-1.342732,1.060194e+06,US Territories,0,0,1,0
4996,0.929740,1.030822,-0.408686,4.02,-1.062747,1.482618e+06,US Territories,0,0,1,0
4997,-0.487235,1.284470,-2.170269,2.13,-0.291937,1.030730e+06,South,0,1,0,0
4998,-0.054592,-0.446694,0.141541,5.44,0.651116,1.198657e+06,US Territories,0,0,1,0


In [46]:
data=data.drop('region',axis=1)
data

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,Northeast,South,US Territories,West
0,1.028660,-0.296927,0.021274,4.09,-1.317599,1.059034e+06,0,0,0,0
1,1.000808,0.025902,-0.255506,3.09,0.403999,1.505891e+06,0,0,0,1
2,-0.684629,-0.112303,1.516243,5.13,0.072410,1.058988e+06,0,0,0,0
3,-0.491499,1.221572,-1.393077,3.26,-0.186734,1.260617e+06,0,0,1,0
4,-0.807073,-0.944834,0.846742,4.23,-0.988387,6.309435e+05,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...
4995,-0.752109,1.869297,-0.845588,3.46,-1.342732,1.060194e+06,0,0,1,0
4996,0.929740,1.030822,-0.408686,4.02,-1.062747,1.482618e+06,0,0,1,0
4997,-0.487235,1.284470,-2.170269,2.13,-0.291937,1.030730e+06,0,1,0,0
4998,-0.054592,-0.446694,0.141541,5.44,0.651116,1.198657e+06,0,0,1,0


In [47]:
data.columns

Index(['Avg. Area Income', 'Avg. Area House Age', 'Avg. Area Number of Rooms',
       'Avg. Area Number of Bedrooms', 'Area Population', 'Price', 'Northeast',
       'South', 'US Territories', 'West'],
      dtype='object')

In [48]:
# Feature matrix: every column except the target.
X = data.drop(columns=['Price'])
# Target vector: the house price.
y = data['Price']

In [49]:
X

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Northeast,South,US Territories,West
0,1.028660,-0.296927,0.021274,4.09,-1.317599,0,0,0,0
1,1.000808,0.025902,-0.255506,3.09,0.403999,0,0,0,1
2,-0.684629,-0.112303,1.516243,5.13,0.072410,0,0,0,0
3,-0.491499,1.221572,-1.393077,3.26,-0.186734,0,0,1,0
4,-0.807073,-0.944834,0.846742,4.23,-0.988387,0,0,1,0
...,...,...,...,...,...,...,...,...,...
4995,-0.752109,1.869297,-0.845588,3.46,-1.342732,0,0,1,0
4996,0.929740,1.030822,-0.408686,4.02,-1.062747,0,0,1,0
4997,-0.487235,1.284470,-2.170269,2.13,-0.291937,0,1,0,0
4998,-0.054592,-0.446694,0.141541,5.44,0.651116,0,0,1,0


In [50]:
y

0       1.059034e+06
1       1.505891e+06
2       1.058988e+06
3       1.260617e+06
4       6.309435e+05
            ...     
4995    1.060194e+06
4996    1.482618e+06
4997    1.030730e+06
4998    1.198657e+06
4999    1.298950e+06
Name: Price, Length: 5000, dtype: float64

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [52]:
# Print the shape of each split, one per line:
# train features, test features, train target, test target.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(4000, 9)
(1000, 9)
(4000,)
(1000,)


In [53]:
model=LinearRegression()

In [54]:
model.fit(X_train,y_train)

In [55]:
y_pred=model.predict(X_test)

In [56]:
y_pred

array([1307239.39227752, 1234829.89184411, 1248184.51313506,
       1233466.01918569, 1067516.07759246, 1540454.43281495,
       1099169.61960449,  837709.36537762,  791772.13372691,
       1474277.37690565,  667940.42044507, 1604389.45760742,
       1001871.5402059 , 1794865.30966015, 1286090.85330984,
       1085524.4192091 , 1427632.7071667 , 1074540.10982822,
        806758.79362329,  934149.99836632, 1132560.92481432,
        914190.9672614 , 1487867.27325117, 1289157.02112343,
       1586587.64664415, 1130969.05393212, 1088418.72978355,
        973054.39305417,  927569.28796753, 1745091.91420993,
       1290194.40738784, 1617892.0759791 , 1432830.27996573,
       1238345.83612591, 1483448.12340496, 1722726.75743084,
       1536509.8015884 ,  780260.23222388, 1761494.59946777,
       1173726.34173266, 1558347.459958  ,  900941.11856105,
       1368831.99640629,  841581.57835537, 1198673.74018968,
       1129396.91184589, 1361728.13084228, 1447824.69423662,
       1578752.29598072,

In [57]:
y_test

1501    1.339096e+06
2586    1.251794e+06
2653    1.340095e+06
1055    1.431508e+06
705     1.042374e+06
            ...     
4711    1.107031e+06
2313    1.405505e+06
3214    1.924156e+06
2732    1.571254e+06
1926    8.831475e+05
Name: Price, Length: 1000, dtype: float64

In [58]:
model.coef_

array([230823.64852461, 163359.21821032, 120295.64483813,   2374.27824658,
       151477.5374141 ,   6973.60814391,   1368.04920912,   8067.37239459,
         2181.57334118])

In [59]:
model.intercept_

1218948.7183183695

In [60]:
# Hold-out metrics for the predictions currently stored in `y_pred`.
mse = mean_squared_error(y_test, y_pred)  # computed once, reused for RMSE

print(f"RMSE  {np.sqrt(mse)}")  # RMSE = sqrt( (1/n) * Σ(actual - predicted)² )
print(f"MSE  {mse}")  # MSE = (1/n) * Σ(actual - predicted)²
print(f"MAE  {mean_absolute_error(y_test, y_pred)}")  # MAE = (1/n) * Σ|actual - predicted|
print(f"R2  {r2_score(y_test, y_pred)}")  # R² = 1 - (Σ(actual - predicted)² / Σ(actual - mean(actual))²)
             

RMSE  100433.35754553918
MSE  10086859307.870113
MAE  80891.67003962323
R2  0.9180146456907119


In [61]:
alpha_values = [0.00001,0.0001,0.001,0.01, 0.1, 1, 10, 100,1000]

In [62]:
def _alpha_sweep(estimator_cls, alphas, X_fit, y_fit, X_eval, y_eval):
    """Fit ``estimator_cls(alpha=a)`` for every ``a`` in ``alphas`` and score it.

    Parameters
    ----------
    estimator_cls : regressor class accepting an ``alpha`` keyword (Ridge, Lasso).
    alphas : iterable of regularisation strengths to try.
    X_fit, y_fit : data the model is fitted on.
    X_eval, y_eval : data the fitted model is scored on.

    Returns
    -------
    list of ``(alpha, rmse, r2)`` tuples, in the order of ``alphas``.
    """
    scores = []
    for a in alphas:
        fitted = estimator_cls(alpha=a).fit(X_fit, y_fit)
        predicted = fitted.predict(X_eval)
        scores.append((
            a,
            np.sqrt(mean_squared_error(y_eval, predicted)),
            r2_score(y_eval, predicted),
        ))
    return scores


# Same sweep for both regularised models. Predictions stay local to the helper,
# so the baseline LinearRegression's `y_pred` is no longer overwritten and the
# metrics cell above still reports the linear model if it is re-run.
# NOTE(review): alphas are compared on the test split, so the best score found
# this way is optimistic; consider cross-validation on the training rows.
ridge = _alpha_sweep(Ridge, alpha_values, X_train, y_train, X_test, y_test)
lasso = _alpha_sweep(Lasso, alpha_values, X_train, y_train, X_test, y_test)


In [63]:
ridge

[(1e-05, 100433.35754354938, 0.9180146456939605),
 (0.0001, 100433.35752564181, 0.918014645723197),
 (0.001, 100433.35734660257, 0.9180146460155022),
 (0.01, 100433.35555984847, 0.918014648932614),
 (0.1, 100433.33805611865, 0.9180146775097595),
 (1, 100433.19936199598, 0.9180149039460155),
 (10, 100435.41043816188, 0.9180112940383671),
 (100, 100789.67372335125, 0.9174318805808962),
 (1000, 121931.71869004893, 0.8791591683658301)]

In [64]:
lasso

[(1e-05, 100433.35754509071, 0.9180146456914442),
 (0.0001, 100433.35754104829, 0.9180146456980439),
 (0.001, 100433.35750071851, 0.9180146457638876),
 (0.01, 100433.35709678542, 0.9180146464233616),
 (0.1, 100433.35305638253, 0.9180146530198523),
 (1, 100433.31332810935, 0.9180147178814848),
 (10, 100432.97254830021, 0.9180152742483011),
 (100, 100430.89892020007, 0.9180186596718676),
 (1000, 100450.63706382507, 0.9179864321701823)]

In [65]:
data

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,Northeast,South,US Territories,West
0,1.028660,-0.296927,0.021274,4.09,-1.317599,1.059034e+06,0,0,0,0
1,1.000808,0.025902,-0.255506,3.09,0.403999,1.505891e+06,0,0,0,1
2,-0.684629,-0.112303,1.516243,5.13,0.072410,1.058988e+06,0,0,0,0
3,-0.491499,1.221572,-1.393077,3.26,-0.186734,1.260617e+06,0,0,1,0
4,-0.807073,-0.944834,0.846742,4.23,-0.988387,6.309435e+05,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...
4995,-0.752109,1.869297,-0.845588,3.46,-1.342732,1.060194e+06,0,0,1,0
4996,0.929740,1.030822,-0.408686,4.02,-1.062747,1.482618e+06,0,0,1,0
4997,-0.487235,1.284470,-2.170269,2.13,-0.291937,1.030730e+06,0,1,0,0
4998,-0.054592,-0.446694,0.141541,5.44,0.651116,1.198657e+06,0,0,1,0


In [66]:
data.columns

Index(['Avg. Area Income', 'Avg. Area House Age', 'Avg. Area Number of Rooms',
       'Avg. Area Number of Bedrooms', 'Area Population', 'Price', 'Northeast',
       'South', 'US Territories', 'West'],
      dtype='object')

In [67]:
# Pairwise correlation among the continuous features and the target
# (the region dummy columns are left out).
continuous_cols = [
    'Avg. Area Income',
    'Avg. Area House Age',
    'Avg. Area Number of Rooms',
    'Avg. Area Number of Bedrooms',
    'Area Population',
    'Price',
]
df_corr = data.loc[:, continuous_cols].corr()

df_corr

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price
Avg. Area Income,1.0,-0.002007,-0.011032,0.019788,-0.016234,0.639734
Avg. Area House Age,-0.002007,1.0,-0.009428,0.006149,-0.018743,0.452543
Avg. Area Number of Rooms,-0.011032,-0.009428,1.0,0.462695,0.00204,0.335664
Avg. Area Number of Bedrooms,0.019788,0.006149,0.462695,1.0,-0.022168,0.171071
Area Population,-0.016234,-0.018743,0.00204,-0.022168,1.0,0.408556
Price,0.639734,0.452543,0.335664,0.171071,0.408556,1.0


In [68]:
data.drop(['Avg. Area Number of Bedrooms','Price'],axis=1)

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Area Population,Northeast,South,US Territories,West
0,1.028660,-0.296927,0.021274,-1.317599,0,0,0,0
1,1.000808,0.025902,-0.255506,0.403999,0,0,0,1
2,-0.684629,-0.112303,1.516243,0.072410,0,0,0,0
3,-0.491499,1.221572,-1.393077,-0.186734,0,0,1,0
4,-0.807073,-0.944834,0.846742,-0.988387,0,0,1,0
...,...,...,...,...,...,...,...,...
4995,-0.752109,1.869297,-0.845588,-1.342732,0,0,1,0
4996,0.929740,1.030822,-0.408686,-1.062747,0,0,1,0
4997,-0.487235,1.284470,-2.170269,-0.291937,0,1,0,0
4998,-0.054592,-0.446694,0.141541,0.651116,0,0,1,0


In [69]:
# Second feature set: drop the bedrooms column as well as the target.
X = data.drop(columns=['Avg. Area Number of Bedrooms', 'Price'])
y = data['Price']

In [70]:
X

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Area Population,Northeast,South,US Territories,West
0,1.028660,-0.296927,0.021274,-1.317599,0,0,0,0
1,1.000808,0.025902,-0.255506,0.403999,0,0,0,1
2,-0.684629,-0.112303,1.516243,0.072410,0,0,0,0
3,-0.491499,1.221572,-1.393077,-0.186734,0,0,1,0
4,-0.807073,-0.944834,0.846742,-0.988387,0,0,1,0
...,...,...,...,...,...,...,...,...
4995,-0.752109,1.869297,-0.845588,-1.342732,0,0,1,0
4996,0.929740,1.030822,-0.408686,-1.062747,0,0,1,0
4997,-0.487235,1.284470,-2.170269,-0.291937,0,1,0,0
4998,-0.054592,-0.446694,0.141541,0.651116,0,0,1,0


In [71]:
y

0       1.059034e+06
1       1.505891e+06
2       1.058988e+06
3       1.260617e+06
4       6.309435e+05
            ...     
4995    1.060194e+06
4996    1.482618e+06
4997    1.030730e+06
4998    1.198657e+06
4999    1.298950e+06
Name: Price, Length: 5000, dtype: float64

In [72]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [73]:
alpha_values = [0.00001,0.0001,0.001,0.01, 0.1, 1, 10, 100,1000]

In [74]:
# Reset the result lists before a new sweep.
# NOTE(review): redundant cell — the next cell resets both lists again before filling them.
ridge = []
lasso = []

In [75]:
# Result lists of (alpha, rmse, r2) tuples, one list per model family.
ridge, lasso = [], []

# Sweep every alpha for Ridge first, then for Lasso: fit on the training split,
# score on the test split. This repeats the sweep run earlier, now on the
# feature set without the bedrooms column.
for estimator_cls, results in ((Ridge, ridge), (Lasso, lasso)):
    for alpha in alpha_values:
        y_pred = estimator_cls(alpha=alpha).fit(X_train, y_train).predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)
        results.append((alpha, rmse, r2))


In [76]:
ridge

[(1e-05, 100359.99692330908, 0.9181343728438403),
 (0.0001, 100359.99683607447, 0.9181343729861582),
 (0.001, 100359.99596376353, 0.9181343744092807),
 (0.01, 100359.98724418295, 0.9181343886347473),
 (0.1, 100359.90040120186, 0.9181345303137343),
 (1, 100359.06721801, 0.9181358895946808),
 (10, 100354.22581953263, 0.9181437877792825),
 (100, 100629.27624517499, 0.9176944705777721),
 (1000, 121169.25972703574, 0.8806657181772175)]

In [77]:
lasso

[(1e-05, 100359.99693268699, 0.9181343728285407),
 (0.0001, 100359.99692983544, 0.9181343728331929),
 (0.001, 100359.99690157283, 0.9181343728793017),
 (0.01, 100359.99661712772, 0.9181343733433566),
 (0.1, 100359.99375990152, 0.918134378004748),
 (1, 100359.96605103268, 0.918134423210081),
 (10, 100359.7449388967, 0.9181347839406255),
 (100, 100359.51801846207, 0.9181351541462178),
 (1000, 100388.59237850273, 0.918087714445035)]

In [78]:
# 100 evenly spaced alpha values between 0.0001 and 1000 (both ends included).
alpha_values = np.linspace(start=0.0001, stop=1000, num=100)
alpha_values

array([1.00000000e-04, 1.01011091e+01, 2.02021182e+01, 3.03031273e+01,
       4.04041364e+01, 5.05051455e+01, 6.06061545e+01, 7.07071636e+01,
       8.08081727e+01, 9.09091818e+01, 1.01010191e+02, 1.11111200e+02,
       1.21212209e+02, 1.31313218e+02, 1.41414227e+02, 1.51515236e+02,
       1.61616245e+02, 1.71717255e+02, 1.81818264e+02, 1.91919273e+02,
       2.02020282e+02, 2.12121291e+02, 2.22222300e+02, 2.32323309e+02,
       2.42424318e+02, 2.52525327e+02, 2.62626336e+02, 2.72727345e+02,
       2.82828355e+02, 2.92929364e+02, 3.03030373e+02, 3.13131382e+02,
       3.23232391e+02, 3.33333400e+02, 3.43434409e+02, 3.53535418e+02,
       3.63636427e+02, 3.73737436e+02, 3.83838445e+02, 3.93939455e+02,
       4.04040464e+02, 4.14141473e+02, 4.24242482e+02, 4.34343491e+02,
       4.44444500e+02, 4.54545509e+02, 4.64646518e+02, 4.74747527e+02,
       4.84848536e+02, 4.94949545e+02, 5.05050555e+02, 5.15151564e+02,
       5.25252573e+02, 5.35353582e+02, 5.45454591e+02, 5.55555600e+02,
      

In [79]:
# Result lists of (alpha, rmse, r2) tuples, one list per model family.
ridge, lasso = [], []

# Same sweep as before, now over the finer alpha grid: all Ridge fits first,
# then all Lasso fits, each fitted on the training split and scored on the test split.
for estimator_cls, results in ((Ridge, ridge), (Lasso, lasso)):
    for alpha in alpha_values:
        y_pred = estimator_cls(alpha=alpha).fit(X_train, y_train).predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)
        results.append((alpha, rmse, r2))


In [80]:
ridge

[(0.0001, 100359.99683607447, 0.9181343729861582),
 (10.101109090909091, 100354.2071397946, 0.9181438182523881),
 (20.202118181818182, 100356.27307412364, 0.9181404479654531),
 (30.303127272727274, 100365.99859595818, 0.9181245811857771),
 (40.40413636363637, 100383.21040276703, 0.9180964970787102),
 (50.505145454545456, 100407.75079501545, 0.9180564467602806),
 (60.60615454545455, 100439.47304989364, 0.9180046610189178),
 (70.70716363636365, 100478.23835828791, 0.9179413554697672),
 (80.80817272727273, 100523.91375917943, 0.9178667340702733),
 (90.90918181818182, 100576.37072460682, 0.917780991566132),
 (101.01019090909091, 100635.48417738084, 0.9176843152248503),
 (111.11120000000001, 100701.13180177854, 0.9175768860860648),
 (121.2122090909091, 100773.19355574384, 0.9174588798785318),
 (131.3132181818182, 100851.55132364921, 0.9173304677036256),
 (141.4142272727273, 100936.08866834082, 0.9171918165529418),
 (151.51523636363638, 101026.6906540888, 0.9170430897064555),
 (161.616245454

In [81]:
lasso

[(0.0001, 100359.99692983544, 0.9181343728331929),
 (10.101109090909091, 100359.7431093848, 0.9181347869253559),
 (20.202118181818182, 100359.61870372058, 0.9181349898850204),
 (30.303127272727274, 100359.6263514285, 0.9181349774082949),
 (40.40413636363637, 100359.7611460074, 0.9181347574997707),
 (50.505145454545456, 100360.02228088508, 0.9181343314744862),
 (60.60615454545455, 100359.97413990849, 0.9181344100135737),
 (70.70716363636365, 100359.79489835611, 0.9181347024349789),
 (80.80817272727273, 100359.65864276479, 0.9181349247271222),
 (90.90918181818182, 100359.56537393082, 0.9181350768889902),
 (101.01019090909091, 100359.51489675837, 0.9181351592390639),
 (111.11120000000001, 100359.50905133899, 0.9181351687754654),
 (121.2122090909091, 100359.4559898425, 0.9181352553416383),
 (131.3132181818182, 100359.15813488087, 0.9181357412706214),
 (141.4142272727273, 100358.88427546271, 0.9181361880513256),
 (151.51523636363638, 100358.63441178444, 0.9181365956837511),
 (161.6162454545

In [83]:
# Turn each list of (alpha, rmse, r2) tuples into a table indexed by alpha,
# so the best alpha can be looked up with idxmin()/min() on the RMSE column.
result_columns = ['Alpha', 'RMSE', 'r2']
ridge_results_df = pd.DataFrame(ridge, columns=result_columns).set_index('Alpha')
lasso_results_df = pd.DataFrame(lasso, columns=result_columns).set_index('Alpha')

In [85]:
ridge_results_df

Unnamed: 0_level_0,RMSE,r2
Alpha,Unnamed: 1_level_1,Unnamed: 2_level_1
0.000100,100359.996836,0.918134
10.101109,100354.207140,0.918144
20.202118,100356.273074,0.918140
30.303127,100365.998596,0.918125
40.404136,100383.210403,0.918096
...,...,...
959.595964,119932.637186,0.883089
969.696973,120241.015666,0.882487
979.797982,120549.931041,0.881882
989.898991,120859.354970,0.881275


In [87]:
lasso_results_df

Unnamed: 0_level_0,RMSE,r2
Alpha,Unnamed: 1_level_1,Unnamed: 2_level_1
0.000100,100359.996930,0.918134
10.101109,100359.743109,0.918135
20.202118,100359.618704,0.918135
30.303127,100359.626351,0.918135
40.404136,100359.761146,0.918135
...,...,...
959.595964,100385.882534,0.918092
969.696973,100386.544864,0.918091
979.797982,100387.217281,0.918090
989.898991,100387.899786,0.918089


In [89]:
ridge_results_df['RMSE'].idxmin()

10.101109090909091

In [91]:
lasso_results_df['RMSE'].idxmin()

252.52532727272728

In [93]:
ridge_results_df['RMSE'].min()

100354.2071397946

In [95]:
lasso_results_df['RMSE'].min()

100357.45605804077

In [97]:
# Third feature set: drop both the bedrooms and rooms columns, plus the target.
X = data.drop(
    columns=['Avg. Area Number of Bedrooms', 'Avg. Area Number of Rooms', 'Price']
)
y = data['Price']

In [99]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [101]:
# Baseline linear regression on the reduced feature set.
model = LinearRegression().fit(X_train, y_train)
# NOTE(review): these predictions are never scored — the next cell reassigns
# `y_pred` inside its sweep loops before any metric is printed for this model.
y_pred = model.predict(X_test)

In [103]:
# Back to the coarse, roughly logarithmic alpha grid.
alpha_values = [0.00001,0.0001,0.001,0.01, 0.1, 1, 10, 100,1000]

# Result lists of (alpha, rmse, r2) tuples, one list per model family.
ridge, lasso = [], []

# All Ridge fits first, then all Lasso fits; each model is fitted on the
# training split and scored on the test split.
for estimator_cls, results in ((Ridge, ridge), (Lasso, lasso)):
    for alpha in alpha_values:
        y_pred = estimator_cls(alpha=alpha).fit(X_train, y_train).predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)
        results.append((alpha, rmse, r2))


In [104]:
ridge

[(1e-05, 158900.34878980208, 0.7947751175657056),
 (0.0001, 158900.34875465563, 0.7947751176564912),
 (0.001, 158900.34840321227, 0.7947751185642918),
 (0.01, 158900.3448909117, 0.7947751276367887),
 (0.1, 158900.3099811512, 0.7947752178109209),
 (1, 158899.98217211576, 0.7947760645615473),
 (10, 158898.79898800625, 0.7947791207833538),
 (100, 159073.64727728532, 0.7943272323864565),
 (1000, 171169.81190466418, 0.7618587321708499)]

In [107]:
lasso

[(1e-05, 158900.34879082264, 0.7947751175630694),
 (0.0001, 158900.34876496237, 0.7947751176298681),
 (0.001, 158900.34850555175, 0.7947751182999424),
 (0.01, 158900.34592154424, 0.794775124974599),
 (0.1, 158900.32001217434, 0.794775191900152),
 (1, 158900.06076377985, 0.7947758615546701),
 (10, 158897.5114702603, 0.7947824464781361),
 (100, 158890.82385839173, 0.7947997203353923),
 (1000, 158859.1722726942, 0.7948814653622591)]