In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it
for folder, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the USA housing CSV from the Kaggle input directory into a DataFrame
df = pd.read_csv('/kaggle/input/usahousing/USA_Housing.csv')

In [None]:
# Preview the first rows to check that the file parsed as expected
df.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

In [None]:
# Column dtypes, non-null counts and memory usage
df.info()

In [None]:
import seaborn as sns

# Pairwise scatter plots (histograms on the diagonal) of the numeric columns
sns.pairplot(df)

In [None]:
import matplotlib.pyplot as plt

%matplotlib inline

# Annotated correlation heatmap of the numeric columns.
# `df` still contains the `Address` column here (presumably free text; it is
# dropped before modelling). Since pandas 2.0 `DataFrame.corr()` no longer
# skips non-numeric columns silently and raises instead, so restrict the frame
# to numeric dtypes explicitly. On older pandas this yields the same matrix
# that `df.corr()` produced.
numeric_df = df.select_dtypes(include='number')

plt.subplots(figsize=(10, 10))
sns.heatmap(numeric_df.corr(), annot=True, linewidths=0.2, cmap='viridis');

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Features: every column except the target (`Price`) and the `Address` column
x = df.drop(columns=['Price', 'Address'])
# Target: the `Price` column
y = df['Price']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=38
)

In [None]:
# Baseline: standardise the features, then fit a plain linear regression on the training split
scaler = StandardScaler()
model = LinearRegression()
pipe = make_pipeline(scaler, model).fit(X_train, y_train)

# Score the fitted pipeline on the held-out test split
predictions = pipe.predict(X_test)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, predictions)
print("MSE:", mse)
print("RMSE:", rmse)
print("R2:", r2)

# Predicted vs. actual values, with a degree-1 least-squares trend line in red
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(y_test, predictions)
ax.set_xlabel('Actual Labels')
ax.set_ylabel('Predicted Labels')
z = np.polyfit(y_test, predictions, 1)
p = np.poly1d(z)
ax.plot(y_test, p(y_test), color='red')
plt.show()

In [None]:
# Feature importance: coefficients of the fitted linear model, one per standardised feature
coef_ = pipe.named_steps['linearregression'].coef_

fig, ax = plt.subplots(figsize=(15, 8))
ax.plot(x.columns, coef_)
ax.set_ylabel('Coefficients')
plt.show()

### Lasso

In [None]:
# Lasso (L1-regularised) regression on standardised features, fitted on the training split

scaler = StandardScaler()
model = Lasso(alpha=0.1)
pipe = make_pipeline(scaler, model).fit(X_train, y_train)

# Score the fitted pipeline on the held-out test split
predictions = pipe.predict(X_test)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, predictions)
print("MSE:", mse)
print("RMSE:", rmse)
print("R2:", r2)

# Predicted vs. actual values, with a degree-1 least-squares trend line in red
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(y_test, predictions)
ax.set_xlabel('Actual Labels')
ax.set_ylabel('Predicted Labels')
z = np.polyfit(y_test, predictions, 1)
p = np.poly1d(z)
ax.plot(y_test, p(y_test), color='red')
plt.show()

In [None]:
# Feature importance: coefficients of the fitted lasso model, one per standardised feature
coef_ = pipe.named_steps['lasso'].coef_

fig, ax = plt.subplots(figsize=(15, 8))
ax.plot(x.columns, coef_)
ax.set_ylabel('Coefficients')
plt.show()

### Ridge

In [None]:
# Ridge (L2-regularised) regression on standardised features, fitted on the training split

scaler = StandardScaler()
model = Ridge(alpha=1)
pipe = make_pipeline(scaler, model).fit(X_train, y_train)

# Score the fitted pipeline on the held-out test split
predictions = pipe.predict(X_test)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, predictions)
print("MSE:", mse)
print("RMSE:", rmse)
print("R2:", r2)

# Predicted vs. actual values, with a degree-1 least-squares trend line in red
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(y_test, predictions)
ax.set_xlabel('Actual Labels')
ax.set_ylabel('Predicted Labels')
z = np.polyfit(y_test, predictions, 1)
p = np.poly1d(z)
ax.plot(y_test, p(y_test), color='red')
plt.show()

In [None]:
# Feature importance: coefficients of the fitted ridge model, one per standardised feature
coef_ = pipe.named_steps['ridge'].coef_

fig, ax = plt.subplots(figsize=(15, 8))
ax.plot(x.columns, coef_)
ax.set_ylabel('Coefficients')
plt.show()

### ElasticNet

In [None]:
# ElasticNet (combined L1/L2 penalty) regression on standardised features, fitted on the training split

scaler = StandardScaler()
model = ElasticNet(alpha=0.001, l1_ratio=0.01)
pipe = make_pipeline(scaler, model).fit(X_train, y_train)

# Score the fitted pipeline on the held-out test split
predictions = pipe.predict(X_test)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, predictions)
print("MSE:", mse)
print("RMSE:", rmse)
print("R2:", r2)

# Predicted vs. actual values, with a degree-1 least-squares trend line in red
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(y_test, predictions)
ax.set_xlabel('Actual Labels')
ax.set_ylabel('Predicted Labels')
z = np.polyfit(y_test, predictions, 1)
p = np.poly1d(z)
ax.plot(y_test, p(y_test), color='red')
plt.show()

In [None]:
# Feature importance: coefficients of the fitted ElasticNet model, one per standardised feature
coef_ = pipe.named_steps['elasticnet'].coef_

fig, ax = plt.subplots(figsize=(15, 8))
ax.plot(x.columns, coef_)
ax.set_ylabel('Coefficients')
plt.show()

### Remove Avg. Area Number of Bedrooms & increase test_size

In [None]:
# Reduced feature set: additionally drop `Avg. Area Number of Bedrooms`
x = df.drop(columns=['Price', 'Address', 'Avg. Area Number of Bedrooms'])
# Target: the `Price` column
y = df['Price']

# Hold out 25% of the rows for testing this time; same seed as the other splits
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=38
)

In [None]:
# Linear regression on the reduced, standardised feature set (25% test split)
scaler = StandardScaler()
model = LinearRegression()
pipe = make_pipeline(scaler, model).fit(X_train, y_train)

# Score the fitted pipeline on the held-out test split
predictions = pipe.predict(X_test)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, predictions)
print("MSE:", mse)
print("RMSE:", rmse)
print("R2:", r2)

# Predicted vs. actual values, with a degree-1 least-squares trend line in red
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(y_test, predictions)
ax.set_xlabel('Actual Labels')
ax.set_ylabel('Predicted Labels')
z = np.polyfit(y_test, predictions, 1)
p = np.poly1d(z)
ax.plot(y_test, p(y_test), color='red')
plt.show()

**Comment:** R2 decreases slightly when test_size is increased from 20% to 25%.

In [None]:
# Re-split the current `x` / `y` with a 20% test set, using the same seed (38) as the other splits
X_train,X_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=38)

In [None]:
# Linear regression on the reduced, standardised feature set (20% test split)
scaler = StandardScaler()
model = LinearRegression()
pipe = make_pipeline(scaler, model).fit(X_train, y_train)

# Score the fitted pipeline on the held-out test split
predictions = pipe.predict(X_test)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, predictions)
print("MSE:", mse)
print("RMSE:", rmse)
print("R2:", r2)

# Predicted vs. actual values, with a degree-1 least-squares trend line in red
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(y_test, predictions)
ax.set_xlabel('Actual Labels')
ax.set_ylabel('Predicted Labels')
z = np.polyfit(y_test, predictions, 1)
p = np.poly1d(z)
ax.plot(y_test, p(y_test), color='red')
plt.show()

In [None]:
# Feature importance: coefficients of the linear model fitted on the reduced feature set
coef_ = pipe.named_steps['linearregression'].coef_

fig, ax = plt.subplots(figsize=(15, 8))
ax.plot(x.columns, coef_)
ax.set_ylabel('Coefficients')
plt.show()

### Conclusion:

* After dropping one feature (Avg. Area Number of Bedrooms) and using a test_size of 20%, the linear regression model gives R2: 0.9135, RMSE: 101938.
* The most important feature is Avg. Area Income, followed by Avg. Area House Age and Area Population.
* The least important feature is Avg. Area Number of Bedrooms.