# Comparison between two linear regression implementations: statsmodels OLS and scikit-learn LinearRegression

## Importing libraries:

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import statsmodels.api as sm

## Importing data set:

In [None]:
# Load the USA housing CSV from the Kaggle input directory into a DataFrame.
housing_csv_path = '../input/usa-housing/USA_Housing.csv'
data = pd.read_csv(housing_csv_path)

## Checking the head of the data set:

In [None]:
# Preview the first five rows to sanity-check the loaded columns.
data.head(n=5)

## Defining the independent (x) and dependent (y) variables of the problem:

In [None]:
# Target: the house price column.
y = data['Price']
# Features: every remaining column except the free-text address and the target.
X = data.drop(columns=['Address', 'Price'])

## Performing a train/test split so the model can be evaluated on data it was not trained on:

In [None]:
# Hold out 30% of the rows for evaluation.
# A fixed random_state makes the split (and therefore every metric, coefficient
# and the final model comparison below) reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

## Performing a linear regression with sklearn:

In [None]:
# Ordinary least-squares estimator from scikit-learn (intercept is fitted).
lm = LinearRegression(fit_intercept=True)

In [None]:
# Fit the sklearn model on the training split only.
lm.fit(X=X_train, y=y_train)

In [None]:
# Predict prices for the held-out test rows.
predictions = lm.predict(X=X_test)

## Plotting the predicted prices against the actual test prices:

In [None]:
# Actual test prices on the x-axis, predicted prices on the y-axis;
# points close to the diagonal indicate accurate predictions.
plt.scatter(x=y_test, y=predictions)

## Evaluating the amount of error of the linear regression model with sklearn:

In [None]:
# Error metrics of the sklearn model on the held-out test split.
mae = metrics.mean_absolute_error(y_test, predictions)
mse = metrics.mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE computed above

for label, value in (('MAE:', mae), ('MSE:', mse), ('RMSE:', rmse)):
    print(label, value)

## Displaying the coefficients of the resulting linear regression model:

In [None]:
# One row per feature column, holding the coefficient the sklearn model fitted for it.
coeff = pd.DataFrame(
    data=lm.coef_,
    index=X.columns,
    columns=['Coefficient'],
)
coeff

## Displaying the original data set, without the address text column and the target price:

In [None]:
# Working copy of the feature columns for every row (train and test alike);
# prediction and error columns are appended to it in the cells below.
test = data.drop(columns=['Address', 'Price'])
test

### Performing an error calculation between the predicted price from the linear model and the actual price:

In [None]:
# Re-compute the sklearn model's prediction by hand for every row:
#     Pred_price = intercept + sum_j(coefficient_j * feature_j)
#
# This is done as a single matrix-vector product instead of a per-cell loop.
# The previous loop assigned through `test['Pred_price'].iloc[i] = ...`
# (chained assignment), which triggers SettingWithCopy warnings and does not
# write back to `test` at all when pandas copy-on-write is active.
#
# The first len(coeff) columns of `test` are the feature columns, in the same
# order as the rows of `coeff`; selecting them by position keeps this cell
# safe to re-run after the extra result columns have been appended.
feature_values = test.iloc[:, :len(coeff)].to_numpy(dtype=float)
feature_weights = coeff['Coefficient'].to_numpy(dtype=float)
test['Pred_price'] = lm.intercept_ + feature_values @ feature_weights

# Absolute and squared error against the actual price (rows align on the index).
test['error'] = (test['Pred_price'] - data['Price']).abs()
test['error_sq'] = test['error'] ** 2
test

## Now, performing a linear regression using the OLS model:

In [None]:
# statsmodels does not add an intercept on its own, so a constant column is
# added to the design matrix first.
# NOTE: this rebinds X; cells above that use X will see the extra constant
# column if they are re-run after this cell.
X = sm.add_constant(X)

# Fit ordinary least squares on the full data set (all rows of X and y).
ols_model = sm.OLS(endog=y, exog=X)
ols = ols_model.fit()

In [None]:
# Full regression report (coefficients, standard errors, 95% confidence intervals, fit statistics).
ols.summary(alpha=0.05)

In [None]:
# Show the fitted OLS parameters: the constant term added via sm.add_constant
# plus one coefficient per feature column.
ols.params

### Performing an error calculation between the predicted price from the OLS linear model and the actual price:

In [None]:
# Re-compute the OLS model's prediction by hand for every row:
#     Pred_price_ols = params[0] + sum_j(params[j] * feature_{j-1})
#
# The parameters are converted to a plain NumPy array so they can be indexed
# by position: `ols.params` is a label-indexed Series, and integer keys such
# as `ols.params[0]` rely on a deprecated positional fallback in pandas.
# The prediction is a single matrix-vector product; the previous loop assigned
# through `test['Pred_price_ols'].iloc[i] = ...` (chained assignment), which
# does not write back to `test` when pandas copy-on-write is active.
ols_param_values = ols.params.to_numpy(dtype=float)
ols_intercept = ols_param_values[0]    # coefficient of the added constant column
ols_weights = ols_param_values[1:]     # one coefficient per feature column

# The leading columns of `test` are the feature columns, in parameter order.
ols_feature_values = test.iloc[:, :len(ols_weights)].to_numpy(dtype=float)
test['Pred_price_ols'] = ols_intercept + ols_feature_values @ ols_weights

# Absolute and squared error against the actual price (rows align on the index).
test['error_ols'] = (test['Pred_price_ols'] - data['Price']).abs()
test['error_ols_sq'] = test['error_ols'] ** 2
test

## Calculating the sum of squared errors for the two linear regression models:

In [None]:
# Sum of squared errors over all rows, one total per model.
sse_sklearn = test['error_sq'].sum()
sse_ols = test['error_ols_sq'].sum()
print('Sum of error sq for sklearn:', sse_sklearn)
print('Sum of error sq for OLS:', sse_ols)

## Comparing the sum of errors:

In [None]:
# True when the sklearn model has the smaller sum of squared errors.
test['error_ols_sq'].sum() > test['error_sq'].sum()

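## In conclusion, for this specific data set, the OLS model performed better.

Note that this comparison favors the OLS model by construction: it was fit on the full data set, whereas the sklearn model was fit only on the 70% training split, and both are scored here on the full data set. Both libraries implement the same ordinary least-squares estimator, so fitting them on the same rows would be expected to give matching coefficients and errors.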