# Linear Regression

Build a linear regression model with scikit-learn on the Boston housing data to predict
'Price' from the other (independent) variables.


In [None]:
import numpy as np

import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import sklearn
from sklearn.datasets import load_boston
import pickle
import warnings
# %matplotlib inline
warnings.filterwarnings('ignore')

In [None]:
# Load the Boston housing dataset via scikit-learn and wrap its feature
# matrix in a DataFrame; the target values are attached in a later cell.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 -- confirm the pinned scikit-learn version still provides it.
boston = load_boston()
bos = pd.DataFrame(boston.data)

In [None]:
bos.columns

In [None]:
bos

In [None]:
# Give the 13 feature columns short lowercase names (replacing the current column labels).
bos.columns=["crim","zn","indus","chas","nox","rm","age","dis","rad","tax","ptratio","black","lstat"]

In [None]:
# Append the regression target from the loaded dataset as the 'prices' column.
bos['prices']=boston.target

In [None]:
bos

In [None]:
bos.shape

In [None]:
bos.info() # printing the summary of the dataframe

In [None]:
bos.isna().sum() # finding the count of missing values from different columns

Now, let's visualise the relationship between each feature and the target column.

In [None]:
# Feature matrix: every column of `bos` except the target.
x_label = bos.drop(columns=['prices'])
x_label

In [None]:
# Target vector: the 'prices' column as a Series.
y_label = bos['prices']
y_label

In [None]:
# Scatter every feature against the target, one panel per feature,
# laid out on a 5x3 grid (so at most 15 panels are drawn).
plt.figure(figsize=(20, 30), facecolor='white')
plotnumber = 1

for column in x_label.columns:
    if plotnumber <= 15:
        ax = plt.subplot(5, 3, plotnumber)
        ax.scatter(x_label[column], y_label)
        ax.set_xlabel(column, fontsize=20)
        ax.set_ylabel('Prices', fontsize=20)
    plotnumber += 1

plt.tight_layout()

In [None]:
import pandas as pd 
import numpy as np 
from sklearn.preprocessing import StandardScaler 
from sklearn.linear_model  import Ridge,Lasso,RidgeCV, LassoCV, ElasticNet, ElasticNetCV, LinearRegression
from sklearn.model_selection import train_test_split
import statsmodels.api as sm 
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [None]:
# let's see how data is distributed for every column
plt.figure(figsize=(20,25), facecolor='white')
plotnumber = 1

for column in bos:
    if plotnumber<=16 :
        ax = plt.subplot(4,4,plotnumber)
        sns.distplot(bos[column])
        plt.xlabel(column,fontsize=20)
        #plt.ylabel('Salary',fontsize=20)
    plotnumber+=1
plt.tight_layout()

In [None]:
# Standardise the feature columns with StandardScaler.
# NOTE(review): the scaler is fitted on the full feature matrix here, i.e. on
# all rows of x_label, before any train/test split has been made.
scaler =StandardScaler()

x_scaled = scaler.fit_transform(x_label)

In [None]:
# let's see how data is distributed for every column
plt.figure(figsize=(20,25), facecolor='white')
plotnumber = 1

for column in bos:
    if plotnumber<=16 :
        ax = plt.subplot(4,4,plotnumber)
        sns.distplot(x_scaled)
        plt.xlabel(column,fontsize=20)
        #plt.ylabel('Salary',fontsize=20)
    plotnumber+=1
plt.tight_layout()

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Multicollinearity check on the standardised features.
# A variance inflation factor is computed for each column of `variables`
# (it is a per-variable measure), and the feature names are stored next to
# the values so the resulting table is easy to read.
variables = x_scaled

vif = pd.DataFrame(
    {
        "VIF": [
            variance_inflation_factor(variables, col_idx)
            for col_idx in range(variables.shape[1])
        ],
        "Features": x_label.columns,
    }
)

In [None]:
vif

In [None]:
# Split the raw (unscaled) features first, then fit the scaler on the training
# rows only. Standardising with statistics computed over all rows would let
# information from the held-out rows leak into the training features and make
# the test score optimistic.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    x_label, y_label, test_size=0.25, random_state=355
)

train_scaler = StandardScaler().fit(x_train_raw)
# x_train / x_test remain NumPy arrays, as downstream cells expect.
x_train = train_scaler.transform(x_train_raw)
x_test = train_scaler.transform(x_test_raw)

In [None]:
print("x_train shape", x_train.shape)
print("x_test shape", x_test.shape)

In [None]:
print(pd.DataFrame(x_train).corr(method='pearson'))

In [None]:
#Using Pearson Correlation
plt.figure(figsize=(12,10))
cor = pd.DataFrame(x_train).corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.CMRmap_r)
plt.show()                            

In [None]:
bos.columns

In [None]:
# With the following function we can select highly correlated features.
# For every pair of columns whose absolute correlation exceeds the threshold,
# it flags the later column of the pair (the earlier one is kept).

def correlation(dataset, threshold):
    """Return the names of columns that are highly correlated with an earlier column.

    Args:
        dataset: DataFrame whose pairwise column correlations are computed
            with ``dataset.corr()``.
        threshold: A column is flagged when the absolute value of its
            correlation with any column positioned before it is strictly
            greater than this value.

    Returns:
        A set with the flagged column names (empty when nothing exceeds the
        threshold).
    """
    corr_matrix = dataset.corr()
    names = corr_matrix.columns
    flagged = set()
    for row in range(len(names)):
        # Only the strictly lower triangle is inspected, so each pair is
        # looked at once and the diagonal (self-correlation) is skipped.
        earlier = corr_matrix.iloc[row, :row]
        if (earlier.abs() > threshold).any():
            flagged.add(names[row])
    return flagged

In [None]:
x_train_df=pd.DataFrame(x_train)

In [None]:
# Columns whose absolute correlation with an earlier column exceeds 0.8;
# `correlation` already returns a set, so its size is the number of such columns.
corr_features = correlation(x_train_df, threshold=0.8)
len(corr_features)

In [None]:
corr_features

In [None]:
# Wrap each part of the train/test split in a DataFrame.
# y_test_df is built from the test targets (y_test), not from the test features.
x_test_df = pd.DataFrame(x_test)
y_test_df = pd.DataFrame(y_test)
y_train_df = pd.DataFrame(y_train)
x_train_df = pd.DataFrame(x_train)

In [None]:
# Remove the highly correlated columns from both feature frames.
# DataFrame.drop returns a new frame instead of modifying in place, so the
# result has to be assigned back for the removal to take effect.
# errors="ignore" keeps the cell safe to re-run after the columns are gone.
x_train_df = x_train_df.drop(columns=list(corr_features), errors="ignore")
x_test_df = x_test_df.drop(columns=list(corr_features), errors="ignore")

x_test_df


In [None]:
# Fit a scikit-learn LinearRegression model on the training features and targets.
regression = LinearRegression()

regression.fit(x_train_df,y_train)

In [None]:
regression.score(x_train_df,y_train)

In [None]:
regression.score(x_test_df,y_test)

In [None]:
y_pred=regression.predict(x_test_df)

In [None]:
from sklearn.metrics import r2_score

In [None]:
r2_score(y_test, y_pred)

In [None]:
from sklearn.metrics import mean_squared_error


In [None]:
mean_squared_error(y_test,y_pred)

In [None]:
# saving the model to the local file system
# Persist the fitted model to the local file system.
# The context manager guarantees the file is flushed and closed even if
# pickling raises.
filename = 'reg_finalized_model.pickle'
with open(filename, 'wb') as model_file:
    pickle.dump(regression, model_file)

Statsmodels example

In [None]:
import statsmodels.api as sm


In [None]:
# Add a constant (intercept) column to the training features and set up an
# ordinary least squares model with statsmodels; fitting happens in the next cell.
X_with_constant = sm.add_constant(x_train_df.to_numpy())
model = sm.OLS(y_train, X_with_constant)

In [None]:

results = model.fit()
results.params

In [None]:
print(results.summary())
