# Linear Regression with housing dataset
---

In [3]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
# import df_LDA
X = pd.read_csv('df_LDA.csv')
C = pd.read_csv('df_LDA.csv')

In [30]:
#drop cplumn class 
X = X.drop(columns = ['class'])

In [31]:
X.head()

Unnamed: 0,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,5.010635,0.693147,2.302585,0.19062,1.94591,5.902633
1,5.420535,0.693147,3.828641,0.322083,1.098612,5.874931
2,5.01728,1.386294,0.00995,0.00995,0.693147,5.902633
3,4.49981,0.693147,5.602119,1.729884,0.693147,5.273
4,4.394449,2.397895,2.302585,0.09531,0.693147,0.00995


In [50]:
C.corr()

  C.corr()


Unnamed: 0,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
price,1.0,0.06368,-0.066051,-0.060625,0.04959,0.094194
minimum_nights,0.06368,1.0,-0.221021,-0.294469,0.344036,0.150513
number_of_reviews,-0.066051,-0.221021,1.0,0.758786,-0.095683,0.292486
reviews_per_month,-0.060625,-0.294469,0.758786,1.0,-0.033008,0.367423
calculated_host_listings_count,0.04959,0.344036,-0.095683,-0.033008,1.0,0.341967
availability_365,0.094194,0.150513,0.292486,0.367423,0.341967,1.0


In [7]:
# now we import statsmodel
import statsmodels.api as sm

In [32]:
# Adding a constant to the dataset
X = sm.add_constant(X)

In [33]:
# set the target/ dependent variable called 'y'
y = X['price']
# set independent variable called 'X'
X = X.drop(columns = ['price'])


In [34]:
X

Unnamed: 0,const,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,1.0,0.693147,2.302585,0.190620,1.945910,5.902633
1,1.0,0.693147,3.828641,0.322083,1.098612,5.874931
2,1.0,1.386294,0.009950,0.009950,0.693147,5.902633
3,1.0,0.693147,5.602119,1.729884,0.693147,5.273000
4,1.0,2.397895,2.302585,0.095310,0.693147,0.009950
...,...,...,...,...,...,...
48890,1.0,1.098612,0.009950,0.009950,1.098612,2.302585
48891,1.0,1.609438,0.009950,0.009950,1.098612,3.610918
48892,1.0,2.397895,0.009950,0.009950,0.693147,3.332205
48893,1.0,0.693147,0.009950,0.009950,1.945910,1.098612


In [47]:
# scatter plot of price and minimum_nights


Now, we can create a Python object that will represent _linear regression_:

In [35]:
# create a python object that will represent linear regression model
lin_reg = sm.OLS(y, X)

In [36]:
# check type for lin_reg
type(lin_reg)

statsmodels.regression.linear_model.OLS

In [37]:
# fit the model to the training data
model = lin_reg.fit()
print_model = model.summary()
print(print_model)

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.021
Model:                            OLS   Adj. R-squared:                  0.021
Method:                 Least Squares   F-statistic:                     206.6
Date:                Tue, 09 May 2023   Prob (F-statistic):          8.86e-219
Time:                        23:58:49   Log-Likelihood:                -51032.
No. Observations:               48895   AIC:                         1.021e+05
Df Residuals:                   48889   BIC:                         1.021e+05
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                                     coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------
const       

### What do these values mean?

The three most important metrics are:
- `Mean absolute error`: The mean absolute error is the sum of absolute differences between predictions and actual values. It is a measure of how well a model predicts the correct values or how wrong the predictions are.
- `Mean squared error`: The mean squared error is the sum of squared differences between predictions and actual values. 
- `R-squared`: This is the goodness of fit of the model to the data. In statistics this is called the coefficient of determination. The value ranges from 0 to 1, where 1 means perfect prediction and 0 means no fit

### Now lets do the same with a sklearn library

In [38]:
# import sklearn
from sklearn.linear_model import LinearRegression

In [39]:
# instantiate the model
regressor = LinearRegression()
regressor.fit(X, y)

We should see a summary like this:

```
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
```

This gives us an overview of the parameters we can set up for _linear regression_ in sklearn. The most important one is `fit_intercept`. In `sklearn`, we don't have to add a constant to a dataset. We have to set this parameter to the value `True` if we want to compute an intercept as well. 

We can check the beta coefficient now:

In [40]:
print (regressor.get_params())

{'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'positive': False}


In [41]:
print (regressor.coef_)

[ 0.          0.01237676 -0.02365019 -0.07889321 -0.00727699  0.03800326]


This will show us a NumPy array with beta coefficients. They have the same order as our columns in X.

We can see that the results look much nicer in the `statsmodel` package. Another huge disadvantage of `sklearn` is that we don't have access to p-values, so we cannot check the importance of different variables for prediction.

If we want to know the R-squared, we can get it with:

In [42]:
regressor.score(X, y,)

0.020693208665440777

## Strech: Turning this into a classification model

In [5]:
C.head()

Unnamed: 0,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,class
0,5.010635,0.693147,2.302585,0.19062,1.94591,5.902633,medium
1,5.420535,0.693147,3.828641,0.322083,1.098612,5.874931,expensive
2,5.01728,1.386294,0.00995,0.00995,0.693147,5.902633,medium
3,4.49981,0.693147,5.602119,1.729884,0.693147,5.273,medium
4,4.394449,2.397895,2.302585,0.09531,0.693147,0.00995,cheap


In [10]:
import copy

In [11]:
W = copy.deepcopy(C)

In [12]:
# split dataframe W into training and testing data this is the function that is used to split the data into training and testing data

def split_dataframe(df, test_size=0.2):
    msk = np.random.rand(len(df)) < test_size
    train = df[msk]
    test = df[~msk]
    return train, test

In [14]:
W_train, W_test = split_dataframe(W)

In [17]:
W_train.head(3)

Unnamed: 0,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,class
1,5.420535,0.693147,3.828641,0.322083,1.098612,5.874931,expensive
2,5.01728,1.386294,0.00995,0.00995,0.693147,5.902633,medium
4,4.394449,2.397895,2.302585,0.09531,0.693147,0.00995,cheap


In [18]:
# prepare dataset for prediction
X = W_train.drop(columns = ['price', 'class'])
y = W_train['class']

In [19]:
# import random forest classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [20]:
# instanciate the module
model = RandomForestClassifier()

In [21]:
# train the model
model.fit(X, y)

In [26]:
# evaluate the model
score = model.score(X, y)

In [27]:
score

0.8937615812229771

##### Making predictions

In [28]:
W_test.head(3)

Unnamed: 0,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,class
0,5.010635,0.693147,2.302585,0.19062,1.94591,5.902633,medium
3,4.49981,0.693147,5.602119,1.729884,0.693147,5.273,medium
5,5.303305,1.386294,4.317488,0.463734,0.693147,4.867534,expensive


In [29]:
X_test = W_test.drop(columns = ['price', 'class'])

In [30]:
X_test.head()

Unnamed: 0,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,0.693147,2.302585,0.19062,1.94591,5.902633
3,0.693147,5.602119,1.729884,0.693147,5.273
5,1.386294,4.317488,0.463734,0.693147,4.867534
6,3.828641,3.912023,0.336472,0.693147,0.00995
7,1.098612,6.066108,1.497388,0.693147,5.398163


In [32]:
y_test = W_test['class']

In [31]:
# Predictions
predictions = model.predict(X_test)

In [33]:
model.score(X_test, y_test)

0.4309741966769608

In [34]:
# our model got only 43% accuracy

In [None]:
# 