In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
# Read the label-encoded rental listings CSV from the course-hosted URL
df = pd.read_csv(
    "https://static.bc-edx.com/ai/ail-v-1-0/m12/lesson_3/datasets/rent-data-label-encoded.csv"
)
df.head(n=5)

Unnamed: 0,category,bathrooms,bedrooms,fee,has_photo,pets_allowed,price,price_type,square_feet,cityname,...,Gated,TV,Hot Tub,Tennis,Wood Floors,View,Alarm,Doorman,Luxury,Golf
0,0,,0.0,0,1,3,790,0,101,1468,...,0,0,0,0,0,0,0,0,0,0
1,0,,1.0,0,1,3,425,0,106,449,...,0,0,0,0,0,0,0,0,0,0
2,0,1.0,0.0,0,1,3,1390,0,107,52,...,0,0,0,0,0,0,0,0,0,0
3,0,1.0,0.0,0,1,3,925,0,116,1285,...,0,0,0,0,0,0,0,0,0,0
4,0,,0.0,0,1,3,880,0,125,52,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# (rows, columns) of the frame as loaded, before any cleaning
df.shape

(10000, 42)

In [5]:
# Discard every row that has at least one missing value
df = df.dropna(axis=0, how="any")

In [6]:
# (rows, columns) after the rows with missing values were removed
df.shape

(9950, 42)

## Split into training and testing sets

In [7]:
# Full feature matrix: every column of df apart from the price target
X_full = df.drop(labels="price", axis="columns")
X_full.columns

Index(['category', 'bathrooms', 'bedrooms', 'fee', 'has_photo', 'pets_allowed',
       'price_type', 'square_feet', 'cityname', 'state', 'latitude',
       'longitude', 'source', 'time', 'Dishwasher', 'Elevator', 'Patio/Deck',
       'Pool', 'Storage', 'Refrigerator', 'AC', 'Basketball',
       'Cable or Satellite', 'Gym', 'Internet Access', 'Clubhouse', 'Parking',
       'Garbage Disposal', 'Fireplace', 'Washer Dryer', 'Playground', 'Gated',
       'TV', 'Hot Tub', 'Tennis', 'Wood Floors', 'View', 'Alarm', 'Doorman',
       'Luxury', 'Golf'],
      dtype='object')

In [8]:
# Hand-picked subset of feature columns for the smaller model
select_features = [
    "square_feet",
    "Gated",
    "bathrooms",
    "bedrooms",
    "has_photo",
    "Pool",
    "AC",
]

# Reduced feature matrix holding only the columns named in select_features
X_sel = df.loc[:, select_features]
X_sel.head()

Unnamed: 0,square_feet,Gated,bathrooms,bedrooms,has_photo,Pool,AC
2,107,0,1.0,0.0,1,0,0
3,116,0,1.0,0.0,1,0,0
5,130,0,1.0,0.0,1,1,0
8,138,0,1.0,0.0,1,0,0
14,190,0,1.0,0.0,1,0,0


In [9]:
# Target: the price column as a single-column 2-D array (n_rows, 1)
y = df["price"].to_numpy().reshape(-1, 1)

In [10]:
# Split both feature matrices and the target in one call so that all three
# share the same row partition; random_state pins the shuffle.
(
    X_full_train, X_full_test,
    X_sel_train, X_sel_test,
    y_train, y_test,
) = train_test_split(X_full, X_sel, y, random_state=42)

## Train the models

In [11]:
# Two independent linear regression estimators
lr1, lr2 = LinearRegression(), LinearRegression()

# lr1 is trained on every feature column
lr1.fit(X_full_train, y_train)

# lr2 is trained on the selected feature subset only
lr2.fit(X_sel_train, y_train)

## Evaluate the models

In [12]:
# Score both fitted models on the held-out test split
# using mean squared error and R-squared.

# Each model predicts from its own test feature matrix
predicted1, predicted2 = lr1.predict(X_full_test), lr2.predict(X_sel_test)

# Metrics for the all-features model, then for the select-features model
mse1, r2_1 = mean_squared_error(y_test, predicted1), r2_score(y_test, predicted1)
mse2, r2_2 = mean_squared_error(y_test, predicted2), r2_score(y_test, predicted2)

# One print call, one report line per argument
print(
    "All Features:",
    f"mean squared error (MSE): {mse1}",
    f"R-squared (R2): {r2_1}",
    "---------------------",
    "Select Features:",
    f"mean squared error (MSE): {mse2}",
    f"R-squared (R2): {r2_2}",
    sep="\n",
)

All Features:
mean squared error (MSE): 517096.7239420295
R-squared (R2): 0.36119653226983583
---------------------
Select Features:
mean squared error (MSE): 576253.8595848796
R-squared (R2): 0.2881158461236345


In [14]:
# Provided code to create the adjusted r-squared function
def r2_adj(x, y, model):
    """Return the adjusted R-squared of ``model`` scored on ``x`` and ``y``.

    Computes ``1 - (1 - r2) * (n - 1) / (n - p - 1)`` where ``r2`` is
    ``model.score(x, y)``, ``n`` is ``len(y)`` and ``p`` is ``x.shape[1]``.

    Parameters
    ----------
    x : 2-D array-like exposing ``.shape``; one column per feature.
    y : sized array-like of targets; its length is the observation count.
    model : object exposing ``score(x, y)``.

    Raises
    ------
    ValueError
        If ``len(y) - x.shape[1] - 1`` is not positive. The correction term
        would otherwise divide by zero or flip sign and yield a meaningless
        number.
    """
    n_obs = len(y)
    n_cols = x.shape[1]

    # Residual degrees of freedom; must be > 0 for the correction to be defined
    dof = n_obs - n_cols - 1
    if dof <= 0:
        raise ValueError(
            "adjusted R-squared is undefined: need more observations "
            f"({n_obs}) than features + 1 ({n_cols + 1})"
        )

    r2 = model.score(x, y)
    return 1 - (1 - r2) * (n_obs - 1) / dof

In [15]:
# Adjusted R-squared of each model on the test split
adj_score1, adj_score2 = (
    r2_adj(X_full_test, y_test, lr1),
    r2_adj(X_sel_test, y_test, lr2),
)

print(
    f"All Features Adjusted R2: {adj_score1}",
    f"Select Features Adjusted R2: {adj_score2}",
    sep="\n",
)

All Features Adjusted R2: 0.35048886989169326
Select Features Adjusted R2: 0.286106495689306


In [16]:
# Cross-validate a fresh linear regression on the full-feature training data
# (the all-features model had the higher adjusted R2 on the test split).
# cv=5 matches scikit-learn's default fold count and is spelled out so the
# number of scores printed below is not a surprise.
cv_scores = cross_val_score(
    LinearRegression(), X_full_train, y_train, cv=5, scoring='r2'
)
print(f"All Scores: {cv_scores}")
print(f"Mean Score: {cv_scores.mean()}")
print(f"Standard Deviation: {cv_scores.std()}")

All Scores: [0.2791709  0.33997437 0.16930788 0.40130909 0.4533232 ]
Mean Score: 0.3286170888238094
Standard Deviation: 0.09878165797528618
