In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,r2_score,explained_variance_score
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.linear_model import LinearRegression
from math import sqrt
import warnings
warnings.filterwarnings("ignore")
import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std

# Import data acquisition and prep
from acquire import get_data
from prep import prep_data

#import our scripts that assist data science workflow
import split_scale
import features
# import evaluate

In [2]:
# This cell acquires and prepares the data
# Get the raw data from .csv or MySQL query
raw = get_data()

# Remove nulls
df = prep_data(raw)

Reading data from ./zillow.csv


In [3]:
train, test = split_scale.split_my_data(df)

In [4]:
columns_to_drop = ["taxvalue"]

In [5]:
scaler, train_scaled, test_scaled = split_scale.standard_scaler(train.drop(columns=columns_to_drop),test.drop(columns=columns_to_drop))

In [6]:
print(train_scaled)
print(test_scaled)

       bathrooms  bedrooms      sqft
4432    0.156363  0.782807  0.180526
6689    0.156363 -0.187296 -0.171848
14953  -0.341933 -1.157398 -0.406079
14879   0.654659  0.782807  0.335652
7771   -0.341933 -1.157398 -0.212941
...          ...       ...       ...
13565   0.654659 -0.187296 -0.154384
7863   -0.341933 -0.187296 -0.583807
15512  -0.341933 -0.187296 -0.124591
17876  -0.341933 -0.187296 -0.712223
15862  -1.338524 -1.157398 -1.205341

[14570 rows x 3 columns]
       bathrooms  bedrooms      sqft
3712    0.156363  0.782807 -0.047541
12581  -1.338524 -1.157398 -0.353685
16966  -1.338524 -1.157398 -1.011176
13563  -0.341933  0.782807 -0.173903
11764   0.654659 -0.187296  0.835962
...          ...       ...       ...
4162    1.651250 -0.187296  6.382511
14293   0.654659 -0.187296  0.888355
17782   1.651250  0.782807  2.540300
17930  -1.338524 -1.157398 -1.019395
970    -0.341933 -0.187296 -0.018776

[3643 rows x 3 columns]


In [7]:
X_train = train_scaled
y_train = train[["taxvalue"]]
X_test = test_scaled
y_test = test[["taxvalue"]]

In [8]:
# Select the optimal number of features:
optimal_number = features.optimal_number_of_features(X_train, y_train)
optimal_number = optimal_number[0]
print("The optimal number of features is", optimal_number)

The optimal number of features is 3


In [9]:
features.optimal_features(X_train, X_test, y_train, optimal_number)

(Index(['bathrooms', 'bedrooms', 'sqft'], dtype='object'),
        bathrooms  bedrooms      sqft
 0       0.156363  0.782807  0.180526
 1       0.156363 -0.187296 -0.171848
 2      -0.341933 -1.157398 -0.406079
 3       0.654659  0.782807  0.335652
 4      -0.341933 -1.157398 -0.212941
 ...          ...       ...       ...
 14565   0.654659 -0.187296 -0.154384
 14566  -0.341933 -0.187296 -0.583807
 14567  -0.341933 -0.187296 -0.124591
 14568  -0.341933 -0.187296 -0.712223
 14569  -1.338524 -1.157398 -1.205341
 
 [14570 rows x 3 columns],
       bathrooms  bedrooms      sqft
 0      0.156363  0.782807 -0.047541
 1     -1.338524 -1.157398 -0.353685
 2     -1.338524 -1.157398 -1.011176
 3     -0.341933  0.782807 -0.173903
 4      0.654659 -0.187296  0.835962
 ...         ...       ...       ...
 3638   1.651250 -0.187296  6.382511
 3639   0.654659 -0.187296  0.888355
 3640   1.651250  0.782807  2.540300
 3641  -1.338524 -1.157398 -1.019395
 3642  -0.341933 -0.187296 -0.018776
 
 [3643 row

In [10]:
# First Model
model = LinearRegression()
model.fit(X_train, y_train)

model.predict(X_train)
predictions = model.predict(X_train)
predictions = predictions.flatten()

In [11]:
compare_model = pd.DataFrame()

compare_model["actual"] = y_train.taxvalue
compare_model["predicted"] = predictions
compare_model["baseline"] = y_train.mean()[0]

In [12]:
compare_model.head()

Unnamed: 0,actual,predicted,baseline
4432,585076.0,492481.535711,518416.357172
6689,498687.0,475954.642133,518416.357172
14953,348230.0,487874.272191,518416.357172
14879,801085.0,588275.484738,518416.357172
7771,60425.0,575683.328057,518416.357172


In [13]:
def modeling_function(X_train,X_test,y_train,y_test):
    predictions_train=pd.DataFrame({'actual':y_train.taxvalue}).reset_index(drop=True)
    predictions_test=pd.DataFrame({'actual':y_test.taxvalue}).reset_index(drop=True)

    #model 1
    lm1=LinearRegression()
    lm1.fit(X_train,y_train)
    lm1_predictions=lm1.predict(X_train)
    predictions_train['lm1']=lm1_predictions

    #model 2
    lm2=LinearRegression()
    lm2.fit(X_test,y_test)
    lm2_predictions=lm2.predict(X_test)
    predictions_test['lm2']=lm2_predictions
    
    return predictions_train,predictions_test

In [16]:
model_train, model_test=modeling_function(X_train, X_test, y_train, y_test)

Unnamed: 0,actual,lm2
0,313298.0,3.906272e+05
1,386054.0,4.781917e+05
2,695000.0,1.485311e+05
3,262489.0,3.036400e+05
4,1040630.0,1.013216e+06
5,357258.0,2.983272e+05
6,52960.0,6.360587e+04
7,68503.0,1.882375e+04
8,1182533.0,1.434017e+06
9,463321.0,5.379115e+05


NameError: name 'X_train1' is not defined