# Data Loading And Cleaning

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
%matplotlib inline

In [None]:
# Read the course's house-data CSV directly from its object-storage URL and
# preview the first rows.
file_name = 'https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/DA0101EN/coursera/project/kc_house_data_NaN.csv'
df = pd.read_csv(file_name)
df.head()

In [None]:
# Show the dtype pandas inferred for every column of the loaded frame.
df.dtypes

In [None]:
# Remove the 'id' and 'Unnamed: 0' columns, which no cell shown in this
# notebook uses, then summarise the remaining numeric columns.
# A non-mutating drop with errors='ignore' keeps this cell idempotent: the
# original in-place drop raised KeyError when the cell was executed twice.
df = df.drop(columns=['id', 'Unnamed: 0'], errors='ignore')
df.describe()

In [None]:
# Boolean mask of missing values; for each column print how many entries are
# missing (True) versus present (False).
null = df.isnull()
for column_name, missing_flags in null.items():
    print(missing_flags.value_counts())
    

In [None]:
# Number of missing entries per column (isna is the pandas alias of isnull).
null = df.isna().sum()
null

In [None]:
# Impute missing values column by column, each with its own column mean.
# The previous df.replace(np.nan, bedmean) filled *every* NaN in the frame
# with the bedrooms mean, so missing bathrooms were given the wrong value
# and the later bathrooms replacement had nothing left to fill.
bedmean = df['bedrooms'].mean()
df['bedrooms'] = df['bedrooms'].fillna(bedmean)
bathmean = df['bathrooms'].mean()
df['bathrooms'] = df['bathrooms'].fillna(bathmean)

# Re-count the missing values; any column still non-zero here needs its own
# imputation strategy before modelling.
null = df.isnull().sum()
null

# Exploratory Data Analysis

In [None]:
# Frequency of each distinct value in the 'floors' column.
df['floors'].value_counts()

In [None]:
# Price distribution for each value of the 'waterfront' column.
sns.boxplot(x="waterfront", y="price", data=df)

In [None]:
# Pearson correlation of every numeric column with 'price', ascending.
# Only numeric columns are passed to corr(): pandas >= 2.0 raises on
# non-numeric columns (NOTE(review): the frame presumably still has a string
# 'date' column - confirm against the df.dtypes output), while older pandas
# silently dropped them, so the result is the same on both.
df.select_dtypes(include='number').corr()['price'].sort_values()

In [None]:
# Heatmap of the pairwise correlations between the numeric columns.
# Restricting to numeric columns keeps corr() working on pandas >= 2.0,
# which raises on non-numeric columns instead of silently dropping them.
sns.heatmap(df.select_dtypes(include='number').corr(), cmap='RdBu')

In [None]:
# Scatter of price against living area with a fitted regression line.
sns.regplot(data=df, x="sqft_living", y='price')

In [None]:
# Scatter of price against grade with a fitted regression line.
sns.regplot(data=df, x="grade", y='price')

# Model 1 - Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [None]:
# Z: the eleven-column feature matrix used by the multi-feature models.
feature_columns = [
    "floors", "waterfront", "lat", "bedrooms", "sqft_basement", "view",
    "bathrooms", "sqft_living15", "sqft_above", "grade", "sqft_living",
]
Z = df[feature_columns]

# X: the single predictor for the simple linear model; y: the target.
X = df[["sqft_living"]]
y = df['price']

# 80/20 train/test split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=60
)

In [None]:
# Fit ordinary least squares of price on sqft_living using the training split.
# `model1` is bound to whatever `reg.fit` returns (scikit-learn estimators
# return themselves from fit, so `model1` and `reg` are the same object here).
reg=LinearRegression()
model1=reg.fit(X_train, y_train)

In [None]:
# Score of model 1 on the training split (R^2 for scikit-learn regressors).
model1.score(X_train, y_train)

In [None]:
# Score of model 1 on the held-out test split.
model1.score(X_test, y_test)

In [None]:
# Keep the fitted slope(s) and intercept of model 1; display the slope here.
coef=model1.coef_
intercept=model1.intercept_
coef

In [None]:
# Display the intercept of model 1 stored in the previous cell.
intercept

In [None]:
# Residuals of a simple linear fit of price on sqft_living (seaborn fits its
# own regression here; it does not use model1).
# The data vectors are passed by keyword: seaborn >= 0.12 made x/y
# keyword-only, so the old positional call fails there.
sns.residplot(x=df['sqft_living'], y=df['price'])

In [None]:
# Predict with model 1 for every row of X (training and test rows alike);
# yhat1 is reused by the comparison plot and the MSE summary below.
yhat1=model1.predict(X)
yhat1

In [None]:
# Overlay the density of the actual prices (red) with the density of model 1's
# predictions (blue). sns.kdeplot replaces sns.distplot(hist=False), which
# seaborn has deprecated; the explicit legend makes the labels visible.
axl = sns.kdeplot(df['price'], color="r", label="Original Price")
sns.kdeplot(yhat1, color="b", label="Predicted Price", ax=axl)
axl.legend();

# Model 2 - Multiple Regression

In [None]:
from sklearn.preprocessing import StandardScaler,PolynomialFeatures

In [None]:
# Standardise the multi-feature matrix Z (zero mean, unit variance per column).
# The scaler must be fitted on Z itself: fitting it on the single-column X
# and then transforming the eleven-column Z fails with a feature-count
# mismatch.
# NOTE(review): the scaler sees all rows, including those that later land in
# the test split; the Pipeline section at the end avoids that leakage.
scale = StandardScaler()
scale.fit(Z)
x_scale = scale.transform(Z)

In [None]:
# Display the scaled feature array produced by the previous cell.
x_scale

In [None]:
# Re-split using the scaled multi-feature matrix (same seed and ratio as
# before) and fit a fresh linear regression on it. This rebinds the shared
# X_train/X_test/y_train/y_test and `reg` names.
X_train, X_test, y_train, y_test = train_test_split(x_scale, y, test_size=0.2, random_state=60)
reg=LinearRegression()
model2=reg.fit(X_train, y_train)

In [None]:
# Score of model 2 on the training split.
model2.score(X_train,y_train)

In [None]:
# Score of model 2 on the held-out test split.
model2.score(X_test,y_test)

In [None]:
# Predict with model 2 for every row, then overlay the density of the actual
# prices (red) with the density of the predictions (blue).
# sns.kdeplot replaces sns.distplot(hist=False), which seaborn has
# deprecated; the explicit legend makes the labels visible.
yhat2 = model2.predict(x_scale)
axl = sns.kdeplot(df['price'], color="r", label="Original Price")
sns.kdeplot(yhat2, color="b", label="Predicted Price", ax=axl)
axl.legend();

# Model 3 - Polynomial Regression

In [None]:
# Sweep the polynomial degree and record train/test scores for each one.
# Each entry is [label, score]; the label now carries the current degree `d`
# (it previously stringified the whole `degree` array, so every row had the
# same useless label).
# NOTE(review): the number of polynomial terms grows combinatorially with the
# degree, so the upper degrees may be very expensive in time and memory -
# confirm the range is feasible for this dataset.
train_score = []
test_score = []
degree = np.arange(1, 10, 1)
for d in degree:
    pr = PolynomialFeatures(degree=d)
    X_pr = pr.fit_transform(x_scale)
    X_train, X_test, y_train, y_test = train_test_split(X_pr, y, test_size=0.2, random_state=60)
    reg = LinearRegression()
    model3 = reg.fit(X_train, y_train)
    label = "degree= " + str(d)
    train_score.append([label, model3.score(X_train, y_train)])
    test_score.append([label, model3.score(X_test, y_test)])
    

In [None]:
# Training-split scores collected by the most recent sweep.
train_score

In [None]:
# Test-split scores collected by the most recent sweep.
test_score

In [None]:
# Degree chosen by the author for the best model: d=3.
# Rebuild the degree-3 polynomial features from the scaled matrix, re-split
# with the same seed, and refit; this resets d, pr, X_pr, the split variables
# and model3, which the sweep above left at its last iteration.
d=3
pr=PolynomialFeatures(degree=d)
X_pr=pr.fit_transform(x_scale)
X_train, X_test, y_train, y_test = train_test_split(X_pr, y, test_size=0.2, random_state=60)
reg=LinearRegression()
model3=reg.fit(X_train, y_train)

In [None]:
# Score of model 3 on ALL rows (training and test together), not only on the
# held-out split.
model3.score(X_pr,y)

In [None]:
# Predict with model 3 for every row; yhat3 is reused by the comparison plot
# and the MSE summary below.
yhat3=model3.predict(X_pr)
yhat3

In [None]:
# Overlay the density of the actual prices (red) with the density of model 3's
# predictions (blue). sns.kdeplot replaces sns.distplot(hist=False), which
# seaborn has deprecated; the explicit legend makes the labels visible.
axl = sns.kdeplot(df['price'], color="r", label="Original Price")
sns.kdeplot(yhat3, color="b", label="Predicted Price", ax=axl)
axl.legend();

# Model 4 - Lasso Regression

In [None]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_val_score

In [None]:
# Sweep the Lasso regularisation strength and record train/test scores.
# Each entry is [label, score].
train_score = []
test_score = []
alpha = [0.001, 0.01, 0.1, 1, 5, 10, 50, 100]

# The split does not depend on alpha, so it is done once instead of being
# recomputed identically on every iteration.
X_train, X_test, y_train, y_test = train_test_split(x_scale, y, test_size=0.2, random_state=60)

for a in alpha:
    m4 = Lasso(alpha=a)
    model4 = m4.fit(X_train, y_train)
    label = "alpha= " + str(a)
    train_score.append([label, model4.score(X_train, y_train)])
    test_score.append([label, model4.score(X_test, y_test)])

In [None]:
# Training-split scores collected by the most recent sweep.
train_score

In [None]:
# Test-split scores collected by the most recent sweep.
test_score

In [None]:
# Alpha chosen by the author from the sweep: best alpha = 0.001.
a = 0.001

# Re-create the split explicitly instead of relying on whatever X_train /
# y_train the previously executed cell left behind; same data, ratio and seed
# as the sweep, so the split is identical.
X_train, X_test, y_train, y_test = train_test_split(x_scale, y, test_size=0.2, random_state=60)

m4 = Lasso(alpha=a)
model4 = m4.fit(X_train, y_train)

# Predictions for every row (used by the plot and the MSE summary below);
# the displayed score is on the held-out test split.
yhat4 = model4.predict(x_scale)
model4.score(X_test, y_test)

In [None]:
# Overlay the density of the actual prices (red) with the density of model 4's
# predictions (blue). sns.kdeplot replaces sns.distplot(hist=False), which
# seaborn has deprecated; the explicit legend makes the labels visible.
axl = sns.kdeplot(df['price'], color="r", label="Original Price")
sns.kdeplot(yhat4, color="b", label="Predicted Price", ax=axl)
axl.legend();

# Model Evaluation 

In [None]:
from sklearn.metrics import mean_squared_error

# Mean squared error of each model's predictions against the actual prices.
# The yhat arrays were predicted for every row, so these figures mix training
# and test rows.
for model_number, predicted in enumerate((yhat1, yhat2, yhat3, yhat4), start=1):
    print("model {} MSE".format(model_number), mean_squared_error(df['price'], predicted))

The best model is Model 3 (polynomial regression with degree 3).

In [None]:
# k-fold cross-validation of the degree-3 polynomial model on all rows.
# cv is the number of folds and cannot exceed the number of samples; the
# previous cv=200000 makes scikit-learn raise ValueError unless the dataset
# has at least that many rows (and would mean 200000 refits even then).
# Five folds is scikit-learn's own default.
scores = cross_val_score(model3, X_pr, y, cv=5)

In [None]:
# Average of the per-fold cross-validation scores computed above.
print("Mean Scores of Polynomial Model with degree 3 ="+str(scores.mean()))

# Model With Pipeline

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
# Sweep the polynomial degree with a scale -> polynomial -> linear-regression
# pipeline and record the test-split score for each degree as [label, score].
# The pipeline is fitted on the raw training rows only, so the scaler never
# sees the test split.
score = []
degree = np.arange(1, 8, 1)

# The split does not depend on the degree, so it is done once instead of
# being recomputed identically on every iteration.
xtr, xte, ytr, yte = train_test_split(Z, y, test_size=0.2, random_state=60)

for d in degree:
    Input = [
        ('scale', StandardScaler()),
        ('poly', PolynomialFeatures(degree=d, include_bias=False)),
        ('model', LinearRegression()),
    ]
    pipe = Pipeline(Input)
    pipe.fit(xtr, ytr)
    score.append(["degree:" + str(d), pipe.score(xte, yte)])

In [None]:
# Test-split scores per degree collected by the pipeline sweep above.
score