In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import r2_score

%matplotlib inline

In [2]:
s_train = pd.read_csv('datasets/clean_train.csv')
s_test = pd.read_csv('datasets/clean_test.csv')

In [3]:
s_train['gr_liv_area'].isnull().sum()

0

In [19]:
s_train['overall_cond'].isnull().sum()

0

In [None]:
s_train[''].isnull().sum()

In [18]:
s_train['exter_qual'].isnull().sum()

0

In [4]:
s_train['garage_area'].isnull().sum()

1

In [5]:
s_train['garage_area'].fillna(0)

0       475.0
1       559.0
2       246.0
3       400.0
4       484.0
        ...  
2046    520.0
2047    539.0
2048    342.0
2049    294.0
2050    442.0
Name: garage_area, Length: 2051, dtype: float64

In [6]:
s_train['garage_area'].isnull().sum()

1

In [7]:
#Trying out a very small model first. 
features = ['overall_qual', 'overall_cond', 'exter_qual', 'gr_liv_area', 'garage_area']
X = s_train[features]
y = s_train['saleprice']
X_test_data = s_test[features]
print(X.describe())
print(y.describe())

       overall_qual  overall_cond   exter_qual  gr_liv_area  garage_area
count   2051.000000   2051.000000  2051.000000  2051.000000  2050.000000
mean       6.112140      5.562165     2.593857  1499.330083   473.671707
std        1.426271      1.104497     0.587962   500.447829   215.934561
min        1.000000      1.000000     1.000000   334.000000     0.000000
25%        5.000000      5.000000     2.000000  1129.000000   319.000000
50%        6.000000      5.000000     3.000000  1444.000000   480.000000
75%        7.000000      6.000000     3.000000  1728.500000   576.000000
max       10.000000      9.000000     4.000000  5642.000000  1418.000000
count      2051.000000
mean     181469.701609
std       79258.659352
min       12789.000000
25%      129825.000000
50%      162500.000000
75%      214000.000000
max      611657.000000
Name: saleprice, dtype: float64


In [21]:
s_train[features].isnull().sum()

overall_qual    0
overall_cond    0
exter_qual      0
gr_liv_area     0
garage_area     1
dtype: int64

In [8]:
X.isnull().sum()

overall_qual    0
overall_cond    0
exter_qual      0
gr_liv_area     0
garage_area     1
dtype: int64

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.80, random_state=76)

In [10]:
# plt.figure(figsize=(15,15))
# sns.heatmap(s_train.corr(), annot=True)

In [11]:
sc = StandardScaler()
X_scaled = sc.fit_transform(X_train)
X_test_scaled = sc.transform(X_test)

In [12]:
X_scaled.shape

(1640, 5)

In [13]:
type(X_scaled)

numpy.ndarray

In [14]:
X_test_scaled.shape

(411, 5)

In [15]:
X_train.corr(),s_train['saleprice'].sort_values()

(              overall_qual  overall_cond  exter_qual  gr_liv_area  garage_area
 overall_qual      1.000000     -0.104604   -0.742899     0.561730     0.559597
 overall_cond     -0.104604      1.000000    0.192108    -0.127247    -0.137195
 exter_qual       -0.742899      0.192108    1.000000    -0.433046    -0.522284
 gr_liv_area       0.561730     -0.127247   -0.433046     1.000000     0.489103
 garage_area       0.559597     -0.137195   -0.522284     0.489103     1.000000,
 1628     12789
 183      13100
 1309     34900
 1292     35000
 1518     35311
          ...  
 151     556581
 1796    582933
 1692    584500
 1964    591587
 1671    611657
 Name: saleprice, Length: 2051, dtype: int64)

In [16]:
lr = LinearRegression()

In [17]:
lr.fit(X_train, y_train)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
lr_scores = cross_val_score(lr, X_scaled, y_train, cv=3)
lr_scores.mean()

In [None]:
lasso = LassoCV(n_alphas=30)

In [None]:
lasso.fit(X_scaled, y_train)

In [None]:
lasso.alpha_

In [None]:
np.shape(lasso.coef_)

In [None]:
lasso_scores = cross_val_score(lasso, X_test_scaled, y_test, cv=3)
lasso_scores.mean()

### Testing with Ridge Model ###

In [None]:
ridge = RidgeCV(alphas=np.linspace(.1, 10, 100))

In [None]:
ridge_scores = cross_val_score(ridge, X_train, y_train, cv=3)
ridge_scores.mean()

In [None]:
ridge.fit(X_train, y_train)

In [None]:
ridge.score(X_train, y_train)

In [None]:
ridge.score(X_test, y_test)

In [None]:
ridge_scores.mean()

In [None]:
pred = ridge.predict(X_test)
pred[:10]

In [None]:
residuals = y_test - pred

In [None]:
residuals

In [None]:
pred[:10]

In [None]:
r2_score(y_test, pred)

In [None]:
pd.Series(ridge.coef_, index=features).plot.bar(figsize=(15, 7))

In [None]:
plt.scatter(pred, residuals)

In [None]:
ridge_df = pd.DataFrame([ridge.coef_, features], index = ['Coefficients', 'Features']).T
ridge_df = ridge_df.set_index('Features')
ridge_df['Coefficients'] = np.abs(ridge_df['Coefficients'])
ridge_df = ridge_df.sort_values(by='Coefficients',ascending=False)
ridge_df.head(10)

In [None]:
plt.barh(y=ridge_df.index[:10], width=ridge_df['Coefficients'][0:10]);
plt.title('Major Contributing Features')
plt.xlabel('Strenth of Predictor')

In [None]:
pred = ridge.predict(X_test)
residuals = y_test - pred
residuals[:5]  #look at the first fiveID

In [None]:
plt.hist(residuals);
plt.title('Distribution of Residuals')

In [None]:
sample_prices = y_test.values[0]
np.log(sample_prices)

In [None]:
plt.hist(np.log(y_test));
plt.title('Transformed Residuals')

In [None]:
#interdcept 182000
#slope for quality is 19500
plt.scatter(x=pred,y=residuals)
plt.xlabel('Predicted Home Prices')
plt.ylabel('Residuals')
plt.title('Comparing Predicted Prices with Residuals')

# Now with Test data #

In [None]:
s_test.shape

In [None]:
X_test_data.head()

In [None]:
X_test_data.corr(),s_train['saleprice'].sort_values()

In [None]:
test_pred = ridge.predict(X_test_data)

In [None]:
test_pred[:10]

In [None]:
s_test['saleprice'] = test_pred

In [None]:
s_test.head()

In [None]:
y_test_data = s_test['saleprice']

In [None]:
# Scale test set
X_td_scaled = sc.transform(X_test_data)
# y_td_scaled = sc.transform(y_test_data)

In [None]:
X_td_scaled.shape

In [None]:
ridge = RidgeCV(alphas=np.linspace(.1, 10, 100)).fit(X_td_scaled, y_test_data)
ridge

In [None]:
ridge_scores = cross_val_score(ridge, X_train, y_train, cv=3)
ridge_scores.mean()

In [None]:
ridge.fit(X_train, y_train)

In [None]:
ridge.score(X_train, y_train)

In [None]:
ridge.score(X_test, y_test)

In [None]:
ridge.score(X_td_scaled, y_test_data)

In [None]:
s_test.columns

In [None]:
s_test.rename(columns={'id': 'ID', 'saleprice': 'SalePrice'}, errors="raise", inplace=True)

In [None]:
submission = s_test[['ID', 'SalePrice']]

In [None]:
submission.to_csv('datasets/submission_2.csv', index=False)