In [27]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, PowerTransformer
from sklearn.preprocessing import Imputer
import math

import copy
import argparse

%matplotlib inline

In [28]:
ames_train = pd.read_csv('./datasets/train.csv')
ames_test = pd.read_csv('./datasets/test.csv')
ames_test_1 = pd.read_csv('./datasets/test.csv')
test_id = ames_test['Id'].to_frame()
print(ames_train.shape)
print(ames_test.shape)

(2051, 81)
(879, 80)


In [None]:
Sale = np.log(ames_train['SalePrice'])

In [29]:
Sale = (ames_train['SalePrice'])

In [30]:
ames_train['home_age'] = ames_train['Yr Sold'] - ames_train['Year Built']
ames_test['home_age'] = ames_test['Yr Sold'] - ames_test['Year Built']
ames_train['base_1st'] = ames_train['Total Bsmt SF'] * ames_train['1st Flr SF']
ames_test['base_1st'] = ames_test['Total Bsmt SF'] * ames_test['1st Flr SF']
ames_train['quality_year'] = ames_train['Overall Qual'] * ames_train['Year Built']
ames_test['quality_year'] = ames_test['Overall Qual'] * ames_test['Year Built']
ames_train['quality_sq_ft'] = ames_train['Overall Qual'] * ames_train['1st Flr SF']
ames_test['quality_sq_ft'] = ames_test['Overall Qual'] * ames_test['1st Flr SF']
ames_train['garage_area_cars'] = ames_train['Garage Area'] * ames_train['Garage Cars']
ames_test['garage_area_cars'] = ames_test['Garage Area'] * ames_test['Garage Cars']
ames_train['qual_neig'] = ames_train['Neighborhood'] * ames_train['Overall Qual']
ames_test['qual_neig'] = ames_test['Neighborhood'] * ames_test['Overall Qual']
ames_train['Quality_sq'] = ames_train['Overall Qual'] * ames_train['Overall Qual']
ames_test['Quality_sq'] = ames_test['Overall Qual'] * ames_test['Overall Qual']
ames_train['GLA_OQ'] = ames_train['Gr Liv Area'] * ames_train['Overall Qual']
ames_test['GLA_OQ'] = ames_test['Gr Liv Area'] * ames_test['Overall Qual']
ames_train['GLA_SQ'] = ames_train['Gr Liv Area'] * ames_train['Gr Liv Area']
ames_test['GLA_SQ'] = ames_test['Gr Liv Area'] * ames_test['Gr Liv Area']
ames_train['Yr_brm'] = ames_train['Year Built'] * ames_train['Year Remod/Add']
ames_test['Yr_brm'] = ames_test['Year Built'] * ames_test['Year Remod/Add']
ames_train['GLA_Bath'] = ames_train['Gr Liv Area'] * ames_train['Full Bath']
ames_test['GLA_Bath'] = ames_test['Gr Liv Area'] * ames_test['Full Bath']
ames_train['GLA_Garage'] = ames_train['Gr Liv Area'] * ames_train['garage_area_cars']
ames_test['GLA_Garage'] = ames_test['Gr Liv Area'] * ames_test['garage_area_cars']

In [31]:
# ames_train._get_numeric_data
ames_test.fillna(ames_test.mean(), inplace=True)
ames_test.fillna('no_value_reported', inplace=True)

In [32]:
tested_col = ['GLA_OQ',
            'Overall Qual',
            'Year Built',
            'Total Bsmt SF',             
            '1st Flr SF',
            'Gr Liv Area',
            'Full Bath',
            'TotRms AbvGrd',
            'Quality_sq',
            'quality_year',
            'GLA_Garage'
             
             ]         
            
    


In [33]:
log_col = ['quality_sq_ft',]  

In [34]:
def get_log_1(df):
    return df.applymap(
        lambda x: math.log(x) if x != 0 else 0)

def log_num_col(df, log_col, tested_col):
    df_select = pd.DataFrame()
    for col in tested_col:
        print(col)
        if col not in log_col:
            df_select = pd.concat([df_select, df[col]], axis=1)
        else:
            df_select[col] = df[col].apply(lambda x: math.log(x) if x != 0 else 0)
    return df_select

In [35]:
ames_train = ames_train[tested_col]
ames_test = ames_test[tested_col]
ames_train['SalePrice'] = Sale

In [36]:
def dropped_cols(df, tested_col):
    dropped_col = []
    for col in df:
        if col not in tested_col:
            dropped_col.append(col)
    return dropped_col  


def dummy_vars_list(df, tested_cols):
    dummy_vars_list = []
    for col in tested_cols:
        if df[col].dtype.name == 'object':
            dummy_vars_list.append(col)
    return dummy_vars_list            
    


def get_dummies(train, test, columns, drop_first=True,
                inplace=False):
    if not inplace:
        train = copy.deepcopy(train)
        test = copy.deepcopy(test)
        

    for column in columns:
        train_levels = set(train[column])
        test_levels = set(test[column])
        all_levels = sorted(train_levels.union(test_levels))
        if drop_first:
            all_levels = all_levels[1:]
        for level in all_levels:
            dummy_name = "%s_is_%s" % (column, level)
            train[dummy_name] = (train[column] == level) # Could be replaced with an if statement
            test[dummy_name] = (test[column] == level)
    train.drop(columns=columns, inplace=True)
    test.drop(columns=columns, inplace=True)

    
    # return only necessary if inplace=False
    return (train, test)

def check_compatibility(train, test):
    # assume compatible unless one of these checks fails
    if not (len(train.columns) == len(test.columns)):
        return False
    for column in train.columns:
        if not (column in test.columns):
            return False
    return True


def clean_data(train, test, columns_to_drop,
               columns_for_dummies, remove_na=True,
               drop_first=True, inplace=False):
    if not inplace:
        train = copy.deepcopy(train)
        test = copy.deepcopy(test)
  
    # inplace=True because if user called with inplace=False,
    # we already made a copy and aren't modifying his/her original
    train.drop(columns=columns_to_drop, inplace=True)
    test.drop(columns=columns_to_drop, inplace=True)
  

#     same reason as above for inplace=True
    if remove_na:
        train.dropna(inplace=True)
        test.dropna(inplace=True)


    (train, test) = get_dummies(train, test,
                                columns=columns_for_dummies,
                                drop_first=drop_first,
                                inplace=inplace)
    

    return (train, test)


###################

def patrick_clean_data(train_df, test_df, variables, remove_na=True, drop_first=True, inplace=False):
    columns_to_drop = dropped_cols(test_df, variables)
    dummy_columns = dummy_vars_list(test_df, variables)
    new_train, new_test = clean_data(train_df, test_df, 
                                     columns_to_drop, 
                                     dummy_columns, 
                                     remove_na=remove_na, 
                                     drop_first=drop_first, 
                                     inplace=inplace)
    
    y_col_name = []
    for col in new_train.columns:
        if col not in new_test.columns:
            y = new_train[col]
            y_col_name = col
    new_train = new_train.drop(y_col_name, axis=1)
    
    assert check_compatibility(new_train, new_test)
    
#     for column in new_test.columns: # Turn this on if I want to see what columns were made
#         print(column)

    return new_train, new_test, y

In [37]:
new_train, new_test, y = patrick_clean_data(ames_train, ames_test, tested_col)
print(new_test.shape)
print(new_train.shape)
print(y.shape)


(879, 11)
(2049, 11)
(2049,)


In [38]:
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)

# Fit and transform our X data.
poly_train = poly.fit_transform(new_train)
poly_test = poly.fit_transform(new_test)


In [79]:
poly_train.shape

(2049, 77)

In [39]:
ss = StandardScaler()
X_sc = ss.fit_transform(poly_train)
test_sc = ss.transform(poly_test)
model = LinearRegression()
lasso = LassoCV(n_alphas=1000, cv=5)    
ridge = RidgeCV(cv=5)
model.fit(X_sc, y)
lasso.fit(X_sc, y)
ridge.fit(X_sc, y)
model_pred = model.predict(X_sc)
lasso_pred = lasso.predict(X_sc)
ridge_pred = ridge.predict(X_sc)





In [53]:
lasso.coef_

array([  9085.12326496,   1386.09400698,  10327.95914961,    933.56815503,
        10757.39135303,   7740.21610679,     -0.        ,     -0.        ,
           -0.        ,   3510.66433075,      0.        , -20222.09650403,
         6035.54322087,  12246.19856847,  -2136.44750477,  -5238.09123932,
           -0.        ,  48169.26602829,      0.        ,      0.        ,
            0.        , -20973.24674768,     -0.        ,      0.        ,
            0.        ,    876.11845862,      0.        , -14195.1024952 ,
            0.        ,  -1479.46677755,     -0.        ,      0.        ,
         1318.28735171,  10168.29487057,   2868.37975096,   3305.92681281,
           -0.        ,     -0.        ,     -0.        ,      0.        ,
            0.        , -13679.82479145, -15527.05236169,      0.        ,
            0.        ,     -0.        ,  42576.27292765,      0.        ,
        -3214.93670843,    693.40354745,      0.        ,      0.        ,
       -18512.61168374,  

In [88]:
coef_df = pd.DataFrame({'variables':poly.get_feature_names(tested_col)})
coe = pd.DataFrame({'coeffieients':lasso.coef_})
coef_df = pd.concat([coef_df, coe], axis=1)
values = pd.DataFrame(poly_train, index=list(range(0,2049)), columns=coef_df['variables'])
values['SalePrice'] = Sale
values.head()

variables,GLA_OQ,Overall Qual,Year Built,Total Bsmt SF,1st Flr SF,Gr Liv Area,Full Bath,TotRms AbvGrd,Quality_sq,quality_year,...,TotRms AbvGrd Quality_sq,TotRms AbvGrd quality_year,TotRms AbvGrd GLA_Garage,Quality_sq^2,Quality_sq quality_year,Quality_sq GLA_Garage,quality_year^2,quality_year GLA_Garage,GLA_Garage^2,SalePrice
0,8874.0,6.0,1976.0,725.0,725.0,1479.0,2.0,6.0,36.0,11856.0,...,216.0,71136.0,8430300.0,1296.0,426816.0,50581800.0,140564736.0,16658270000.0,1974166000000.0,130500
1,14854.0,7.0,1996.0,913.0,913.0,2122.0,2.0,8.0,49.0,13972.0,...,392.0,111776.0,18979168.0,2401.0,684628.0,116247404.0,195216784.0,33147120000.0,5628263000000.0,220000
2,5285.0,5.0,1953.0,1057.0,1057.0,1057.0,1.0,5.0,25.0,9765.0,...,125.0,48825.0,1300110.0,625.0,244125.0,6500550.0,95355225.0,2539115000.0,67611440000.0,109000
3,7220.0,5.0,2006.0,384.0,744.0,1444.0,2.0,7.0,25.0,10030.0,...,175.0,70210.0,8086400.0,625.0,250750.0,28880000.0,100600900.0,11586660000.0,1334487000000.0,174000
4,8670.0,6.0,1900.0,676.0,831.0,1445.0,2.0,6.0,36.0,11400.0,...,216.0,68400.0,8392560.0,1296.0,410400.0,50355360.0,129960000.0,15945860000.0,1956530000000.0,138500


In [90]:
values.corr()[['SalePrice']].sort_values('SalePrice', ascending=False).head(77)

variables,SalePrice
variables,Unnamed: 1_level_1
SalePrice,1.000000
GLA_OQ quality_year,0.533429
Quality_sq^2,0.531977
Gr Liv Area Quality_sq,0.531523
GLA_OQ Overall Qual,0.531523
Quality_sq quality_year,0.530648
Overall Qual Quality_sq,0.529412
GLA_OQ Quality_sq,0.527249
TotRms AbvGrd Quality_sq,0.524336
1st Flr SF Quality_sq,0.523429


In [74]:
coef_df.head()
# coef_df.sort_values(by='coeffieients', ascending=False)

Unnamed: 0,variables,coeffieients
0,GLA_OQ,9085.123265
1,Overall Qual,1386.094007
2,Year Built,10327.95915
3,Total Bsmt SF,933.568155
4,1st Flr SF,10757.391353


In [None]:
def metrics_summary(X, y, test_set, cv, k, scaled=True, poly=False, n_alphas=100):
    
#     if poly:
#         poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
#         poly_train = poly.fit_transform(X)
#         poly_test = poly.fit_transform(new_test)
    
    # Scaling our data
    ss = StandardScaler()
    X_sc = ss.fit_transform(X)
    test_sc = ss.transform(test_set)

    # Instantiating our models
    model = LinearRegression()
    lasso = LassoCV(n_alphas=n_alphas, cv=5)    
    ridge = RidgeCV(cv=5)

    # Fitting our models on non-scaled and scaled data
    model.fit(X,y)
    model.fit(X_sc, y)
    lasso.fit(X_sc, y)
    ridge.fit(X_sc, y)

    # Running the predictions on our model
    model_p = model.predict(X)
    model_pred = model.predict(X_sc)
    lasso_pred = lasso.predict(X_sc)
    ridge_pred = ridge.predict(X_sc)

    resids_lr = y - model_p
    resids_lr_sc = y - model_pred
    resids_lasso = y - lasso_pred
    resids_ridge = y - ridge_pred

    rss_lr = (resids_lr ** 2).sum()
    rss_lr_sc = (resids_lr_sc ** 2).sum()
    rss_lasso = (resids_lasso ** 2).sum()
    rss_ridge = (resids_ridge ** 2).sum()

    r_squared = (metrics.r2_score(y, model_p))
    adj_r2 = 1 - (1-r_squared)*((len(y)-1)/(len(y)-X.shape[k]-1))

    # Running cross value scores
    cvs = cross_val_score(model, X_sc, y, cv=cv).mean()
    cvs_l = cross_val_score(lasso, X_sc, y, cv=cv).mean()
    cvs_r = cross_val_score(ridge, X_sc, y, cv=cv).mean()

    print(f'RSS (Residual Sum of Squares) : {(rss_lr)}')
    print(f'MAE (Mean Absolute Error) : {(metrics.mean_absolute_error(y, model_p))}') 
    print(f'MSE (Mean Square Error) : {(metrics.mean_squared_error(y, model_p))}')
    print()
    print(f'Root MSE (Root Meen Square Error Linear Non-Scaled) : {(np.sqrt(rss_lr / len(model_p)))}')
    print(f'Root MSE (Root Meen Square Error Linear Scaled) : {(np.sqrt(rss_lr_sc / len(model_pred)))}')
    print(f'Root MSE (Root Meen Square Error Lasso) : {(np.sqrt(rss_lasso / len(lasso_pred)))}')
    print(f'Root MSE (Root Meen Square Error Ridge) : {(np.sqrt(rss_ridge / len(ridge_pred)))}')
    print()
    print(f'R\u00b2 : {(r_squared)}')
    print(f'Adjusted R\u00b2 : {(adj_r2)}')
    print()
    print(f'CVS (Cross Value Score LR) : {(cvs)}')
    print(f'CVS (Cross Value Score Lasso) : {(cvs_l)}')
    print(f'CVS (Cross Value Score Ridge) : {(cvs_r)}')

    model.fit(X_sc, y)
    model_pred_final = model.predict(test_sc)
    lasso.fit(X_sc, y)
    lasso_pred_final = lasso.predict(test_sc)
    ridge.fit(X_sc, y)
    ridge_pred_final = ridge.predict(test_sc)

    if (cvs > cvs_l) and (cvs > cvs_r):
        prediction = pd.DataFrame({'Id': test_id['Id'], 'SalePrice': model_pred_final}).to_csv('wd_prediction_17.csv', index=False)
    elif (cvs_l > cvs) and (cvs_l > cvs_r):
        prediction = pd.DataFrame({'Id': test_id['Id'], 'SalePrice': lasso_pred_final}).to_csv('wd_prediction_17.csv', index=False)
    elif (cvs_r > cvs_l) and (cvs_r > cvs):
        prediction = pd.DataFrame({'Id': test_id['Id'], 'SalePrice': ridge_pred_final}).to_csv('wd_prediction_17.csv', index=False)
    return prediction, ridge_pred_final, lasso_pred_final, model_pred_final


In [None]:
prediction, ridge, lasso, linear = metrics_summary(new_train, y, new_test, 5, 1, scaled=True, poly=False)

In [None]:
prediction = np.exp(ridge)

In [None]:
prediction = np.exp(lasso)

In [None]:
prediction = pd.DataFrame({'Id': test_id['Id'], 'SalePrice': prediction}).to_csv('wd_prediction_28.csv', index=False)


In [None]:
prediction = np.exp(linear)