In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, PowerTransformer
from sklearn.preprocessing import Imputer

import copy
import argparse

%matplotlib inline

In [4]:
# Load the train/test CSVs. The test file is read twice; `ames_test_1` is a
# second, independent copy of it.
ames_train, ames_test, ames_test_1 = (
    pd.read_csv(path)
    for path in ('./datasets/train.csv', './datasets/test.csv', './datasets/test.csv')
)
# Keep the test ids aside as a one-column frame for building submissions later.
test_id = ames_test['Id'].to_frame()
for frame in (ames_train, ames_test):
    print(frame.shape)

(2051, 81)
(879, 80)


In [None]:
# Interaction feature: 'Total Bsmt SF' x '1st Flr SF', added to train and test alike.
for frame in (ames_train, ames_test):
    frame['base_1st'] = frame['Total Bsmt SF'] * frame['1st Flr SF']

In [None]:
# Interaction feature: 'Overall Qual' x 'Year Built', added to train and test alike.
for frame in (ames_train, ames_test):
    frame['quality_year'] = frame['Overall Qual'] * frame['Year Built']

In [None]:
# Interaction feature: 'Overall Qual' x '1st Flr SF', added to train and test alike.
for frame in (ames_train, ames_test):
    frame['quality_sq_ft'] = frame['Overall Qual'] * frame['1st Flr SF']

In [None]:
# Interaction feature: 'Garage Area' x 'Garage Cars', added to train and test alike.
for frame in (ames_train, ames_test):
    frame['garage_area_cars'] = frame['Garage Area'] * frame['Garage Cars']

In [None]:
# 'Neighborhood' x 'Overall Qual', added to train and test alike.
# NOTE(review): 'Neighborhood' looks like a text column (it is missing from the
# numeric correlation table in this notebook). If so, `*` repeats each name
# 'Overall Qual' times instead of producing a numeric interaction — confirm
# that this is the intended feature.
ames_train['qual_neig'] = ames_train['Neighborhood'] * ames_train['Overall Qual']
ames_test['qual_neig'] = ames_test['Neighborhood'] * ames_test['Overall Qual']

In [None]:
# Squared term for 'Overall Qual', added to train and test alike.
for frame in (ames_train, ames_test):
    frame['Quality_sq'] = frame['Overall Qual'] * frame['Overall Qual']

In [None]:
# Interaction feature: 'Gr Liv Area' x 'Overall Qual', added to train and test alike.
for frame in (ames_train, ames_test):
    frame['GLA_OQ'] = frame['Gr Liv Area'] * frame['Overall Qual']

In [None]:
# Squared term for 'Gr Liv Area', added to train and test alike.
for frame in (ames_train, ames_test):
    frame['GLA_SQ'] = frame['Gr Liv Area'] * frame['Gr Liv Area']

In [None]:
# Interaction feature: 'Year Built' x 'Year Remod/Add', added to train and test alike.
for frame in (ames_train, ames_test):
    frame['Yr_brm'] = frame['Year Built'] * frame['Year Remod/Add']

In [None]:
# Interaction feature: 'Gr Liv Area' x 'Full Bath', added to train and test alike.
for frame in (ames_train, ames_test):
    frame['GLA_Bath'] = frame['Gr Liv Area'] * frame['Full Bath']

In [None]:
# Impute the test set in two passes: numeric gaps get their column mean, and
# whatever is still missing afterwards gets an explicit placeholder label.
# numeric_only=True spells out what the mean is taken over; the frame mixes
# numeric and text columns, and pandas >= 2.0 raises on mean() otherwise.
# NOTE(review): ames_train is not imputed here, so the dropna() inside
# clean_data still discards training rows with gaps — confirm that is intended.
ames_test.fillna(ames_test.mean(numeric_only=True), inplace=True)
ames_test.fillna('no_value_reported', inplace=True)

In [None]:
# Display the test-set column index.
ames_test.columns

In [14]:
# Feature columns used for the current modelling run.
tested_col = ['Overall Qual', 'Alley']

# Currently disabled candidate columns:
#     'Year Built', 'Gr Liv Area', 'Neighborhood', 'GLA_SQ', 'Quality_sq',
#     'GLA_OQ', 'quality_year', 'quality_sq_ft', 'MS Zoning'


In [6]:
# Rank the numeric features by their correlation with SalePrice.
# Numeric columns are selected explicitly because DataFrame.corr() raises on
# pandas >= 2.0 when the frame still contains text columns.
ames_train.select_dtypes(include='number').corr()[['SalePrice']].sort_values('SalePrice', ascending=False)

Unnamed: 0,SalePrice
SalePrice,1.0
Overall Qual,0.800207
Gr Liv Area,0.697038
Garage Area,0.65027
Garage Cars,0.64822
Total Bsmt SF,0.628925
1st Flr SF,0.618486
Year Built,0.571849
Year Remod/Add,0.55037
Full Bath,0.537969


In [18]:
def dropped_cols(df, tested_col):
    """Return the columns of ``df`` that are not listed in ``tested_col``.

    The result keeps the column order of ``df``.
    """
    return [name for name in df if name not in tested_col]


def dummy_vars_list(df, tested_cols):
    """Return the entries of ``tested_cols`` whose column in ``df`` has dtype 'object'.

    The result keeps the order of ``tested_cols``.
    """
    return [name for name in tested_cols if df[name].dtype.name == 'object']
    


def get_dummies(train, test, columns, drop_first=True,
                inplace=False):
    """Dummy-encode ``columns`` consistently across a train and a test frame.

    For each column the levels are the union of the non-missing values seen in
    either frame, so both frames end up with the same dummy columns, named
    ``"<column>_is_<level>"`` and holding booleans. The source columns are
    dropped afterwards.

    Parameters
    ----------
    train, test : DataFrame
        Frames to encode; both must contain every entry of ``columns``.
    columns : list
        Names of the columns to encode.
    drop_first : bool, default True
        Skip the first level (in sorted order) of every column.
    inplace : bool, default False
        When False, deep copies are encoded and the inputs are left untouched;
        when True, the passed frames are modified directly.

    Returns
    -------
    tuple
        ``(train, test)`` after encoding.
    """
    if not inplace:
        train = copy.deepcopy(train)
        test = copy.deepcopy(test)

    for column in columns:
        # Missing values are not a level: NaN never compares equal to anything,
        # and sorting NaN together with strings raises TypeError.
        train_levels = set(train[column].dropna())
        test_levels = set(test[column].dropna())
        all_levels = sorted(train_levels | test_levels)
        if drop_first:
            all_levels = all_levels[1:]
        for level in all_levels:
            dummy_name = "%s_is_%s" % (column, level)
            train[dummy_name] = (train[column] == level)
            test[dummy_name] = (test[column] == level)
    train.drop(columns=columns, inplace=True)
    test.drop(columns=columns, inplace=True)

    # The return value only matters for inplace=False; with inplace=True these
    # are the caller's own frames.
    return (train, test)

def check_compatibility(train, test):
    """Return True when both frames have equally many columns and every
    column of ``train`` also exists in ``test``; False otherwise."""
    same_width = len(train.columns) == len(test.columns)
    return same_width and all(name in test.columns for name in train.columns)


def clean_data(train, test, columns_to_drop,
               columns_for_dummies, remove_na=True,
               drop_first=True, inplace=False):
    """Drop unwanted columns, optionally drop rows with missing values, then
    dummy-encode the requested columns on both frames.

    Parameters
    ----------
    train, test : DataFrame
        Frames to clean.
    columns_to_drop : list
        Columns removed from both frames.
    columns_for_dummies : list
        Columns handed to ``get_dummies`` for encoding.
    remove_na : bool, default True
        Drop every row that contains a missing value, in both frames.
    drop_first : bool, default True
        Forwarded to ``get_dummies``.
    inplace : bool, default False
        When False the inputs are deep-copied first and left untouched.

    Returns
    -------
    tuple
        ``(train, test)`` as returned by ``get_dummies``.
    """
    if not inplace:
        train, test = copy.deepcopy(train), copy.deepcopy(test)

    # From here on the frames are mutated directly: with inplace=False they
    # are already private copies, with inplace=True the caller asked for it.
    for frame in (train, test):
        frame.drop(columns=columns_to_drop, inplace=True)

    if remove_na:
        for frame in (train, test):
            frame.dropna(inplace=True)

    return get_dummies(train, test,
                       columns=columns_for_dummies,
                       drop_first=drop_first,
                       inplace=inplace)


###################

def patrick_clean_data(train_df, test_df, variables, remove_na=True, drop_first=True, inplace=False):
    """Reduce both frames to ``variables``, clean them, and split off the target.

    Columns of ``test_df`` that are not in ``variables`` are dropped from both
    frames, and the object-dtype entries of ``variables`` are dummy-encoded via
    ``clean_data``. The one column left in the cleaned train frame that the
    cleaned test frame lacks is taken as the target.

    Parameters
    ----------
    train_df, test_df : DataFrame
        Raw train and test frames.
    variables : list
        Feature columns to keep; all must exist in ``test_df``.
    remove_na, drop_first, inplace
        Forwarded to ``clean_data``.

    Returns
    -------
    tuple
        ``(new_train, new_test, y)``: the feature frames and the target series.

    Raises
    ------
    ValueError
        If the cleaned train frame does not have exactly one column that is
        absent from the cleaned test frame.
    """
    columns_to_drop = dropped_cols(test_df, variables)
    dummy_columns = dummy_vars_list(test_df, variables)
    new_train, new_test = clean_data(train_df, test_df,
                                     columns_to_drop,
                                     dummy_columns,
                                     remove_na=remove_na,
                                     drop_first=drop_first,
                                     inplace=inplace)

    # The target is the single train-only column. Zero or several candidates
    # would otherwise surface as an unbound `y` or a silently wrong target.
    target_cols = [col for col in new_train.columns if col not in new_test.columns]
    if len(target_cols) != 1:
        raise ValueError(
            "expected exactly one train-only (target) column, found %d: %s"
            % (len(target_cols), target_cols))
    y_col_name = target_cols[0]
    y = new_train[y_col_name]
    new_train = new_train.drop(y_col_name, axis=1)

    assert check_compatibility(new_train, new_test)

    return new_train, new_test, y

In [19]:
# Build the model inputs for the selected columns and report their shapes
# (test features, train features, target).
new_train, new_test, y = patrick_clean_data(ames_train, ames_test, tested_col)
for part in (new_test, new_train, y):
    print(part.shape)


(58, 2)
(140, 2)
(140,)


In [20]:
# Display the feature columns that remain after cleaning and dummy encoding.
new_train.columns

Index(['Overall Qual', 'Alley_is_Pave'], dtype='object')

In [None]:
# Spot-check the first rows of the area columns next to the engineered 'GLA_SQ'.
ames_train[['Gr Liv Area', 'GLA_SQ', 'Total Bsmt SF', '1st Flr SF']].head()


In [None]:
# Scratch list of candidate feature columns (kept as a note; not executed):
#     'Overall Qual', 'Year Built', 'Gr Liv Area', 'Neighborhood', 'GLA_SQ',
#     'Quality_sq', 'GLA_OQ', 'quality_year', 'quality_sq_ft'

In [None]:
# poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)

# # Fit and transform our X data.
# poly_train = poly.fit_transform(new_train)
# poly_test = poly.fit_transform(new_test)
# poly_train.shape

In [None]:
# poly.get_feature_names(tested_col)


In [None]:
def log_like_fn(val):
    """Natural log of ``val``, except that an input equal to 0 maps to 0."""
    return 0 if val == 0 else np.log(val)

In [None]:
# Log-transform the listed training features (zeros stay 0, see log_like_fn).
# Only columns that are actually present are transformed, so the cell does not
# raise KeyError when `tested_col` leaves some of them out.
# 'Total Bsmt SF' and '1st Flr SF' were commented out and stay untransformed.
# NOTE: not idempotent — re-running this cell applies the log a second time.
for col in ['Gr Liv Area', 'GLA_SQ', 'Quality_sq', 'GLA_OQ', 'quality_year', 'quality_sq_ft']:
    if col in new_train.columns:
        new_train[col] = new_train[col].map(log_like_fn)


In [None]:
# Log-transform the listed test features (zeros stay 0, see log_like_fn).
# Only columns that are actually present are transformed, so the cell does not
# raise KeyError when `tested_col` leaves some of them out.
# 'Total Bsmt SF' and '1st Flr SF' were commented out and stay untransformed.
# NOTE: not idempotent — re-running this cell applies the log a second time.
for col in ['Gr Liv Area', 'GLA_SQ', 'Quality_sq', 'GLA_OQ', 'quality_year', 'quality_sq_ft']:
    if col in new_test.columns:
        new_test[col] = new_test[col].map(log_like_fn)


In [None]:
# Model the log of the target (presumably SalePrice — confirm) rather than the
# raw value; predictions must be passed through np.exp before submission.
# NOTE: not idempotent — re-running this cell takes the log again.
y = np.log(y)
# new_train['Total Bsmt SF'].sort_values(ascending=False)
# new_test.fillna(new_test.mean(), inplace=True)
# new_train.fillna(new_train.mean(), inplace=True)

In [None]:
# pt_new_train = new_train[['Gr Liv Area', 'GLA_SQ', 'Total Bsmt SF', '1st Flr SF']].apply(take_log(new_train[['Gr Liv Area', 'GLA_SQ', 'Total Bsmt SF', '1st Flr SF']]))
# pt_new_test = new_test[['Gr Liv Area', 'GLA_SQ', 'Total Bsmt SF', '1st Flr SF']].apply(take_log(new_test[['Gr Liv Area', 'GLA_SQ', 'Total Bsmt SF', '1st Flr SF']]))
# y = y.map(np.log)


In [None]:
def metrics_summary(X, y, test_set, cv, k, scaled=True, poly=False, n_alphas=100,
                    out_path='wd_prediction_17.csv'):
    """Fit linear, lasso and ridge regressions, print fit metrics, save predictions.

    A plain linear regression is fitted on the raw features; linear, LassoCV
    and RidgeCV models are fitted on standardised features. Training-set
    metrics and mean cross-validation scores are printed, and the test-set
    predictions of the model with the highest mean CV score are written to
    ``out_path`` together with the ids from the notebook-level ``test_id``.

    Parameters
    ----------
    X : DataFrame
        Training features.
    y : Series
        Training target.
    test_set : DataFrame
        Test features with the same columns as ``X``.
    cv : int or CV splitter
        Passed to ``cross_val_score``.
    k : int
        Index into ``X.shape`` giving the predictor count used for adjusted
        R² (the notebook passes 1).
    scaled, poly : bool
        Accepted for backward compatibility; not used.
    n_alphas : int, default 100
        Forwarded to ``LassoCV``.
    out_path : str, default 'wd_prediction_17.csv'
        Destination of the prediction CSV.

    Returns
    -------
    tuple
        ``(prediction, ridge_pred_final, lasso_pred_final, model_pred_final)``
        where ``prediction`` is the DataFrame that was written to ``out_path``
        and the other three are the test-set predictions of each scaled model.
    """
    # Scaling our data (the scaler is fitted on the training features only)
    ss = StandardScaler()
    X_sc = ss.fit_transform(X)
    test_sc = ss.transform(test_set)

    # Instantiating our models. The raw and the scaled linear fit need their
    # own estimator: refitting one object on X_sc would overwrite the
    # coefficients learned from X and corrupt the "non-scaled" metrics.
    model_raw = LinearRegression()
    model = LinearRegression()
    lasso = LassoCV(n_alphas=n_alphas, cv=5)
    ridge = RidgeCV(cv=5)

    # Fitting our models on non-scaled and scaled data
    model_raw.fit(X, y)
    model.fit(X_sc, y)
    lasso.fit(X_sc, y)
    ridge.fit(X_sc, y)

    # Running the predictions on our training data
    model_p = model_raw.predict(X)
    model_pred = model.predict(X_sc)
    lasso_pred = lasso.predict(X_sc)
    ridge_pred = ridge.predict(X_sc)

    resids_lr = y - model_p
    resids_lr_sc = y - model_pred
    resids_lasso = y - lasso_pred
    resids_ridge = y - ridge_pred

    rss_lr = (resids_lr ** 2).sum()
    rss_lr_sc = (resids_lr_sc ** 2).sum()
    rss_lasso = (resids_lasso ** 2).sum()
    rss_ridge = (resids_ridge ** 2).sum()

    r_squared = (metrics.r2_score(y, model_p))
    adj_r2 = 1 - (1-r_squared)*((len(y)-1)/(len(y)-X.shape[k]-1))

    # Running cross value scores (cross_val_score works on clones, so the
    # estimators fitted above keep their state)
    cvs = cross_val_score(model, X_sc, y, cv=cv).mean()
    cvs_l = cross_val_score(lasso, X_sc, y, cv=cv).mean()
    cvs_r = cross_val_score(ridge, X_sc, y, cv=cv).mean()

    print(f'RSS (Residual Sum of Squares) : {(rss_lr)}')
    print(f'MAE (Mean Absolute Error) : {(metrics.mean_absolute_error(y, model_p))}')
    print(f'MSE (Mean Square Error) : {(metrics.mean_squared_error(y, model_p))}')
    print()
    print(f'Root MSE (Root Meen Square Error Linear Non-Scaled) : {(np.sqrt(rss_lr / len(model_p)))}')
    print(f'Root MSE (Root Meen Square Error Linear Scaled) : {(np.sqrt(rss_lr_sc / len(model_pred)))}')
    print(f'Root MSE (Root Meen Square Error Lasso) : {(np.sqrt(rss_lasso / len(lasso_pred)))}')
    print(f'Root MSE (Root Meen Square Error Ridge) : {(np.sqrt(rss_ridge / len(ridge_pred)))}')
    print()
    print(f'R\u00b2 : {(r_squared)}')
    print(f'Adjusted R\u00b2 : {(adj_r2)}')
    print()
    print(f'CVS (Cross Value Score LR) : {(cvs)}')
    print(f'CVS (Cross Value Score Lasso) : {(cvs_l)}')
    print(f'CVS (Cross Value Score Ridge) : {(cvs_r)}')

    # The models are already fitted on (X_sc, y); no refit is needed.
    model_pred_final = model.predict(test_sc)
    lasso_pred_final = lasso.predict(test_sc)
    ridge_pred_final = ridge.predict(test_sc)

    # Pick the model with the best mean CV score. On a tie max() keeps the
    # first candidate (linear, then lasso, then ridge), so a winner always exists.
    candidates = [
        (cvs, model_pred_final),
        (cvs_l, lasso_pred_final),
        (cvs_r, ridge_pred_final),
    ]
    _, best_pred = max(candidates, key=lambda pair: pair[0])

    # to_csv returns None when given a path, so keep the frame itself.
    prediction = pd.DataFrame({'Id': test_id['Id'], 'SalePrice': best_pred})
    prediction.to_csv(out_path, index=False)
    return prediction, ridge_pred_final, lasso_pred_final, model_pred_final


In [None]:
# Fit and score the three models on the selected features (cv=5); k=1 makes
# the adjusted R² use X.shape[1] as the predictor count.
prediction, ridge, lasso, linear = metrics_summary(new_train, y, new_test, 5, 1, scaled=True, poly=False)

In [None]:
# The models were trained on log(y), so exponentiate to undo the log.
# Note that the lasso predictions are used here regardless of which model
# scored best in cross-validation.
prediction = np.exp(lasso)

In [None]:
# Display the back-transformed lasso predictions.
np.exp(lasso)

In [None]:
# Write the submission file. to_csv returns None when given a path, so its
# result is deliberately not assigned: `prediction` keeps the predicted values
# and the cell can be re-run without writing a column of None.
pd.DataFrame({'Id': test_id['Id'], 'SalePrice': prediction}).to_csv('wd_prediction_21.csv', index=False)
