In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.preprocessing import Imputer

import copy
import argparse

%matplotlib inline

In [2]:
# Read the train and test CSVs; `ames_test_1` is a second, independent read
# of the same test file.
ames_train = pd.read_csv('./datasets/train.csv')
ames_test = pd.read_csv('./datasets/test.csv')
ames_test_1 = pd.read_csv('./datasets/test.csv')

# Report (rows, columns) for train, then test.
for frame in (ames_train, ames_test):
    print(frame.shape)

(2051, 81)
(879, 80)


In [None]:
# Interaction feature: product of 'Total Bsmt SF' and '1st Flr SF',
# added to the train frame first and then the test frame.
for frame in (ames_train, ames_test):
    frame['base_1st'] = frame['Total Bsmt SF'] * frame['1st Flr SF']

In [3]:
# Interaction feature: product of 'Overall Qual' and 'Year Built',
# added to the train frame first and then the test frame.
for frame in (ames_train, ames_test):
    frame['quality_year'] = frame['Overall Qual'] * frame['Year Built']

In [4]:
# Interaction feature: product of 'Overall Qual' and '1st Flr SF',
# added to the train frame first and then the test frame.
for frame in (ames_train, ames_test):
    frame['quality_sq_ft'] = frame['Overall Qual'] * frame['1st Flr SF']

In [5]:
# Interaction feature: product of 'Garage Area' and 'Garage Cars',
# added to the train frame first and then the test frame.
for frame in (ames_train, ames_test):
    frame['garage_area_cars'] = frame['Garage Area'] * frame['Garage Cars']

In [6]:
# Combine 'Neighborhood' with 'Overall Qual' into a single column on both frames.
# NOTE(review): if 'Neighborhood' holds text (presumably - confirm), `*` repeats
# each name 'Overall Qual' times, so the result is one text label per
# neighborhood/quality pair rather than a number.
for frame in (ames_train, ames_test):
    frame['qual_neig'] = frame['Neighborhood'] * frame['Overall Qual']

In [13]:
# Squared term: 'Overall Qual' multiplied by itself,
# added to the train frame first and then the test frame.
for frame in (ames_train, ames_test):
    frame['Quality_sq'] = frame['Overall Qual'] * frame['Overall Qual']

In [None]:
# ames_train['neig_sq_ft'] = ames_train['Neighborhood'] * ames_train['1st Flr SF']
# ames_test['neig_sq_ft'] = ames_test['Neighborhood'] * ames_test['1st Flr SF']

In [7]:
# ames_train._get_numeric_data
# Fill gaps in the test frame: numeric columns get their own column mean,
# anything still missing afterwards gets the 'no_value_reported' label.
# numeric_only=True keeps text columns out of the mean computation, which
# newer pandas versions otherwise reject with an error.
# Rebinding (instead of inplace=True) makes the cell safe to re-run.
ames_test = (
    ames_test
    .fillna(ames_test.mean(numeric_only=True))
    .fillna('no_value_reported')
)

In [None]:
# Display the column labels of the test frame.
ames_test.columns

In [14]:
# Columns kept for modelling; every other column of the test frame is dropped
# by the cleaning helpers that receive this list.
tested_col = [
    # columns as they appear in the CSV files
    'Neighborhood',
    'Overall Qual',
    'Overall Cond',
    'Year Built',
    'Year Remod/Add',
    'Total Bsmt SF',
    '1st Flr SF',
    'Gr Liv Area',
    'Full Bath',
    'TotRms AbvGrd',
    # interaction / squared features built in the cells above
    'garage_area_cars',
    'quality_year',
    'quality_sq_ft',
    'qual_neig',
    'Quality_sq',
]

In [None]:
# Rank the numeric training columns by their correlation with SalePrice,
# strongest positive first. Text columns are removed before corr() because
# newer pandas versions raise on them instead of skipping them.
(
    ames_train
    .select_dtypes(include=np.number)
    .corr()[['SalePrice']]
    .sort_values('SalePrice', ascending=False)
)

In [None]:
# corr_1 = ames_train.corr()[['SalePrice', 
#             'Overall Qual',
#             'Overall Cond', 
#             'Year Built', 
#             'Year Remod/Add', 
#             'Total Bsmt SF', 
#             '1st Flr SF',
#             'Gr Liv Area',
#             'Full Bath',
#             'TotRms AbvGrd',
#             'garage_area_cars',
#             'quality_year',
#             'quality_sq_ft']]
# corr_1.shape

In [None]:
# Correlation heatmap of the numeric training columns.
# The matrix is computed once and reused; text columns are removed first
# because newer pandas versions raise on them inside corr().
corr_matrix = ames_train.select_dtypes(include=np.number).corr()

# Mask the diagonal and the upper triangle so only the lower triangle is drawn.
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool))

fig, ax = plt.subplots(figsize=(40, 35))
sns.heatmap(corr_matrix, mask=mask, annot=True, cmap='PuOr', vmax=1, vmin=-1,
            square=False, linewidths=1.5, cbar_kws={"shrink": .5}, ax=ax);


In [None]:
# # Creating a function that makes a power variable (feature) and puts that feature in a dataframe.
# def new_variable(series_1, series_2, series_name, df):
#     df[series_name] = series_1 * series_2
#     return df

In [15]:
def dropped_cols(df, tested_col):
    """Return the labels produced by iterating ``df`` that are not in ``tested_col``.

    The result keeps the iteration order of ``df``.
    """
    return [label for label in df if label not in tested_col]


def dummy_vars_list(df, tested_cols):
    """Return the entries of ``tested_cols`` whose ``df[col]`` has dtype name 'object'.

    The result keeps the order of ``tested_cols``.
    """
    return [name for name in tested_cols if df[name].dtype.name == 'object']
    


def get_dummies(train, test, columns, drop_first=True,
                inplace=False):
    """Add matching boolean indicator columns to ``train`` and ``test``.

    For every entry of ``columns`` the levels found in either frame are pooled
    and sorted, so both frames receive the same indicator columns in the same
    order. Each indicator is named ``"<column>_is_<level>"`` and holds the
    result of ``frame[column] == level``. The source columns are dropped
    afterwards.

    Missing values are not treated as a level: they are left out of the pooled
    level set (mixing them with text would make the sort fail), and the
    indicators are filled by an equality comparison against each level.

    Parameters
    ----------
    train, test : DataFrame
        Frames to encode.
    columns : iterable of column labels
        Columns to replace with indicator columns.
    drop_first : bool, default True
        When true, the first level in sorted order gets no indicator column.
    inplace : bool, default False
        When false, work on copies and leave the inputs untouched.
        When true, the passed frames are modified.

    Returns
    -------
    (train, test) : tuple of DataFrame
        The encoded frames (the inputs themselves when ``inplace`` is true).
    """
    if not inplace:
        train = train.copy()
        test = test.copy()

    # Materialise once so a one-shot iterable survives both the loop and the drops.
    columns = list(columns)

    for column in columns:
        pooled = set(train[column].dropna()).union(test[column].dropna())
        all_levels = sorted(pooled)
        if drop_first:
            all_levels = all_levels[1:]
        for level in all_levels:
            dummy_name = "%s_is_%s" % (column, level)
            train[dummy_name] = (train[column] == level)
            test[dummy_name] = (test[column] == level)

    train.drop(columns=columns, inplace=True)
    test.drop(columns=columns, inplace=True)

    # The return value is only needed when inplace=False.
    return (train, test)

def check_compatibility(train, test):
    """Return True when both frames have the same number of columns and every
    ``train`` column label also appears in ``test``.

    Column order is not compared.
    """
    if len(train.columns) != len(test.columns):
        return False
    return all(label in test.columns for label in train.columns)


def clean_data(train, test, columns_to_drop,
               columns_for_dummies, remove_na=True,
               drop_first=True, inplace=False):
    """Drop columns, optionally drop rows with missing values, then encode.

    Parameters
    ----------
    train, test : DataFrame
        Frames to clean.
    columns_to_drop : list of column labels
        Removed from both frames.
    columns_for_dummies : list of column labels
        Passed to ``get_dummies`` as the columns to encode.
    remove_na : bool, default True
        When true, rows containing any missing value are removed from both
        frames before encoding.
    drop_first : bool, default True
        Forwarded to ``get_dummies``.
    inplace : bool, default False
        When false, the inputs are copied first and left untouched.
        When true, the passed frames are modified.

    Returns
    -------
    (train, test) : tuple of DataFrame
        The cleaned frames, as returned by ``get_dummies``.
    """
    if not inplace:
        train = train.copy()
        test = test.copy()

    # From here on `train`/`test` are either private copies or frames the
    # caller explicitly allowed us to modify, so in-place edits are safe.
    train.drop(columns=columns_to_drop, inplace=True)
    test.drop(columns=columns_to_drop, inplace=True)

    if remove_na:
        train.dropna(inplace=True)
        test.dropna(inplace=True)

    # Always encode in place: any copy that was needed has already been made
    # above, so forwarding inplace=False would only copy both frames again.
    return get_dummies(train, test,
                       columns=columns_for_dummies,
                       drop_first=drop_first,
                       inplace=True)


###################

def patrick_clean_data(train_df, test_df, variables, remove_na=True, drop_first=True, inplace=False):
    """Clean both frames down to ``variables`` and split the target off ``train_df``.

    Columns of ``test_df`` that are not in ``variables`` are dropped from both
    frames, the object-typed entries of ``variables`` are encoded as indicator
    columns, and the single column that remains in the cleaned train frame but
    not in the cleaned test frame is returned separately as the target.

    Parameters
    ----------
    train_df, test_df : DataFrame
        Raw frames; ``train_df`` must carry exactly one extra column (the target).
    variables : list of column labels
        Feature columns to keep.
    remove_na, drop_first, inplace
        Forwarded to ``clean_data``.

    Returns
    -------
    (new_train, new_test, y)
        Feature frames with matching columns, and the target column taken from
        the cleaned train frame.

    Raises
    ------
    ValueError
        If the cleaned train frame does not have exactly one column that is
        missing from the cleaned test frame.
    AssertionError
        If the two feature frames still fail ``check_compatibility``.
    """
    columns_to_drop = dropped_cols(test_df, variables)
    dummy_columns = dummy_vars_list(test_df, variables)
    new_train, new_test = clean_data(train_df, test_df,
                                     columns_to_drop,
                                     dummy_columns,
                                     remove_na=remove_na,
                                     drop_first=drop_first,
                                     inplace=inplace)

    # The target is whatever column train has that test does not. Requiring
    # exactly one avoids returning an unbound `y` (none found) or silently
    # keeping only the last candidate (several found).
    target_cols = [col for col in new_train.columns if col not in new_test.columns]
    if len(target_cols) != 1:
        raise ValueError(
            "expected exactly one train-only column to use as the target, "
            "found %d: %r" % (len(target_cols), target_cols)
        )
    y_col_name = target_cols[0]
    y = new_train[y_col_name]
    new_train = new_train.drop(y_col_name, axis=1)

    assert check_compatibility(new_train, new_test)

#     for column in new_test.columns: # Turn this on to see what columns were made
#         print(column)

    return new_train, new_test, y

In [16]:
# Build the model inputs from the raw frames and the selected feature list,
# then report the shapes of the test features, train features and target.
new_train, new_test, y = patrick_clean_data(ames_train, ames_test, tested_col)
for part in (new_test, new_train, y):
    print(part.shape)

(879, 169)
(2049, 169)
(2049,)


In [17]:
def metrics_summary(X, y, test_set, cv, k, scaled=True, output_path='wd_prediction_7.csv'):
    """Print fit metrics, pick the best of three linear models, and predict ``test_set``.

    In-sample metrics (RSS, MAE, MSE, root MSE, R², adjusted R²) come from a
    ``LinearRegression`` fitted on the unscaled ``X``. Cross-validated scores
    are computed on standardised features for ``LinearRegression``,
    ``LassoCV`` and ``RidgeCV``. The model with the highest mean
    cross-validated score is refitted on the standardised ``X`` and used to
    predict the standardised ``test_set``.

    Parameters
    ----------
    X : DataFrame or 2-D array
        Training features.
    y : Series or 1-D array
        Training target.
    test_set : DataFrame or 2-D array
        Features to predict; transformed with the scaler fitted on ``X``.
    cv : int or cross-validation splitter
        Passed to ``cross_val_score``. The folds used inside ``LassoCV`` and
        ``RidgeCV`` are fixed at 5 regardless of this value.
    k : int
        Index into ``X.shape`` used as the predictor count in the adjusted R²
        formula; pass 1 so that ``X.shape[k]`` is the number of columns.
    scaled : bool, default True
        Accepted for backward compatibility; the function does not read it.
    output_path : str, default 'wd_prediction_7.csv'
        Where the predictions are written as CSV.

    Returns
    -------
    DataFrame
        One 'SalePrice' column with the selected model's predictions; the same
        frame is written to ``output_path``.
    """
    ss = StandardScaler()
    X_sc = ss.fit_transform(X)
    test_sc = ss.transform(test_set)

    model = LinearRegression()
    lasso = LassoCV(cv=5)
    ridge = RidgeCV(cv=5)

    # In-sample diagnostics of plain least squares on the unscaled features.
    model.fit(X, y)
    model_p = model.predict(X)
    resids = y - model_p
    rss = (resids ** 2).sum()
    r_squared = metrics.r2_score(y, model_p)
    adj_r2 = 1 - (1 - r_squared) * ((len(y) - 1) / (len(y) - X.shape[k] - 1))

    cvs = cross_val_score(model, X_sc, y, cv=cv).mean()
    cvs_l = cross_val_score(lasso, X_sc, y, cv=cv).mean()
    cvs_r = cross_val_score(ridge, X_sc, y, cv=cv).mean()

    print(f'RSS (Residual Sum of Squares) : {(rss)}')
    print(f'MAE (Mean Absolute Error) : {(metrics.mean_absolute_error(y, model_p))}')
    print(f'MSE (Mean Square Error) : {(metrics.mean_squared_error(y, model_p))}')
    print(f'Root MSE (Root Meen Square Error) : {(np.sqrt(rss / len(model_p)))}')
    print(f'R\u00b2 : {(r_squared)}')
    print(f'Adjusted R\u00b2 : {(adj_r2)}')
    print(f'CVS (Cross Value Score LR) : {(cvs)}')
    print(f'CVS (Cross Value Score Lasso) : {(cvs_l)}')
    print(f'CVS (Cross Value Score Ridge) : {(cvs_r)}')

    def _rank(entry):
        # A NaN score must never win, so rank it below every real score.
        score = entry[0]
        return -np.inf if np.isnan(score) else score

    # max() keeps the first of equal scores, so a tie always resolves to a
    # model (order: linear, lasso, ridge) instead of leaving nothing selected.
    candidates = [(cvs, model), (cvs_l, lasso), (cvs_r, ridge)]
    _, best_model = max(candidates, key=_rank)

    # Only the winning model needs to be refitted on the full scaled data.
    best_model.fit(X_sc, y)
    prediction = pd.DataFrame(best_model.predict(test_sc), columns=['SalePrice'])

    # to_csv returns None when given a path, so keep the frame itself to return.
    prediction.to_csv(output_path)
    return prediction


In [18]:
# Evaluate with cv=5; k=1 selects the column count of new_train from its shape.
# The trailing semicolon keeps the notebook from echoing the return value.
metrics_summary(new_train, y, new_test, cv=5, k=1, scaled=True);

  return self.partial_fit(X, y)
  after removing the cwd from sys.path.
  """


RSS (Residual Sum of Squares) : 1147394421316.8352
MAE (Mean Absolute Error) : 16315.29914126168
MSE (Mean Square Error) : 559977755.6451124
Root MSE (Root Meen Square Error) : 23663.849129951628
R² : 0.9108232809540036
Adjusted R² : 0.9028025968035122
CVS (Cross Value Score LR) : -2.5980718928532623e+26
CVS (Cross Value Score Lasso) : 0.8818299299340783
CVS (Cross Value Score Ridge) : 0.8817498807302042
