In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline

In [2]:
#for warnings about default vaues being changed in the future
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Load the cleaned training frame (ames) and the cleaned hold-out frame (test).
ames, test = (
    pd.read_csv(csv_path) for csv_path in ('cleaned.csv', 'cleaned_test.csv')
)

In [4]:
# Remove the 'Unnamed: 0' column (presumably an index saved with the CSV).
# Rebinding with errors='ignore' instead of an in-place drop keeps this cell
# re-runnable: running it a second time no longer raises KeyError.
ames = ames.drop(columns=['Unnamed: 0'], errors='ignore')
test = test.drop(columns=['Unnamed: 0'], errors='ignore')

In [44]:
def reformat(df):
    """
    Adds all engineered features to the dataframe, in place. Returns None.

    Columns added (in this order): 'Most Recent', 'Age', 'Has Garage',
    'Has Fireplace', 'Has Basement', 'Has Pool', 'Util', 'SevSlope',
    'Has Porch', 'Porch SF', 'WdMatl', 'GasH', 'Tennis', 'Fin2', 'Wood Deck',
    'Has Fence', 'Recession', 'Has Remod', 'Unf Bsmt', 'Has 2', 'House Age'.

    Every feature is computed column-wise, so the function works for any
    index (including non-unique labels) and no longer looks values up row by
    row.

    Fixes relative to the earlier row-loop version:
      * 'GasH' checked 'Roof Matl' for 'GasW'; it now checks 'Heating'.
      * 'House Age' appended the whole 'Yr Sold' - 'Year Built' Series once
        per row; it is now the per-row difference.
      * A Good/Bad split of 'Exter Qual' was computed but never stored in the
        frame; that dead computation was removed (no column is added for it).
    """
    def _label(mask, if_true='Yes', if_false='No'):
        # Map a boolean mask to a pair of string labels.
        return np.where(mask, if_true, if_false)

    built = df['Year Built']
    remod = df['Year Remod/Add']

    #The greater of year built or the most recent remodel
    df['Most Recent'] = built.where(built > remod, remod)

    #Age of the house divided into 7 groups based on distribution.
    #z-score with the population std (ddof=0) and a zero-variance fallback of
    #1, i.e. the same scaling StandardScaler applies, computed directly so the
    #result stays aligned with df's own index.
    #NOTE(review): the scaling is fitted on whichever frame is passed in, so
    #train and test get their own cut-offs - confirm that is intended.
    recent = df['Most Recent'].astype('float')
    spread = recent.std(ddof=0)
    if not spread > 0:
        spread = 1.0
    z = ((recent - recent.mean()) / spread).to_numpy()
    # np.select takes the first matching condition, like the if/elif chain.
    df['Age'] = np.select(
        [z >= .8, z >= .5, z >= .2, z <= -.8, z <= -.5, z <= -.2],
        ['Very New', 'New', 'Newish', 'Very Old', 'Old', 'Oldish'],
        default='Middle',
    )

    #Whether or not there's a garage
    df['Has Garage'] = _label(df['Garage Type'] != 'No Garage')

    #whether or not there's a fireplace
    df['Has Fireplace'] = _label(df['Fireplace Qu'] != 'None')

    #whether or not there's a basement
    df['Has Basement'] = _label(df['Bsmt Qual'] != 'None')

    #whether or not there's a pool
    df['Has Pool'] = _label(df['Pool QC'] != 'No Pool')

    #simplified utilities
    #NOTE(review): anything other than 'AllPub' is labelled 'Good' - the
    #labels look swapped; kept as-is, confirm intent.
    df['Util'] = _label(df['Utilities'] != 'AllPub', 'Good', 'Bad')

    #Level or severely sloped ground
    #NOTE(review): a severe slope is labelled 'Good'; kept as-is.
    df['SevSlope'] = _label(df['Land Slope'] == 'Sev', 'Good', 'Bad')

    #whether or not there's a porch (any porch/deck area is non-zero)
    porch_cols = ['Screen Porch', '3Ssn Porch', 'Enclosed Porch',
                  'Open Porch SF', 'Wood Deck SF']
    df['Has Porch'] = _label((df[porch_cols] != 0).any(axis=1))

    #porch sf if there is one otherwise 0: the largest of the porch/deck
    #areas (which is 0 when all of them are 0)
    df['Porch SF'] = df[porch_cols].max(axis=1)

    #Wd roof matl
    df['WdMatl'] = _label(df['Roof Matl'].isin(['WdShngl', 'WdShake']), 'Good', 'Bad')

    #Whether the heating is gas based
    df['GasH'] = _label(df['Heating'].isin(['GasA', 'GasW']), 'Good', 'Bad')

    #tennis court or elevator (also counts 'Othr')
    df['Tennis'] = _label(df['Misc Feature'].isin(['TenC', 'Elev', 'Othr']))

    #Whether or not there's a second finished basement area
    df['Fin2'] = _label(df['BsmtFin SF 2'] != 0)

    #whether or not there's a wooden deck
    df['Wood Deck'] = _label(df['Wood Deck SF'] != 0)

    #whether or not there's a fence
    df['Has Fence'] = _label(df['Fence'] != 'No Fence')

    #whether it's recession
    #NOTE(review): this flags sales from June onward in every year EXCEPT
    #2008; '== 2008' may have been intended. Kept as-is.
    df['Recession'] = _label((df['Yr Sold'] != 2008) & (df['Mo Sold'] >= 6))

    #whether or not there was a remodel
    df['Has Remod'] = _label(built != remod)

    #whether or not there's unfinished basement
    #NOTE(review): 'Yes' is assigned when the unfinished area is 0, which
    #reads as inverted; kept as-is.
    df['Unf Bsmt'] = _label(df['Bsmt Unf SF'] == 0)

    #whether or not there's a second floor
    df['Has 2'] = _label(df['2nd Flr SF'] != 0)

    #how old the house was when sold
    df['House Age'] = df['Yr Sold'] - built

In [6]:
def fix_types(df):
    """
    Casts columns of df in place so later steps see the dtypes they expect.

    Columns named in the module-level list features_nonpoly become floats;
    columns named in the module-level list features_dis become 'object' so
    that they can be converted to dummy variables. Returns None.
    """
    casts = ((features_nonpoly, 'float'), (features_dis, 'object'))
    for column_names, target_dtype in casts:
        for name in column_names:
            df[name] = df[name].astype(target_dtype)

In [38]:
def format_split(df_train=None, df_test=None):
    """
    Takes both the training and testing set and dummifies variables, returning
    both separately as (X_train, X_test).
    Both have to be done simultaneously so that the columns stay the same and aligned.

    df_train / df_test default to the module-level frames ames / test, looked
    up when the function is called (not when it is defined), so re-reading
    the data after this cell ran does not leave the defaults pointing at
    stale frames.

    Reads the module-level lists features_num, features_dis and features_cate
    to select columns. Only columns whose dtype get_dummies treats as
    categorical are expanded, so discrete columns are expected to have been
    cast beforehand (see fix_types).

    The input frames are not modified. The train/test marker is kept outside
    the encoded matrix, so neither returned frame carries an 'Indicator_*'
    column (previously 'Indicator_test' was left behind in both).
    """
    if df_train is None:
        df_train = ames
    if df_test is None:
        df_test = test

    # assign() returns copies, so the callers' frames stay untouched.
    combined = pd.concat([
        df_test.assign(Indicator='test'),
        df_train.assign(Indicator='train'),
    ])
    # Positional mask: the concatenated index can contain duplicate labels.
    is_train = (combined['Indicator'] == 'train').to_numpy()

    features_str = features_dis + features_cate
    #drop_first=True was taken out and it improved the model
    encoded = pd.get_dummies(combined[features_str + features_num])

    X_train = encoded[is_train]
    X_test = encoded[~is_train]

    return (X_train, X_test)

In [97]:
def build_model(X, y, model = 'LR',test = .1, rand=8):
    """
    Creates a train test split and creates the model based on which one is inputted
    Prints RMSE and cross val score
    Returns elements of the model

    Parameters:
        X, y  - feature matrix and target. y is exponentiated before the RMSE
                and the scatter plot are produced, i.e. it is treated as a log
                target.
        model - 'LR', 'Lasso', 'Ridge' or 'Elastic'. All but 'LR' scale the
                data with a StandardScaler inside a pipeline.
        test  - fraction of rows held out for evaluation.
        rand  - random_state for the split (previously ignored: the split was
                hard-coded to 8, which is still the default).

    Returns (y_test, X_test, ml): the held-out target and features and the
    fitted estimator.

    Raises ValueError for an unknown model name, before any work is done.
    """
    # Factories are lazy so validation happens before anything is built.
    factories = {
        'LR': lambda: LinearRegression(),
        'Lasso': lambda: Pipeline([
            ('sc', StandardScaler()),
            ('lasso', LassoCV(cv=7)),
        ]),  # alphas = np.logspace(-3,0,300) was tried here
        'Ridge': lambda: Pipeline([
            ('sc', StandardScaler()),
            ('ridge', RidgeCV()),
        ]),
        'Elastic': lambda: Pipeline([
            ('sc', StandardScaler()),
            ('elastic', ElasticNetCV(cv=10)),
        ]),
    }
    if model not in factories:
        raise ValueError(
            f"Unknown model {model!r}; expected one of {sorted(factories)}"
        )

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test, random_state=rand
    )

    ml = factories[model]()
    ml.fit(X_train, y_train)

    y_hat = ml.predict(X_test)
    # Residuals are measured after undoing the log transform of the target.
    resid = np.exp(y_test) - np.exp(y_hat)

    print('RMSE:',np.sqrt((resid**2).sum()/len(X_test)))
    print('cvs:', cross_val_score(ml, X_train, y_train).mean())

    sns.scatterplot(x=np.exp(y_test),y=np.exp(y_hat))
    return (y_test, X_test, ml)

In [47]:
def fit_model(X, X_pred, attempt,model='LR', save=False):
    """
    Evaluates a model with the build_model helper and optionally writes
    predictions for X_pred.

    The target is always the module-level ames['Log SalePrice'].

    When save is True a fresh estimator of the requested kind is refit on all
    of X, X_pred is predicted, and a CSV with the 'Id' column of the
    module-level test frame and the exponentiated predictions is written to
    trial{attempt}.csv.

    Returns (y_test, X_test, ml); ml is the refit estimator when save is
    True, otherwise the one fitted by build_model.
    """
    target = ames['Log SalePrice']
    (y_test, X_test, ml) = build_model(X=X, y=target, model=model)

    if not (save == True):
        return (y_test, X_test, ml)

    def _scaled(step_name, estimator):
        # Standardise the features before the regularised estimator.
        return Pipeline([('sc', StandardScaler()), (step_name, estimator)])

    builders = {
        'LR': lambda: LinearRegression(),
        'Lasso': lambda: _scaled('lasso', LassoCV(alphas = np.logspace(-3,0,100),cv=5)),
        'Ridge': lambda: _scaled('ridge', RidgeCV()),
        'Elastic': lambda: _scaled('elastic', ElasticNetCV(cv=10)),
    }
    # An unrecognised name keeps the estimator returned by build_model.
    if model in builders:
        ml = builders[model]()

    ml.fit(X, target)
    pred = ml.predict(X_pred)

    final = pd.DataFrame({'Id':test['Id'], 'SalePrice': np.exp(pred)})
    final.to_csv(f'trial{attempt}.csv', index=False)

    return (y_test, X_test, ml)

In [11]:
def create_log(df, to_log):
    """
    creates a 'Log <col>' column for each variable in to_log (in place, returns None)
    if a column contains zero adds 1 to everything to prevent -inf

    Each source column is also cast to float in place.

    The zero check is a single vectorised comparison, so it no longer walks
    the frame row by row and works when the index has duplicate labels.

    NOTE(review): the +1 shift is decided per frame, so a column containing a
    zero in one frame but not in another gets different transforms in each -
    confirm this is acceptable for train/test pairs.
    """
    for col in to_log:
        df[col] = df[col].astype('float')
        values = df[col]
        if (values == 0).any():
            values = values + 1
        df[f'Log {col}'] = np.log(values)

In [12]:
def create_poly2(df, to_poly2):
    """
    Adds a squared term '<col>^2' for every column named in to_poly2.

    Works in place and returns None; each source column is cast to float first.
    """
    for name in to_poly2:
        as_float = df[name].astype('float')
        df[name] = as_float
        df[f'{name}^2'] = as_float.mul(as_float)

In [13]:
def create_poly3(df, to_poly3):
    """
    Adds a cubed term '<col>^3' for every column named in to_poly3.

    Works in place and returns None; each source column is cast to float first.
    """
    for name in to_poly3:
        as_float = df[name].astype('float')
        df[name] = as_float
        df[f'{name}^3'] = as_float.mul(as_float).mul(as_float)

In [14]:
def create_inter(df, inter=None):
    """
    creates interaction terms '<col1> * <col2>' for every ordered pair of
    distinct columns in inter (in place, returns None)

    inter defaults to the built-in list of numeric columns below. Both
    orderings of each pair are created ('A * B' and 'B * A') because other
    cells refer to the products by either name.

    Operands are coerced with pd.to_numeric before multiplying. Columns that
    were cast to 'object' for dummy encoding (see fix_types) would otherwise
    produce object-dtype products, which pd.get_dummies would then expand
    into one indicator column per distinct value instead of keeping a single
    numeric feature. The source columns themselves are left unchanged.
    """
    if inter is None:
        inter = ['Lot Area', 'Overall Qual', 'Overall Cond',
            'Year Built', 'Year Remod/Add', 'Mas Vnr Area',
           'BsmtFin SF 1', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF',
           '1st Flr SF', '2nd Flr SF', 'Gr Liv Area', 'Bsmt Full Bath',
                 'Bsmt Half Bath', 'Full Bath', 'Half Bath',
           'Bedroom AbvGr', 'Kitchen AbvGr', 'TotRms AbvGrd', 'Fireplaces',
        'Garage Cars', 'Garage Area'
            ]

    # Convert each operand once instead of once per pair.
    numeric = {col: pd.to_numeric(df[col]) for col in inter}

    for col1 in inter:
        for col2 in inter:
            if col1 != col2:
                df[f'{col1} * {col2}'] = numeric[col1] * numeric[col2]
    
    

In [15]:
def gen_xtr_features(conti):
    """
    Generates every transformation and interaction column on the module-level
    frames ames and test, in place.

    Interaction terms are built first; then, for each column named in conti,
    the log, squared and cubed columns are added to both frames.
    """
    frames = (ames, test)

    for frame in frames:
        create_inter(frame)

    for transform in (create_log, create_poly2, create_poly3):
        for frame in frames:
            transform(frame, conti)

In [None]:
# Start from freshly loaded frames so the feature pipeline below does not
# depend on whatever earlier cells did to ames / test.
ames, test = pd.read_csv('cleaned.csv'), pd.read_csv('cleaned_test.csv')

# NOTE(review): ames2 / test2 are not used by any cell visible here.
ames2, test2 = pd.read_csv('ames2_cleaned.csv'), pd.read_csv('test2_cleaned.csv')

# Columns that receive log, squared and cubed versions (passed to
# gen_xtr_features). The interaction names must already exist by then, which
# is why create_inter runs first inside gen_xtr_features.
continuous = [
    'Mas Vnr Area',
    'BsmtFin SF 1',
    'BsmtFin SF 2',
    'Bsmt Unf SF',
    'Total Bsmt SF',
    '2nd Flr SF',
    'Garage Area',
    'Gr Liv Area',
    'Lot Area',
    '1st Flr SF',
    'Year Built',
    # interactions
    'Year Built * Garage Area',
    'Year Built * Gr Liv Area',
    'Year Built * Garage Cars',
    'Pool Area',
    'Porch SF',
]

# TODO: split age again
# TODO: garage yr?

# Numeric model features
features_num = [
    'Log Lot Area',
    'Garage Area^2',
    'BsmtFin SF 1^2',
    'Total Bsmt SF^2',
    '1st Flr SF^2',
    'Gr Liv Area',
    'Mas Vnr Area',
    'Bsmt Unf SF^2',
    '2nd Flr SF^2',
    'Year Built^2',
    'Pool Area',
    'Porch SF',
    # interactions
    'Overall Qual * Gr Liv Area',
    'Overall Qual * Garage Area',
    'Lot Area * Mas Vnr Area',
    'BsmtFin SF 1 * Gr Liv Area',
    'BsmtFin SF 1 * TotRms AbvGrd',
    'BsmtFin SF 1 * Lot Area',
    'Lot Area * Fireplaces',
    'Lot Area * Full Bath',
    'Total Bsmt SF * Full Bath',
    '1st Flr SF * Garage Cars',
    'Full Bath * Garage Area',
    'TotRms AbvGrd * Garage Area',
    'BsmtFin SF 1 * Fireplaces',
    '2nd Flr SF * Bedroom AbvGr',
    # square of the 'Year Built * Gr Liv Area' interaction column
    'Year Built * Gr Liv Area^2',
]

# Features to have outliers removed (fix_types also casts these to float)
features_nonpoly = [
    'Lot Area',
    'Garage Area',
    'Total Bsmt SF',
    '1st Flr SF',
    'Gr Liv Area',
    'Mas Vnr Area',
]

# Categorical features
features_cate = [
    'Exter Qual', 'Bsmt Exposure',
    'Has Garage', 'Has Fireplace', 'Kitchen Qual',
    'Bsmt Cond', 'Neighborhood',
    'House Style', 'Sale Type', 'Condition 1', 'Condition 2',
    'Central Air', 'Utilities', 'WdMatl',
    'Exterior 1st', 'Exterior 2nd', 'Exter Cond',
    'Garage Qual', 'Tennis', 'Functional', 'Fin2',
    'Land Contour', 'Garage Type', 'Has Mas',
    'Has Remod', 'Unf Bsmt',
    'Has 2', 'Has Basement',
    'Has Fence', 'Fence',
    'Has Pool', 'Pool QC',
    'Has Porch',
]

# Discrete features (fix_types casts these to object so they get dummified)
features_dis = [
    'Overall Qual',
    'Overall Cond',
    'MS SubClass',
    'MS Zoning',
    'Full Bath',
    'TotRms AbvGrd',
    'Bedroom AbvGr',
    'Mas Vnr Type',
    'Garage Cars',
    'Half Bath',
    'Bsmt Half Bath',
]

all_features = features_num + features_cate + features_dis

#adds new features
reformat(ames)
reformat(test)

#type changes for dummies and transformations
fix_types(ames)
fix_types(test)

#generates transformations and interactions
gen_xtr_features(continuous)

#Remove training rows lying more than 4 standard deviations from the mean,
#one column at a time (each column's mean/std is taken from the rows that
#survived the previous columns), finishing with the sale price itself.
#The lower bound is mean - 4*std; it used to be -(mean + 4*std), which is not
#"4 std below the mean".
for col in features_nonpoly + ['SalePrice']:
    mean = np.mean(ames[col])
    std = np.std(ames[col])
    thres = mean + std*4
    lower = mean - std*4
    # .copy() so later column assignments do not write into a slice of the
    # unfiltered frame.
    ames = ames[(ames[col] <= thres) & (ames[col] >= lower)].copy()

#makes dummies
ames_adj, test2_adj = format_split(ames, test)

#builds model
#NOTE(review): fit_model reads ames['Log SalePrice'], which no visible cell
#creates - presumably it is already present in cleaned.csv; confirm.
y_, X_, ml2 = fit_model(ames_adj, test2_adj,attempt='LR', model='Ridge', save=False)