In [122]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.model_selection import cross_val_score, KFold

df = pd.read_csv('AmesHousing.txt',sep='\t')

train = df[:1460]
test = df[1460:]
print(train.columns)


Index(['Order', 'PID', 'MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area',
       'Street', 'Alley', 'Lot Shape', 'Land Contour', 'Utilities',
       'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1',
       'Condition 2', 'Bldg Type', 'House Style', 'Overall Qual',
       'Overall Cond', 'Year Built', 'Year Remod/Add', 'Roof Style',
       'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type',
       'Mas Vnr Area', 'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual',
       'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin SF 1',
       'BsmtFin Type 2', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF',
       'Heating', 'Heating QC', 'Central Air', 'Electrical', '1st Flr SF',
       '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Bsmt Full Bath',
       'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr',
       'Kitchen AbvGr', 'Kitchen Qual', 'TotRms AbvGrd', 'Functional',
       'Fireplaces', 'Fireplace Qu', 'Garage Type', 'Garage Yr Blt',
      

In [123]:
def transform_features(dframe):

    # dropping PID which is not useful

    newtrain = dframe
    
    newtrain = newtrain.drop('PID',axis=1)
           
    #Finding columns with percentage of missing values
    
    total = newtrain.isnull().sum().sort_values(ascending=False)
    percent = (newtrain.isnull().sum()/newtrain.isnull().count()).sort_values(ascending=False)
    missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    #print(missing_data.head(20))
    
    newtrain = newtrain.drop(['Pool QC','Misc Feature','Alley','Fence','Fireplace Qu'],axis=1)
    
     # Selecting columns in train having less values missing
    
    train_null_counts = newtrain.isnull().sum()
    df_missing_values = newtrain[train_null_counts[(train_null_counts>0) & (train_null_counts<584)].index]
    #print('df_missing_values',df_missing_values.columns)
    
    # replacing missing value in float columns with mean of other values in column.
    
    float_int_cols = df_missing_values.select_dtypes(include=['int64','int', 'float'])
    #print('float_int_cols',float_int_cols.index)
    newtrain[float_int_cols.columns] = newtrain[float_int_cols.columns].fillna(df_missing_values.mean())  
    
    #Finding columns which are not null
    
    train_null_counts = newtrain.isnull().sum()
    #print(train_null_counts)

    trainNotNull = train_null_counts[train_null_counts==0]
    columns = trainNotNull.index

    df_no_mv = newtrain[columns]
    
    # Getting the text columns and changing it to category
        
    text_cols = df_no_mv.select_dtypes(include=['object']).columns
    

    for col in text_cols:
        #print(col+":", len(train[col].unique()))
        newtrain[col] = newtrain[col].astype('category')
    
    # finding category columns having very wide number of values
    
    #for col in text_cols:
        #print(col,newtrain[col].nunique())
        
    # Finding category columns which have more than 95% values of the same type
    sameValueColumns = []
    
    for col in text_cols:
        maxValueCountShare = pd.value_counts(newtrain[col]).max()/newtrain[col].count()
        #print(col,maxValueCountShare )
        #print(col,type(maxValueCountShare))
        #print(type(0.95))
        if (maxValueCountShare > 0.95):
            #print('here')
            sameValueColumns.append(col)
    
    # Removing the columns having more than 95 % same value from the list of text columns to be padded

    #print('dropping ',sameValueColumns)
    newtrain.drop(sameValueColumns,axis=1)
    
    
    text_cols = df_no_mv.select_dtypes(include=['object']).columns
       
    
    # Padding the remaining text columns
    for col in text_cols:
        col_dummies = pd.get_dummies(newtrain[col])
        newtrain = pd.concat([newtrain,col_dummies],axis=1)
        del newtrain[col]
  
    #print(newtrain[text_cols].nunique())
    #print(text_cols)
    
    #Adding new features
    newtrain['years_until_remod'] = newtrain['Year Remod/Add'] - newtrain['Year Built']
    
    newtrain = newtrain.dropna(axis=0, how='any')
    
    return newtrain

import seaborn as sns
%matplotlib inline

def select_features(dframeTrain,dframeTest):
    
    null_series = dframeTrain.isnull().sum()
    full_cols_series = null_series[null_series == 0]
    null_cols_series = null_series[null_series != 0]
    
    #print('full_cols_series',full_cols_series)
    #print('nullcolseries',null_cols_series)
    
    # select numerical features in dataset
    numeric_cols = dframeTrain.select_dtypes(include=[np.number]).columns.tolist()
    #sns.heatmap(dframeTrain[numeric_cols])
    #return train[['Gr Liv Area','SalePrice']]
    
    ### Splitting features
    #train_features = features.loc['train'].drop('Id', axis=1).select_dtypes(include=[np.number]).values
    return numeric_cols
    
def train_and_test(cv_type):
    
    if cv_type != 'kfold' and cv_type != 'simple':
        raise Exception('Only kfold or simple validation done. Wrong input')
        
    
    columns = select_features(newtrain,test)
    print(columns)
    columns = columns.remove('SalePrice')
    kf = KFold(10, shuffle=True, random_state=1)
    model = linear_model.LinearRegression()
    mses = cross_val_score(model,train[columns],train["SalePrice"],scoring="neg_mean_squared_error", cv=kf)
    rmses = [np.sqrt(np.absolute(mse)) for mse in mses]
    avg_rmse = np.mean(rmses)
    res = [mses,rmses]
    return res
    

In [124]:
newtrain = transform_features(train)
#newtest = transform_features(test)

#select_features(newtrain,test)

res = train_and_test('kfold')

print(res)



['Order', 'MS SubClass', 'Lot Frontage', 'Lot Area', 'Overall Qual', 'Overall Cond', 'Year Built', 'Year Remod/Add', 'Mas Vnr Area', 'BsmtFin SF 1', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF', '1st Flr SF', '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Bsmt Full Bath', 'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr', 'Kitchen AbvGr', 'TotRms AbvGrd', 'Fireplaces', 'Garage Yr Blt', 'Garage Cars', 'Garage Area', 'Wood Deck SF', 'Open Porch SF', 'Enclosed Porch', '3Ssn Porch', 'Screen Porch', 'Pool Area', 'Misc Val', 'Mo Sold', 'Yr Sold', 'SalePrice', 'C (all)', 'FV', 'I (all)', 'RH', 'RL', 'RM', 'Grvl', 'Pave', 'IR1', 'IR2', 'IR3', 'Reg', 'Bnk', 'HLS', 'Low', 'Lvl', 'AllPub', 'NoSeWa', 'NoSewr', 'Corner', 'CulDSac', 'FR2', 'FR3', 'Inside', 'Gtl', 'Mod', 'Sev', 'Blmngtn', 'Blueste', 'BrDale', 'BrkSide', 'ClearCr', 'CollgCr', 'Crawfor', 'Edwards', 'Gilbert', 'Greens', 'IDOTRR', 'MeadowV', 'Mitchel', 'NAmes', 'NPkVill', 'NWAmes', 'NoRidge', 'NridgHt', 'OldTown', 'SWISU', 

ValueError: cannot label index with a null key

In [None]:
train.columns