# Kaggle Submission

In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

In [26]:
# Read the Kaggle hold-out set; the trailing expression displays (rows, columns).
kagle_test_csv_path = '../data/housing_kagle_test.csv'
kagle_test_data = pd.read_csv(kagle_test_csv_path)
kagle_test_data.shape

(1459, 80)

In [27]:
def getMissingDataFeatures(df):
    """Return the names of the columns of ``df`` that hold at least one null.

    The names are returned as a plain list, in the frame's column order.
    """
    has_null_by_column = df.isnull().any(axis=0)
    return [column for column, has_null in has_null_by_column.items() if has_null]

def getNullPercentage(df, feature):
    """Count the nulls in column ``feature`` of ``df``.

    Returns a ``(null_count, percent_of_nulls)`` tuple, where the percentage
    is relative to the total number of rows in ``df``.
    """
    total_rows = len(df)
    null_count = int(df[feature].isnull().sum())
    return null_count, null_count * 100 / total_rows

def dropFeaturesWithNulls(df, lst_featrues, threshold=75):
    """Drop every listed column whose null percentage exceeds ``threshold``.

    ``df`` is modified in place and also returned. Columns whose null
    percentage is at or below ``threshold`` (a percentage, default 75) are
    kept.
    """
    for column in lst_featrues:
        _, null_percent = getNullPercentage(df, column)
        if null_percent > threshold:
            df.drop(columns=[column], inplace=True)
    return df

def getCatFeatures(df):
    """Return the column labels of ``df`` whose dtype is ``object``."""
    object_typed = df.select_dtypes(include=['object'])
    return object_typed.columns

def getCatFeaturesWithNulls(df):
    """Return the names of the object-dtype columns of ``df`` that contain nulls.

    The result is a plain list in the frame's column order.
    """
    cat_columns = getCatFeatures(df)
    has_null_by_column = df[cat_columns].isnull().any(axis=0)
    return [column for column, has_null in has_null_by_column.items() if has_null]

In [28]:
def fillNAwithBlank(df, lst_features):
    """Replace nulls with the empty string in each column named in ``lst_features``.

    ``df`` is updated in place and also returned.
    """
    for feature in lst_features:
        # Assign the filled column back instead of calling
        # ``df[feature].fillna(..., inplace=True)``: that form operates on a
        # temporary selection and does not update ``df`` under pandas
        # Copy-on-Write.
        df[feature] = df[feature].fillna('')
    return df

In [29]:
def formatCondition(lst_features):
    """Build the expression string ``'not A and not B ...'`` from feature names.

    An empty list yields the empty string.
    """
    if not lst_features:
        return ''
    return 'not ' + ' and not '.join(lst_features)

def _rowSatisfiesCondition(row, condition):
    """Evaluate a ``formatCondition``-style expression against ``row``.

    ``condition`` is a conjunction of terms joined by ``' and '``; each term is
    either ``name`` (true when ``row[name]`` is truthy) or ``not name`` (true
    when ``row[name]`` is falsy). A blank expression evaluates to False.
    """
    if not condition.strip():
        return False
    for term in condition.split(' and '):
        term = term.strip()
        negated = term.startswith('not ')
        feature = term[len('not '):].strip() if negated else term
        # A plain term needs a truthy value, a negated term needs a falsy one.
        if bool(row[feature]) == negated:
            return False
    return True


def conditionBasedImputation(row, condition, lst_features):
    """Set every feature in ``lst_features`` to ``'NA'`` when ``condition`` holds for ``row``.

    ``condition`` may be:
      * a string such as ``'not A and not B'`` (as built by ``formatCondition``),
        which is evaluated against the values of ``row``;
      * a callable taking ``row`` and returning a truthy/falsy value;
      * any other value, which is used for its truthiness.

    Previously a string condition was only tested for truthiness, so any
    non-empty expression caused every row to be overwritten with ``'NA'``.

    ``row`` is modified in place and returned.
    """
    if callable(condition):
        should_impute = bool(condition(row))
    elif isinstance(condition, str):
        should_impute = _rowSatisfiesCondition(row, condition)
    else:
        should_impute = bool(condition)

    if should_impute:
        for feature in lst_features:
            row[feature] = 'NA'
    return row

In [30]:
class FillNAandCleanUp(BaseEstimator, TransformerMixin):
    """Null handling and column clean-up for the housing data frame.

    ``fit`` learns nothing; ``transform`` works on a copy of its input and
    returns a new frame in which:

    * ``LotFrontage`` nulls are replaced by the column mean;
    * ``GarageYrBlt`` nulls are linearly interpolated and the index is reset;
    * ``1stFlrSF``/``2ndFlrSF``/``3SsnPorch`` are renamed to
      ``FstFlrSF``/``SecndFlrSF``/``ThreeSsnPorch``;
    * ``BsmtCond`` nulls become ``'TA'``;
    * ``MasVnrArea``, ``MasVnrType`` and ``FireplaceQu`` are dropped, as are
      object columns with more than 75% nulls (the ``dropFeaturesWithNulls``
      default threshold);
    * basement / garage categorical nulls become ``''`` and rows where *all*
      features of a group are blank get ``'NA'`` for that group;
    * rows with a null ``Electrical`` value are removed.
    """

    def fit(self, X, y=None):
        """No-op; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a cleaned copy of ``X`` (see the class docstring)."""
        # Work on a copy so the caller's frame is never modified.
        X = X.copy()

        X['LotFrontage'] = X['LotFrontage'].fillna(X['LotFrontage'].mean())

        # The default (linear) interpolation treats values as equally spaced
        # and ignores the index, so no temporary re-index on YearBuilt is
        # needed. The index is still reset, as before.
        # NOTE(review): interpolating by row position looks unrelated to
        # YearBuilt; confirm this is the intended imputation.
        X['GarageYrBlt'] = X['GarageYrBlt'].interpolate()
        X = X.reset_index(drop=True)

        X = X.rename(columns={'1stFlrSF': 'FstFlrSF',
                              '2ndFlrSF': 'SecndFlrSF',
                              '3SsnPorch': 'ThreeSsnPorch'})
        X['BsmtCond'] = X['BsmtCond'].fillna('TA')

        # MasVnrArea is dropped outright, so it is not imputed first.
        X = X.drop(columns=['MasVnrArea', 'MasVnrType'])
        X = dropFeaturesWithNulls(X, getCatFeaturesWithNulls(X))

        lst_features_bsmt = ['BsmtQual', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']
        lst_features_garag = ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']
        for group in (lst_features_bsmt, lst_features_garag):
            X[group] = X[group].fillna('')
            # Only rows where every feature of the group is blank get 'NA'.
            # The earlier row-wise version tested the truthiness of the
            # condition *string*, which is always true, and so overwrote
            # every row with 'NA'.
            # NOTE(review): this changes the encoded values of these columns;
            # a model fitted on the old output should be retrained.
            all_blank = (X[group] == '').all(axis=1)
            X.loc[all_blank, group] = 'NA'

        # NOTE(review): removing rows also removes them from any prediction
        # made on this frame; confirm this is wanted for submission data.
        X = X[X['Electrical'].notnull()]
        X = X.drop(columns='FireplaceQu')
        return X

In [31]:
# Single-step pipeline wrapping the null-handling / clean-up transformer.
fillna_cleanup_steps = [('fillNAandCleanUp', FillNAandCleanUp())]
fillna_cleanup_pipeline = Pipeline(steps=fillna_cleanup_steps)

In [32]:
class CatFeatureCleanUpAndEncode(BaseEstimator, TransformerMixin):
    """Restrict one column to a known category list and encode it.

    Parameters
    ----------
    lst_categories : list
        Allowed category values, in the order used for the integer codes.
    feature : str
        Name of the column to clean and encode.
    replace_val : optional
        Value written over entries that are not in ``lst_categories``
        (nulls included). A falsy value selects the default ``'OO_TH_ER'``.
        If the replacement is itself not in ``lst_categories`` the entry
        becomes a missing category (code ``-1`` when label encoding).
    lblEncode : bool, optional
        ``None`` (default) or a truthy value selects label encoding; a falsy
        non-None value selects one-hot encoding with the first category
        dropped. (Previously ``lblEncode=True`` was stored as ``False``.)
    """

    def __init__(self, lst_categories, feature, replace_val = None, lblEncode= None):
        self.lst_categories = lst_categories
        self.feature = feature
        self.replace_val = replace_val if replace_val else 'OO_TH_ER'
        # Only an explicit falsy value turns label encoding off.
        self.lblEncode = True if lblEncode is None else bool(lblEncode)

    def fit(self, X, y=None):
        """No-op; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``self.feature`` cleaned and encoded."""
        # Work on a copy so the caller's frame is never modified.
        X = X.copy()

        # Boolean mask of entries outside the allowed categories (nulls included).
        unknown = ~X[self.feature].isin(self.lst_categories)
        if unknown.any():
            X.loc[unknown, self.feature] = self.replace_val

        # Convert to categorical feature
        X[self.feature] = pd.Categorical(X[self.feature], categories=self.lst_categories)

        if self.lblEncode:
            # Do label encoding
            X[self.feature] = X[self.feature].cat.codes
        else:
            # Do One-Hot encoding
            X = pd.get_dummies(X, columns=[self.feature], prefix=[self.feature], drop_first=True)
        return X

In [33]:
# Allowed category values per categorical feature. The list order defines the
# integer codes produced by label encoding, so do not reorder entries.
# NOTE(review): several spellings (e.g. 'Names' in Neighborhood, 'WD ' with a
# trailing space in SaleType, '2FmCon'/'Duplx'/'TwnhsI' in BldgType) look like
# they may not match the values stored in the CSV -- verify with
# ``value_counts()``; unmatched values are replaced during encoding.
lst_MSZoning_cat = ['A','C','FV','I','RH','RL','RP','RM']
lst_Street_cat = ['Grvl','Pave']
lst_LotShape_cat = ['Reg','IR1','IR2','IR3']
lst_LandContour_cat = ['Lvl','Bnk','HLS','Low']
lst_Utilities_cat = ['AllPub','NoSewr','NoSeWa','ELO']
lst_LandSlope_cat = ['Gtl','Mod','Sev']
lst_Condition1_cat = ['Artery','Feedr','Norm','RRNn','RRAn','PosN','PosA','RRNe','RRAe']
lst_Condition2_cat = ['Artery','Feedr','Norm','RRNn','RRAn','PosN','PosA','RRNe','RRAe']
lst_BldgType_cat = ['1Fam','2FmCon','Duplx','TwnhsE','TwnhsI']
lst_HouseStyle_cat = ['1Story','1.5Fin','1.5Unf','2Story','2.5Fin','2.5Unf','SFoyer','SLvl']
lst_RoofStyle_cat = ['Flat','Gable','Gambrel','Hip','Mansard','Shed']
lst_RoofMatl_cat = ['ClyTile','CompShg','Membran','Metal','Roll','Tar&Grv','WdShake','WdShngl']
lst_Exterior1st_cat = ['AsbShng','AsphShn','BrkComm','BrkFace','CBlock','CemntBd','HdBoard','ImStucc','MetalSd','Other','Plywood',
                       'PreCast','Stone','Stucco','VinylSd','Wd Sdng','WdShing']
lst_Exterior2nd_cat = ['AsbShng','AsphShn','BrkComm','BrkFace','CBlock','CemntBd','HdBoard','ImStucc','MetalSd','Other','Plywood',
                       'PreCast','Stone','Stucco','VinylSd','Wd Sdng','WdShing']
lst_ExterQual_cat = ['Ex','Gd','TA','Fa','Po']
lst_ExterCond_cat = ['Ex','Gd','TA','Fa','Po']
lst_Foundation_cat = ['BrkTil','CBlock','PConc','Slab','Stone','Wood']
lst_BsmtQual_cat = ['Ex','Gd','TA','Fa','Po','NA']
lst_BsmtCond_cat = ['Ex','Gd','TA','Fa','Po','NA']
lst_BsmtExposure_cat = ['Gd','Av','Mn','No','NA']
lst_BsmtFinType1_cat = ['GLQ','ALQ','BLQ','Rec','LwQ','Unf','NA']
lst_BsmtFinType2_cat = ['GLQ','ALQ','BLQ','Rec','LwQ','Unf','NA']
lst_Heating_cat = ['Floor','GasA','GasW','Grav','OthW','Wall']
lst_HeatingQC_cat = ['Ex','Gd','TA','Fa','Po']
lst_CentralAir_cat = ['N','Y']
lst_Electrical_cat = ['SBrkr','FuseA','FuseF','FuseP','Mix']
# (A stray first assignment of lst_KitchenQual_cat that held the Electrical
# values was removed; it was overwritten immediately by the line below.)
lst_KitchenQual_cat = ['Ex','Gd','TA','Fa','Po']
lst_Functional_cat = ['Typ','Min1','Min2','Mod','Maj1','Maj2','Sev','Sal']
lst_GarageType_cat = ['2Types','Attchd','Basment','BuiltIn','CarPort','Detchd','NA']
lst_GarageFinish_cat = ['Fin','RFn','Unf','NA']
lst_GarageQual_cat = ['Ex','Gd','TA','Fa','Po','NA']
lst_GarageCond_cat = ['Ex','Gd','TA','Fa','Po','NA']
lst_PavedDrive_cat = ['Y','P','N']
lst_SaleType_cat = ['WD ','CWD','VWD','New','COD','Con','ConLw','ConLI','ConLD','Oth']
lst_SaleCondition_cat = ['Normal','Abnorml','AdjLand','Alloca','Family','Partial']

# Categories of the two features that are one-hot encoded instead.
lst_LotConfig_cat = ['Inside','Corner','CulDSac','FR2','FR3']
lst_Neighborhood_cat = ['Blmngtn','Blueste','BrDale','BrkSide','ClearCr','CollgCr','Crawfor',
                        'Edwards','Gilbert','IDOTRR','MeadowV','Mitchel','Names','NoRidge',
                        'NPkVill','NridgHt','NWAmes','OldTown','SWISU','Sawyer','SawyerW',
                        'Somerst','StoneBr','Timber','Veenker']

In [34]:
# (feature, allowed categories, replacement for unknown values) for every
# label-encoded column, in pipeline order. ``None`` keeps the encoder's
# default replacement value.
label_encoded_specs = [
    ('MSZoning', lst_MSZoning_cat, 'C'),
    ('Street', lst_Street_cat, None),
    ('LotShape', lst_LotShape_cat, None),
    ('LandContour', lst_LandContour_cat, None),
    ('Utilities', lst_Utilities_cat, None),
    ('LandSlope', lst_LandSlope_cat, None),
    ('Condition1', lst_Condition1_cat, None),
    ('Condition2', lst_Condition2_cat, None),
    ('BldgType', lst_BldgType_cat, None),
    ('HouseStyle', lst_HouseStyle_cat, None),
    ('RoofStyle', lst_RoofStyle_cat, None),
    ('RoofMatl', lst_RoofMatl_cat, None),
    ('Exterior1st', lst_Exterior1st_cat, None),
    ('Exterior2nd', lst_Exterior2nd_cat, None),
    ('ExterQual', lst_ExterQual_cat, None),
    ('ExterCond', lst_ExterCond_cat, None),
    ('Foundation', lst_Foundation_cat, None),
    ('BsmtQual', lst_BsmtQual_cat, None),
    ('BsmtCond', lst_BsmtCond_cat, None),
    ('BsmtExposure', lst_BsmtExposure_cat, None),
    ('BsmtFinType1', lst_BsmtFinType1_cat, None),
    ('BsmtFinType2', lst_BsmtFinType2_cat, None),
    ('Heating', lst_Heating_cat, None),
    ('HeatingQC', lst_HeatingQC_cat, None),
    ('CentralAir', lst_CentralAir_cat, None),
    ('Electrical', lst_Electrical_cat, None),
    ('KitchenQual', lst_KitchenQual_cat, None),
    ('Functional', lst_Functional_cat, None),
    ('GarageType', lst_GarageType_cat, None),
    ('GarageFinish', lst_GarageFinish_cat, None),
    ('GarageQual', lst_GarageQual_cat, None),
    ('GarageCond', lst_GarageCond_cat, None),
    ('PavedDrive', lst_PavedDrive_cat, None),
    ('SaleType', lst_SaleType_cat, None),
    ('SaleCondition', lst_SaleCondition_cat, None),
]

# (feature, allowed categories) for the one-hot encoded columns.
one_hot_encoded_specs = [
    ('LotConfig', lst_LotConfig_cat),
    ('Neighborhood', lst_Neighborhood_cat),
]

# One encoder step per feature; each step is named 'lst_<feature>_cat'.
cat_feature_encod_pipeline = Pipeline(
    [
        ('lst_{}_cat'.format(feature), CatFeatureCleanUpAndEncode(categories, feature, replace_val))
        for feature, categories, replace_val in label_encoded_specs
    ]
    + [
        ('lst_{}_cat'.format(feature), CatFeatureCleanUpAndEncode(categories, feature, lblEncode=False))
        for feature, categories in one_hot_encoded_specs
    ]
)

In [35]:
# Clean nulls first, then encode the categorical columns.
kagle_test_data = cat_feature_encod_pipeline.transform(
    fillna_cleanup_pipeline.fit_transform(kagle_test_data)
)

In [36]:
# Show the dtypes of the columns that still contain nulls after the pipelines.
columns_with_nulls = getMissingDataFeatures(kagle_test_data)
kagle_test_data[columns_with_nulls].dtypes

BsmtFinSF1      float64
BsmtFinSF2      float64
BsmtUnfSF       float64
TotalBsmtSF     float64
BsmtFullBath    float64
BsmtHalfBath    float64
GarageCars      float64
GarageArea      float64
dtype: object

In [37]:
# Fill the remaining nulls with each column's mean. ``numeric_only=True``
# keeps this working on pandas versions where ``DataFrame.mean`` raises if
# any non-numeric column is present.
# NOTE(review): these means are computed on the data being scored; confirm
# whether the means used when fitting the model should be applied instead.
kagle_test_data = kagle_test_data.fillna(value=kagle_test_data.mean(numeric_only=True))

In [38]:
# Every column except the target and the row identifier is a model input.
excluded_columns = {'SalePrice', 'Id'}
X_features = [column for column in kagle_test_data.columns if column not in excluded_columns]
print(X_features)

['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LandSlope', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'FstFlrSF', 'SecndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', 'ThreeSsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SaleType', 'SaleCondition', 'LotConfig_Corner', 'LotConfi

In [39]:
import pickle

# load the model from disk
# NOTE: unpickling can execute arbitrary code -- only load model files that
# come from a trusted source.
filename = '../models/lm_model_housing_data.sav'
# The context manager closes the file handle once the model is loaded.
with open(filename, 'rb') as model_file:
    loaded_lm_model = pickle.load(model_file)

In [40]:
# Feature matrix as a NumPy array, columns in ``X_features`` order.
X_kagle_test = kagle_test_data.loc[:, X_features].values

In [41]:
# Predict with the unpickled model; the column order of X_kagle_test must
# match the order the model was fitted with -- TODO confirm against training.
pred_X_kagle_test = loaded_lm_model.predict(X_kagle_test)

In [42]:
# Number of predictions produced.
# NOTE(review): the recorded output shows 1432 predictions although 1459 rows
# were loaded -- confirm that a submission with fewer rows is acceptable.
pred_X_kagle_test.shape

(1432,)

In [43]:
# Turn the predictions into a single column so they can be stacked next to the Ids.
pred_X_kagle_test = pred_X_kagle_test.reshape(-1,1)

In [44]:
# Sanity check: the number of Ids should equal the number of predictions.
kagle_test_data.Id.values.shape

(1432,)

In [45]:
# Ids as a single column, matching the layout of the reshaped predictions.
kagle_test_data_Id = kagle_test_data.Id.values.reshape(-1,1)

In [46]:
# Place Ids and predictions side by side: one row per house, two columns.
submission_columns = [kagle_test_data_Id, pred_X_kagle_test]
np_submit_format = np.concatenate(submission_columns, axis=1)
np_submit_format.shape

(1432, 2)

In [47]:
# Build the submission frame; stacking turned the Ids into floats, so cast
# them back to 32-bit integers.
df_submit_format = pd.DataFrame(
    np_submit_format, columns=['Id', 'SalePrice']
).astype({'Id': np.int32})

In [48]:
# Write the submission file without the frame's index column.
df_submit_format.to_csv('../data/dj_submission.csv', index=False)

# Linear Algebraic Way of Solving an Equation

# Underfitting vs Overfitting
  * **Overfitting:** The model performs well on training data, but fails to generalize.
  <p>&nbsp;&nbsp;&nbsp;&nbsp;* Overfitting happens when the model is too complex relative to the amount of noisiness in the training data. The possible solutions are:</p>
  <p>&nbsp;&nbsp;&nbsp;&nbsp;* I) Simplify the model by selecting one with fewer parameters (e.g., a linear model rather than a high-degree polynomial model), by reducing the number of attributes in the training data, or by constraining the model (constraining a model to make it simpler and reduce the risk of overfitting is called <b>regularization</b>, which is controlled by a hyperparameter of the learning algorithm).</p>
  <p>&nbsp;&nbsp;&nbsp;&nbsp;* II) Gather more training data</p>
  <p>&nbsp;&nbsp;&nbsp;&nbsp;* III) Reduce the noise in training data (e.g., fix data errors and remove outliers).</p>
  * **Underfitting:** The model is too simple to learn the underlying structure of the data. To fix this:
  <p>&nbsp;&nbsp;&nbsp;&nbsp; I) Select a more powerful model, with more parameters.</p>
  <p>&nbsp;&nbsp;&nbsp;&nbsp; II) Feeding better features to the learning algorithm (feature engineering)</p>
  <p>&nbsp;&nbsp;&nbsp;&nbsp; III) Reducing the constraints on the model (e.g., reducing the regularization hyperparameter).</p>

# Bias-Variance Trade-Off
  * **Bias:** This part of the generalization error is due to wrong assumptions, such as assuming that the data is linear when it is actually quadratic. A high-bias model is most likely to **underfit the training data**.
  * **Variance:** This part is due to excessive sensitivity to small variations in the training data. A model with many degrees of freedom is likely to have high variance, and thus to overfit the training data.
  * **Irreducible Error:** This part is due to noise in the data. The only way to reduce it is to clean up the data (e.g., fix the data sources, such as broken sensors, or remove outliers).
<p>Increasing a model's complexity will typically increase its variance and reduce its bias. Conversely, reducing a model's complexity increases its bias and reduces its variance. This is why it is called a trade-off.</p>

# Main Challenges of Machine Learning

  * In short, we train an ML algorithm on some data, so the two things that can go wrong are a "bad algorithm" or "bad data".
  * **Insufficient Quantity of Training Data:** Most machine learning algorithms need a lot of data to work properly. Even simple problems may need thousands of examples, and complex problems such as image processing and speech recognition may need millions of examples (unless you can reuse parts of an existing model, as in deep learning).
  * **Nonrepresentative Training Data:** In order to generalize well, it is very important that your training data be representative of the new cases you want to generalize to. This is true whether you are using model-based or instance-based learning.
  <p>This is harder than it sounds: if the sample is too small you will have sampling noise (i.e., nonrepresentative data as a result of chance), but even very large samples can be nonrepresentative if the sampling method is flawed (this is called sampling bias).</p>
  <p><b>Example:</b> In the 1936 US presidential election, which pitted Landon against Roosevelt, the <b>Literary Digest</b> conducted a very large poll, sending mail to about 10 million people. It got 2.4 million answers, and predicted with high confidence that Landon would get 57% of the votes. Instead, Roosevelt won with 62% of the votes. The flaw was in the <b>Literary Digest's</b> sampling method:</p>
  <p>&nbsp;&nbsp;&nbsp;&nbsp; I) Address-gathering process: the Literary Digest used telephone directories, lists of magazine subscribers, club membership lists, and the like. <b>All of these lists tend to favor wealthier people, who are more likely to vote Republican (hence Landon).</b></p>
  <p>&nbsp;&nbsp;&nbsp;&nbsp; II) Less than 25% of the people who received the poll answered. Again, this introduces a sampling bias, by ruling out people who don't care much about politics, people who don't like the Literary Digest, and other key groups. This is a special type of sampling bias called nonresponse bias.</p>
  
  * **Poor-Quality Data:** If the training data is full of errors, outliers, and noise (e.g., due to poor-quality measurements), it will be harder for the system to detect the underlying patterns, so the system is less likely to perform well. The truth is, most data scientists spend a significant part of their time cleaning the data (e.g., discarding outliers or fixing errors manually, and ignoring or imputing rows with missing data).
  
  * **Irrelevant Features:** The system will only be capable of learning if the training data contains enough relevant features and not too many irrelevant ones. A critical part of the success of a machine learning project is coming up with a good set of features to train on. This process is called **feature engineering**, and it involves:
  <p>&nbsp;&nbsp;&nbsp;&nbsp;I) <b>Feature Selection:</b> selecting the most useful features to train on.</p>
  <p>&nbsp;&nbsp;&nbsp;&nbsp;II) <b>Feature Extraction:</b> combining existing features to produce a more useful one (dimensionality reduction can help).</p>
  <p>&nbsp;&nbsp;&nbsp;&nbsp;III) <b>Creating new features</b> by gathering new data.</p>
  