In [61]:
import pandas as pd
import numpy as np
from pprint import pprint
import matplotlib.pyplot as plt
import seaborn as sns
from collections import OrderedDict
import math

In [2]:
noisyDF= pd.read_csv('train.csv')
print(noisyDF.columns)
noisyDF.head()

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [75]:
def checkNAN(df):
    missVals= dict()
    cols= df.columns
    for col in cols:
        percentMissing= df[col].isna().sum()/len(df[col]) * 100.00
        if percentMissing > 0.00:
            missVals[col]= '{:5.4f}'.format(percentMissing)+ '%'
    return missVals

def majorMissing(df):
    missVals= dict()
    cols= df.columns
    for col in cols:
        percentMissing= df[col].isna().sum()/len(df[col]) * 100.00
        if percentMissing > 50.00:
            missVals[col]= '{:5.4f}'.format(percentMissing) + '%'
    return missVals

def missColType(df):
    missType= dict()
    cols= df.columns
    missVals= checkNAN(df)
    for key in missVals.keys():
        missType[key]= df[key].dtype
    return missType   

def graphDiscrete(df, col):
    years= list(pd.unique(df[col]))
    for year in years:
        if math.isnan(year):
           #remove nan from years 
    vals= OrderedDict([(year,0) for year in years])
    for year in df[col]:
        if math.isnan(float(year)):
            vals[year]+=1
    return vals

SyntaxError: invalid syntax (<ipython-input-75-73814b38a94e>, line 29)

In [26]:
pprint(majorMissing(noisyDF))

{'Alley': '93.7671',
 'Fence': '80.7534',
 'MiscFeature': '96.3014',
 'PoolQC': '99.5205'}


## So we have 4 features which are missing more than 50% of data. I will drop these features from the dataset. 

In [27]:
noisyDF= noisyDF.drop(['Alley','Fence','MiscFeature','PoolQC'], axis= 1)

## Now that we have got rid of major missing features, we fix other features with missing data

In [33]:
pprint(checkNAN(noisyDF))

{'BsmtCond': '2.5342%',
 'BsmtExposure': '2.6027%',
 'BsmtFinType1': '2.5342%',
 'BsmtFinType2': '2.6027%',
 'BsmtQual': '2.5342%',
 'Electrical': '0.0685%',
 'FireplaceQu': '47.2603%',
 'GarageCond': '5.5479%',
 'GarageFinish': '5.5479%',
 'GarageQual': '5.5479%',
 'GarageType': '5.5479%',
 'GarageYrBlt': '5.5479%',
 'LotFrontage': '17.7397%',
 'MasVnrArea': '0.5479%',
 'MasVnrType': '0.5479%'}


## FireplaceQu is missing 47% of data. So we will drop this one too.

In [34]:
noisyDF= noisyDF.drop(['FireplaceQu'], axis= 1)

In [35]:
pprint(checkNAN(noisyDF))

{'BsmtCond': '2.5342%',
 'BsmtExposure': '2.6027%',
 'BsmtFinType1': '2.5342%',
 'BsmtFinType2': '2.6027%',
 'BsmtQual': '2.5342%',
 'Electrical': '0.0685%',
 'GarageCond': '5.5479%',
 'GarageFinish': '5.5479%',
 'GarageQual': '5.5479%',
 'GarageType': '5.5479%',
 'GarageYrBlt': '5.5479%',
 'LotFrontage': '17.7397%',
 'MasVnrArea': '0.5479%',
 'MasVnrType': '0.5479%'}


## To fix the missing features, we need to first check the data types of the features.

In [47]:
pprint(missColType(noisyDF))

{'BsmtCond': dtype('O'),
 'BsmtExposure': dtype('O'),
 'BsmtFinType1': dtype('O'),
 'BsmtFinType2': dtype('O'),
 'BsmtQual': dtype('O'),
 'Electrical': dtype('O'),
 'GarageCond': dtype('O'),
 'GarageFinish': dtype('O'),
 'GarageQual': dtype('O'),
 'GarageType': dtype('O'),
 'GarageYrBlt': dtype('float64'),
 'LotFrontage': dtype('float64'),
 'MasVnrArea': dtype('float64'),
 'MasVnrType': dtype('O')}


## The classification of datatypes for the missing features are:
### Numerical :
1. GarageYrBuilt
2. LotFrontage
3. MasVnrArea

### Categorical:
1. BsmtCond
2. BsmtExposure
3. BsmtFinType1
4. BsmtFinType2
5. BsmtQual
6. Electrical
7. GarageCond
8. GarageFinish
9. GarageQual
10. GarageType
11. MasVnrType

## Handling Numerical Data
Looking at if the numerical data is continuous or discrete

In [55]:
print(list(pd.unique(noisyDF['GarageYrBlt'])))

[2003.0, 1976.0, 2001.0, 1998.0, 2000.0, 1993.0, 2004.0, 1973.0, 1931.0, 1939.0, 1965.0, 2005.0, 1962.0, 2006.0, 1960.0, 1991.0, 1970.0, 1967.0, 1958.0, 1930.0, 2002.0, 1968.0, 2007.0, 2008.0, 1957.0, 1920.0, 1966.0, 1959.0, 1995.0, 1954.0, 1953.0, nan, 1983.0, 1977.0, 1997.0, 1985.0, 1963.0, 1981.0, 1964.0, 1999.0, 1935.0, 1990.0, 1945.0, 1987.0, 1989.0, 1915.0, 1956.0, 1948.0, 1974.0, 2009.0, 1950.0, 1961.0, 1921.0, 1900.0, 1979.0, 1951.0, 1969.0, 1936.0, 1975.0, 1971.0, 1923.0, 1984.0, 1926.0, 1955.0, 1986.0, 1988.0, 1916.0, 1932.0, 1972.0, 1918.0, 1980.0, 1924.0, 1996.0, 1940.0, 1949.0, 1994.0, 1910.0, 1978.0, 1982.0, 1992.0, 1925.0, 1941.0, 2010.0, 1927.0, 1947.0, 1937.0, 1942.0, 1938.0, 1952.0, 1928.0, 1922.0, 1934.0, 1906.0, 1914.0, 1946.0, 1908.0, 1929.0, 1933.0]


In [74]:
graphDiscrete(noisyDF, 'GarageYrBlt')

KeyError: nan