# Exploring the dataset

In [527]:
import pandas as pd
from collections import Counter

In [528]:
# Let pandas show up to 100 rows of a frame or Series before truncating the display
pd.options.display.max_rows = 100

In [536]:
# IMPORTANT: Counter (used in a later cell) tallies every NaN as a separate key,
# so missing values are replaced with the *string* "NAN" right after loading.
# NOTE(review): filling with a string presumably turns numeric columns that
# contained NaN into object dtype -- confirm before doing arithmetic on them.
train = pd.read_csv("~/Downloads/house_prices_train.csv").fillna("NAN")

## Details of the dataset as a whole
* number of columns
* list of columns

In [545]:
# Report how many columns the training frame has, then list their names
print("%s COLUMNS" % train.shape[1])
print(train.columns)

81 COLUMNS
Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 

## Details of each column in the dataset
* unique values description
    * unique values
    * number of unique values
    * dtype of unique values

* occurrence of unique values
    * number of times each unique value appears in its column

In [538]:
def describe_uni_vals(dataframe):
    """Describes the unique values in each pd.Series or column in a pd.DataFrame

    Args
        dataframe: a pd.DataFrame

    Returns
        a pd.DataFrame indexed by the column names of `dataframe`, holding one
        row per column with:
            uni_vals: the distinct values, as returned by Series.unique()
            num_uni_vals: how many distinct values there are
            dtype: dtype of the array of distinct values
    """
    summary_columns = ["uni_vals", "num_uni_vals", "dtype"]
    col_names = []
    records = []

    for col_name in dataframe:
        uni_vals = dataframe[col_name].unique()
        col_names.append(col_name)
        records.append((uni_vals, len(uni_vals), uni_vals.dtype))

    # Build the frame once from the collected rows: DataFrame.append no longer
    # exists in pandas 2.0+, and appending inside the loop re-copied every
    # previously accumulated row on each pass.
    summary = pd.DataFrame(records, columns=summary_columns)
    summary.index = col_names
    return summary

In [539]:
# Summarise the unique values of every column, least-varied columns first
describe_uni_vals(train).sort_values("num_uni_vals")

Unnamed: 0,uni_vals,num_uni_vals,dtype
CentralAir,"[Y, N]",2,object
Street,"[Pave, Grvl]",2,object
Utilities,"[AllPub, NoSeWa]",2,object
PavedDrive,"[Y, N, P]",3,object
Alley,"[NAN, Grvl, Pave]",3,object
BsmtHalfBath,"[0, 1, 2]",3,int64
HalfBath,"[1, 0, 2]",3,int64
LandSlope,"[Gtl, Mod, Sev]",3,object
KitchenAbvGr,"[1, 2, 3, 0]",4,int64
PoolQC,"[NAN, Ex, Fa, Gd]",4,object


In [540]:
def uni_val_occurence(dataframe):
    """Counts how often each value occurs in every column of a pd.DataFrame

    Args
        dataframe: a pd.DataFrame

    Returns
        a dict mapping each column name to a pd.Series whose index holds the
        values found in that column and whose values are their counts
    """
    return {
        name: pd.Series(dict(Counter(dataframe[name])))
        for name in dataframe
    }

In [541]:
# Tally how often each value occurs in every column of the training frame
uni_val_occurence(dataframe=train)

{'1stFlrSF': 334      1
 372      1
 438      1
 480      1
 483      7
 495      1
 520      5
 525      1
 526      1
 536      1
 546      3
 551      1
 561      1
 572      2
 575      1
 576      1
 581      1
 596      1
 600      2
 605      1
 612      2
 616      5
 624      2
 625      2
 626      1
 630      9
 649      1
 658      1
 660      2
 661      1
 663      1
 664      1
 672     11
 673      1
 676      1
 679      1
 680      1
 682      1
 684      3
 686      1
 689      2
 691      2
 693      1
 694      3
 696      1
 697      2
 698      5
 702      1
 703      1
 707      2
         ..
 1932     1
 1940     2
 1944     1
 1959     1
 1966     1
 1968     1
 1973     1
 1976     1
 1980     1
 1987     1
 1992     1
 2000     2
 2018     1
 2020     2
 2028     1
 2036     1
 2042     1
 2046     1
 2053     1
 2069     2
 2073     1
 2076     1
 2084     1
 2097     1
 2110     1
 2113     1
 2117     1
 2121     1
 2129     1
 2136     1
 2156     1
 215