# Imports

## Libraries

In [50]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from statsmodels.stats.proportion import proportions_ztest

## Data

In [8]:
# Directory holding the input CSV files, relative to the notebook.
# Keep the trailing slash: the file name is appended by plain string concatenation.
PATH = "../Data/"

In [12]:
# Load the training set into the working DataFrame used by every cell below.
df = pd.read_csv(PATH + "train.csv")

In [13]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(1460, 81)

In [14]:
# Preview the first five rows to sanity-check the load.
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [21]:
# Per-column non-null counts and dtypes; verbose=True requests the full column listing.
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

# Initial Data Understanding

## Making things categorical

In [23]:
# Frequency of each MSSubClass code (numeric codes that stand for dwelling types).
df["MSSubClass"].value_counts()

MSSubClass
20     536
60     299
50     144
120     87
30      69
160     63
70      60
80      58
90      52
190     30
85      20
75      16
45      12
180     10
40       4
Name: count, dtype: int64

In [25]:
# Lookup table: numeric MSSubClass code -> human-readable dwelling type.
MSSUBCLASS = {
    # Single-storey and 1-1/2 storey dwellings
    20: "1-STORY 1946 & NEWER ALL STYLES",
    30: "1-STORY 1945 & OLDER",
    40: "1-STORY W/FINISHED ATTIC ALL AGES",
    45: "1-1/2 STORY - UNFINISHED ALL AGES",
    50: "1-1/2 STORY FINISHED ALL AGES",
    # Two-storey and taller dwellings
    60: "2-STORY 1946 & NEWER",
    70: "2-STORY 1945 & OLDER",
    75: "2-1/2 STORY ALL AGES",
    # Split layouts and duplexes
    80: "SPLIT OR MULTI-LEVEL",
    85: "SPLIT FOYER",
    90: "DUPLEX - ALL STYLES AND AGES",
    # Planned Unit Developments (PUD) and conversions
    120: "1-STORY PUD (Planned Unit Development) - 1946 & NEWER",
    150: "1-1/2 STORY PUD - ALL AGES",
    160: "2-STORY PUD - 1946 & NEWER",
    180: "PUD - MULTILEVEL - INCL SPLIT LEV/FOYER",
    190: "2 FAMILY CONVERSION - ALL STYLES AND AGES",
}

In [32]:
# Add a readable dwelling-type column; any code missing from MSSUBCLASS maps to NaN.
df["Dwelling_Category"] = df["MSSubClass"].map(MSSUBCLASS)

In [34]:
# Lookup table: MSZoning code -> human-readable zoning classification.
MSZONING = {
    "A": "Agriculture",
    "C": "Commercial",
    # The Ames/Kaggle train.csv spells the commercial zone "C (all)" rather than
    # the data dictionary's "C". Both spellings are mapped so that Series.map()
    # does not silently produce NaN for those rows.
    "C (all)": "Commercial",
    "FV": "Floating Village Residential",
    "I": "Industrial",
    "RH": "Residential High Density",
    "RL": "Residential Low Density",
    "RP": "Residential Low Density Park",
    "RM": "Residential Medium Density",
}

In [38]:
# Add a readable zoning column; any code missing from MSZONING maps to NaN.
df["Zoning_Category"] = df["MSZoning"].map(MSZONING)

In [39]:
# Re-check dimensions after adding the derived category columns.
df.shape

(1460, 83)

# Statistical Testings

## One Sample Proportion Test
- Tests whether the proportion of one category in the sample matches a hypothesized proportion
- **Null Hypothesis: There is no difference** between the observed proportion and the hypothesized one

In this example, we use the `Street` column to test whether its two categories are equally represented in the sample. The categories are encoded as:

- 1 = Gravel
- 0 = Pave

In [77]:
# Count the rows in each Street category.
df["Street"].value_counts()

Street
Pave    1454
Grvl       6
Name: count, dtype: int64

In [71]:
# Hypothesized proportion under the null, passed as `value` to proportions_ztest.
# NOTE: despite its name, this is not a p-value.
p_val = 0.5

In [105]:
# One-sample z-test for a proportion:
# H0: the share of gravel streets equals the hypothesized proportion `p_val`.
gravel_count = (df["Street"] == "Grvl").sum()  # rows in the "success" (Gravel) category
n_obs = df.shape[0]                            # total number of rows in the sample

# prop_var=p_val makes the standard error use the proportion under H0 instead of
# the sample proportion. The observed share of gravel streets is very close to 0,
# where the sample-based variance estimate is unreliable.
# proportions_ztest returns (z statistic, p-value); `stat_test` keeps the p-value.
z_stat, stat_test = proportions_ztest(
    count=gravel_count,
    nobs=n_obs,
    value=p_val,
    prop_var=p_val,
)

print("P-Value: {0:0.2f}".format(stat_test))

P-Value: 0.00


With this result, we can *reject the null hypothesis* that there is no difference; in fact, **there is a significant difference**.

## Chi Square Test

## T-Test

## ANOVA

## Correlation