In [1]:
# Import standard packages
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Let wide frames display up to 100 columns before pandas truncates them
pd.options.display.max_columns = 100

In [2]:
# Load data: read the CSV and use the PID column as the row index
fpath = "Data/ames-housing-dojo-for-ml.csv"
df = pd.read_csv(fpath)
df = df.set_index("PID")
# Define columns to use (the frame is restricted to these, in this order)
columns_to_use = [
    'SalePrice', 'Living Area Sqft', 'Lot Frontage', 'Bldg Type', 'Bedroom',
    'Total Full Baths', 'MS Zoning', 'Street', 'Alley', 'Utilities',
]
df = df[columns_to_use]
# Preview the first five rows
df.head()

Unnamed: 0_level_0,SalePrice,Living Area Sqft,Lot Frontage,Bldg Type,Bedroom,Total Full Baths,MS Zoning,Street,Alley,Utilities
PID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
907227090,119900.0,864.0,60.0,1Fam,3,1.0,RL,Pave,,AllPub
527108010,320000.0,2462.0,134.0,1Fam,4,3.0,RL,Pave,,AllPub
534275170,151500.0,958.0,,1Fam,2,1.0,RL,Pave,,AllPub
528104050,385000.0,2084.0,114.0,1Fam,2,3.0,RL,Pave,,AllPub
533206070,193800.0,1565.0,32.0,TwnhsE,2,3.0,FV,Pave,Pave,AllPub


In [3]:
# Select columns to use
columns_to_use = [
    'SalePrice', 'Living Area Sqft', 'Lot Frontage', 'Bldg Type', 'Bedroom',
    'Total Full Baths', 'MS Zoning', 'Street', 'Alley', 'Utilities',
]


# Functionizing the load data from above for .py file
def load_data(fpath="Data/ames-housing-dojo-for-ml.csv", columns=None):
    """Read the housing CSV, index it by PID, and keep only the selected columns.

    Parameters
    ----------
    fpath : str or path-like, default "Data/ames-housing-dojo-for-ml.csv"
        Location of the CSV file to read. The file must contain a "PID" column.
    columns : iterable of str, optional
        Column labels to keep, in the order given. When omitted, the
        module-level ``columns_to_use`` list is used (looked up at call time,
        so later changes to that list are honoured).

    Returns
    -------
    pandas.DataFrame
        Frame indexed by "PID" containing exactly the requested columns.

    Raises
    ------
    FileNotFoundError
        If ``fpath`` does not exist.
    KeyError
        If "PID" or any requested column is missing from the file.
    """
    if columns is None:
        columns = columns_to_use
    df = pd.read_csv(fpath).set_index("PID")
    # list(...) ensures a tuple of labels is treated as a column selection,
    # not as a single (multi-level) key
    return df[list(columns)]

In [4]:
# Verify the function works as expected: the preview should match the manual load shown earlier
df = load_data()
df.head()

Unnamed: 0_level_0,SalePrice,Living Area Sqft,Lot Frontage,Bldg Type,Bedroom,Total Full Baths,MS Zoning,Street,Alley,Utilities
PID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
907227090,119900.0,864.0,60.0,1Fam,3,1.0,RL,Pave,,AllPub
527108010,320000.0,2462.0,134.0,1Fam,4,3.0,RL,Pave,,AllPub
534275170,151500.0,958.0,,1Fam,2,1.0,RL,Pave,,AllPub
528104050,385000.0,2084.0,114.0,1Fam,2,3.0,RL,Pave,,AllPub
533206070,193800.0,1565.0,32.0,TwnhsE,2,3.0,FV,Pave,Pave,AllPub


In [5]:
# Obtain summary statistics (describe() covers the numeric columns by default), rounded to 2 decimals
df.describe().round(2)

Unnamed: 0,SalePrice,Living Area Sqft,Lot Frontage,Bedroom,Total Full Baths
count,2930.0,2930.0,2440.0,2930.0,2928.0
mean,180825.07,1499.69,69.22,2.85,2.0
std,79878.22,505.51,23.37,0.83,0.75
min,12789.0,334.0,21.0,0.0,0.0
25%,129500.0,1126.0,58.0,2.0,1.0
50%,160000.0,1442.0,68.0,3.0,2.0
75%,213500.0,1742.75,80.0,3.0,2.0
max,755000.0,5642.0,313.0,8.0,6.0


In [6]:
# In our notebook we can use df.info() to print the summary info (dtypes and non-null counts) to the console, directly below the cell
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2930 entries, 907227090 to 902201120
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   SalePrice         2930 non-null   float64
 1   Living Area Sqft  2930 non-null   float64
 2   Lot Frontage      2440 non-null   float64
 3   Bldg Type         2930 non-null   object 
 4   Bedroom           2930 non-null   int64  
 5   Total Full Baths  2928 non-null   float64
 6   MS Zoning         2930 non-null   object 
 7   Street            2930 non-null   object 
 8   Alley             198 non-null    object 
 9   Utilities         2930 non-null   object 
dtypes: float64(4), int64(1), object(5)
memory usage: 251.8+ KB


In [8]:
# Display .info()
# df.info() prints its report and returns None, so in order to display the output
# on our app we first need to capture it as a string.
# We can use an in-memory text buffer (StringIO, imported with the other packages at
# the top of the notebook) to capture the output, then call the buffer's getvalue()
# method to retrieve it.
# Create a string buffer to capture the content
buffer = StringIO()
# Write the info into the buffer instead of the console
df.info(buf=buffer)
# Retrieve the content from the buffer
summary_info = buffer.getvalue()
# What the captured string looks like in the console (raw repr, newlines shown as \n)
summary_info

"<class 'pandas.core.frame.DataFrame'>\nInt64Index: 2930 entries, 907227090 to 902201120\nData columns (total 10 columns):\n #   Column            Non-Null Count  Dtype  \n---  ------            --------------  -----  \n 0   SalePrice         2930 non-null   float64\n 1   Living Area Sqft  2930 non-null   float64\n 2   Lot Frontage      2440 non-null   float64\n 3   Bldg Type         2930 non-null   object \n 4   Bedroom           2930 non-null   int64  \n 5   Total Full Baths  2928 non-null   float64\n 6   MS Zoning         2930 non-null   object \n 7   Street            2930 non-null   object \n 8   Alley             198 non-null    object \n 9   Utilities         2930 non-null   object \ndtypes: float64(4), int64(1), object(5)\nmemory usage: 251.8+ KB\n"

In [9]:
# Count the missing values in each column
nulls = df.isnull().sum()
nulls

SalePrice              0
Living Area Sqft       0
Lot Frontage         490
Bldg Type              0
Bedroom                0
Total Full Baths       2
MS Zoning              0
Street                 0
Alley               2732
Utilities              0
dtype: int64

In [10]:
# Check data type: the per-column null counts come back as a pandas Series
type(nulls)

pandas.core.series.Series