<div style="display: flex; background-color: #3F579F;">
    <h1 style="margin: auto; font-weight: bold; padding: 30px 30px 0px 30px;" align="center">Consumption needs of buildings - Project 4</h1>
</div>
<div style="display: flex; background-color: #3F579F; margin: auto; padding: 5px 30px 0px 30px;" >
    <h2 style="width: 100%; text-align: center; float: left;" align="center">| Modeling notebook |</h2>
</div>
<div style="display: flex; background-color: #3F579F; margin: auto; padding: 10px 30px 30px 30px;">
    <h3 style="width: 100%; text-align: center; font-size:26px; float: left;" align="center">Data Scientist course - OpenClassrooms</h3>
</div>

<div style="background-color: #506AB9;" >
    <h3 style="margin: auto; padding: 20px; color:#fff; ">1. Libraries and functions</h3>
</div>

<div style="background-color: #6D83C5;" >
    <h4 style="margin: auto; padding: 20px; color:#fff; ">1.1. Importing libraries</h4>
</div>

In [2]:
## df_analysis
import io
import gc
from math import prod

## General
import pandas as pd


from sklearn.model_selection import train_test_split


<div style="background-color: #6D83C5;" >
    <h4 style="margin: auto; padding: 20px; color:#fff; ">1.2. Functions declaration</h4>
</div>

In [3]:
def df_analysis(df, name_df, *args, **kwargs):
    """
    Print a structural report of a DataFrame (shape, NaN, duplicates, keys).

    Parameters:
    -----------------
        df (pandas.DataFrame): Dataset to analyze
        name_df (str): Dataset name, only used in the printed messages

        *args, **kwargs:
        -----------------
            columns (list): Column name(s) to test as a candidate primary key
            type_analysis (str): Level of detail of the report
                        "summarized" prints only the general figures
                        "complete" adds unique counts and the describe() statistics
                        any other value (or None) shows type, records and NaN by column

        Positional extras (*args) are accepted but not used.

    Returns:
    -----------------
        None.
        Print the analysis on the Dataset.
    """

    # Getting the variables
    columns = kwargs.get("columns", None)
    type_analysis = kwargs.get("type_analysis", None)

    SEPARATOR = "--------------------------------------------------------------------"
    STATS_COLUMNS = ["mean", "min", "25%", "50%", "75%", "max", "std"]
    ORDERING_COMPLETE = ["name", "type", "records", "unique", "# NaN", "% NaN"] + STATS_COLUMNS

    if df.empty:
        print("The", name_df, "dataset is empty. Please verify the file.")
        return

    # The memory usage is the last non-empty line written by dataframe.info()
    buf = io.StringIO()
    df.info(buf=buf)
    memory_usage = buf.getvalue().split('\n')[-2]

    nan_by_column = df.isna().sum()
    total_nan = nan_by_column.sum()
    empty_cols = df.columns[df.isna().all()].tolist()  # columns holding only NaN

    # Creating a dataset based on Type object and records by columns
    df_resume = pd.DataFrame({
        "name": list(df.columns),
        "type": [dtype.name for dtype in df.dtypes],
        "records": list(df.count()),
        "# NaN": list(nan_by_column),
        "% NaN": list(((nan_by_column / len(df.index)) * 100).round(2)),
    })

    print("\nAnalysis of", name_df, "dataset")
    print(SEPARATOR)
    print("- Dataset shape:                 ", df.shape[0], "rows and", df.shape[1], "columns")
    print("- Total of NaN values:           ", total_nan)
    print("- Percentage of NaN:             ", round((total_nan / df.size) * 100, 2), "%")
    print("- Total of full duplicates rows: ", int(df.duplicated().sum()))
    print("- Total of empty rows:           ", int(df.isna().all(axis=1).sum()))
    print("- Total of empty columns:        ", len(empty_cols))
    if len(empty_cols) == 1:
        print("  + The empty column is:         ", empty_cols)
    elif len(empty_cols) > 1:
        print("  + The empty column are:         ", empty_cols)
    print("- Unique indexes:                ", df.index.is_unique)

    if columns is not None:
        # A key is usable only if no combination of its values is repeated
        if df.duplicated(subset=columns).any():
            print("\n- The key(s):", columns, "is present multiple times in the dataframe.\n  It CANNOT be used as a primary key.")
        else:
            print("\n- The key(s):", columns, "is not present multiple times in the dataframe.\n  It CAN be used as a primary key.")

    if type_analysis == "summarized":
        print("\n")
        return

    if type_analysis == "complete":
        df_resume["unique"] = list(df.nunique())
        # reindex() guarantees the statistic columns exist even when describe()
        # does not produce them (e.g. a frame without numeric columns), and
        # rename_axis() makes the merge key independent of the columns' axis name.
        df_desc = (
            df.describe().T
            .reindex(columns=STATS_COLUMNS)
            .rename_axis("name")
            .reset_index()
        )
        df_resume = df_resume.merge(right=df_desc, on="name", how="left")
        df_resume = df_resume[ORDERING_COMPLETE]

    # `display` is injected by IPython/Jupyter; fall back to print elsewhere
    try:
        show = display
    except NameError:
        show = print

    print("\n- Type object and records by columns      (",memory_usage,")")
    print(SEPARATOR)

    # option_context restores the caller's display settings on exit, even if
    # rendering fails, instead of resetting them to the pandas defaults.
    with pd.option_context(
        "display.max_rows", None,  # show all rows
        "display.max_columns", None,  # show all columns
        "display.max_colwidth", None,  # show full width of columns
        "display.float_format", lambda x: "%.5f" % x,  # fixed 5-decimal floats
    ):
        show(df_resume.sort_values("records", ascending=False))

<div style="background-color: #506AB9;" >
    <h3 style="margin: auto; padding: 20px; color:#fff; ">2. Importing files</h3>
</div>

<div style="background-color: #6D83C5;" >
    <h4 style="margin: auto; padding: 20px; color:#fff; ">2.1. Importing and preparing files</h4>
</div>

In [4]:
# Forward slashes keep this relative path valid on Windows, Linux and macOS
data = pd.read_csv("datasets/df_cleaned_final.csv")

In [5]:
# Preview the first rows of the loaded dataset
data.head()

Unnamed: 0,NumberofFloors,NumberofBuildings,BuildingAge,PropertyGFATotal,PropertyGFAParking,PropertyGFABuilding(s),Neighborhood,renamed_PropertyType,SiteEnergyUse(kBtu),GHGEmissions(MetricTonsCO2e)
0,12,1,94,88434,0,88434,DOWNTOWN,Hotel,6981428.0,249.43
1,11,1,25,103566,15064,88502,DOWNTOWN,Hotel,8354235.0,263.51
2,41,1,52,956110,196718,759392,DOWNTOWN,Hotel,73130656.0,2061.48
3,10,1,95,61320,0,61320,DOWNTOWN,Hotel,28229320.0,1936.34
4,18,1,41,175580,62000,113580,DOWNTOWN,Hotel,14829099.0,507.7


In [19]:
# Complete report: per-column types, record and NaN counts, unique values and describe() statistics
df_analysis(data, "data", type_analysis="complete")


Analysis of data dataset
--------------------------------------------------------------------
- Dataset shape:                  1656 rows and 10 columns
- Total of NaN values:            0
- Percentage of NaN:              0.0 %
- Total of full duplicates rows:  0
- Total of empty rows:            0
- Total of empty columns:         0
- Unique indexes:                 True

- Type object and records by columns      ( memory usage: 129.5+ KB )
--------------------------------------------------------------------


Unnamed: 0,name,type,records,unique,# NaN,% NaN,mean,min,25%,50%,75%,max,std
0,NumberofFloors,int64,1656,42,0,0.0,4.13285,1.0,1.0,2.0,4.0,99.0,6.53546
1,NumberofBuildings,int64,1656,11,0,0.0,1.13164,1.0,1.0,1.0,1.0,111.0,2.91856
2,BuildingAge,int64,1656,113,0,0.0,59.22645,6.0,32.0,56.0,91.0,121.0,32.65619
3,PropertyGFATotal,int64,1656,1577,0,0.0,119192.86171,11285.0,29398.5,49289.5,105070.75,9320156.0,298038.52446
4,PropertyGFAParking,int64,1656,326,0,0.0,13321.64251,0.0,0.0,0.0,0.0,512608.0,43664.74773
5,PropertyGFABuilding(s),int64,1656,1572,0,0.0,105871.2192,3636.0,28356.0,47484.5,94479.0,9320156.0,284529.39591
6,Neighborhood,object,1656,13,0,0.0,,,,,,,
7,renamed_PropertyType,object,1656,8,0,0.0,,,,,,,
8,SiteEnergyUse(kBtu),float64,1656,1655,0,0.0,8563131.38206,11441.0,1222923.5,2512737.0,7009066.0,873923712.0,30439268.96365
9,GHGEmissions(MetricTonsCO2e),float64,1656,1583,0,0.0,186.12812,0.08,19.5275,48.84,139.35,16870.98,748.43963


<div style="background-color: #506AB9;" >
    <h3 style="margin: auto; padding: 20px; color:#fff; ">3. Splitting data</h3>
</div>

In [12]:
# Preview the first rows again before separating the features from the two targets
data.head()

Unnamed: 0,NumberofFloors,NumberofBuildings,BuildingAge,PropertyGFATotal,PropertyGFAParking,PropertyGFABuilding(s),Neighborhood,renamed_PropertyType,SiteEnergyUse(kBtu),GHGEmissions(MetricTonsCO2e)
0,12,1,94,88434,0,88434,DOWNTOWN,Hotel,6981428.0,249.43
1,11,1,25,103566,15064,88502,DOWNTOWN,Hotel,8354235.0,263.51
2,41,1,52,956110,196718,759392,DOWNTOWN,Hotel,73130656.0,2061.48
3,10,1,95,61320,0,61320,DOWNTOWN,Hotel,28229320.0,1936.34
4,18,1,41,175580,62000,113580,DOWNTOWN,Hotel,14829099.0,507.7


In [8]:
# Features: every column except the two prediction targets
target_columns = ["SiteEnergyUse(kBtu)", "GHGEmissions(MetricTonsCO2e)"]
X = data.drop(columns=target_columns)

In [13]:
# Select the energy target by name so the result does not depend on column order
y_energy = data["SiteEnergyUse(kBtu)"]

In [15]:
# Select the CO2 emissions target by name so the result does not depend on column order
y_co2 = data["GHGEmissions(MetricTonsCO2e)"]

In [24]:
# Display the energy target Series to check its content
y_energy

0       6.981428e+06
1       8.354235e+06
2       7.313066e+07
3       2.822932e+07
4       1.482910e+07
            ...     
1651    8.497457e+05
1652    9.502762e+05
1653    5.765898e+06
1654    7.194712e+05
1655    1.152896e+06
Name: SiteEnergyUse(kBtu), Length: 1656, dtype: float64

<div style="background-color: #6D83C5;" >
    <h4 style="margin: auto; padding: 20px; color:#fff; ">3.1. Subset of Energy</h4>
</div>

In [44]:
# Hold out 30% of the rows to evaluate the energy models; the seed keeps the split reproducible
(
    X_train_to_energy,
    X_test_to_energy,
    y_energy_train,
    y_energy_test,
) = train_test_split(
    X,
    y_energy,
    test_size=0.3,
    random_state=0,
)

In [21]:
def split_data(features, target, test_size=0.3, random_state=42):
    """
    Splitting features and target into train and test subsets through train_test_split

    Parameters:
    -----------------
        features (pandas.DataFrame): Features dataset to split
        target (pandas.Series): Target feature
        test_size (float): Share of the rows kept for the test subset (default 0.3)
        random_state (int): Seed passed to train_test_split so the split is
                            reproducible (default 42)

    Returns:
    -----------------
        X_train (pandas.DataFrame): Part of the features dataset used to train the model
        X_test (pandas.DataFrame): Part of the features dataset used to test the model
        y_train (pandas.Series): Part of the target used to train the model
        y_test (pandas.Series): Part of the target used to test the model
    """

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )

    return (X_train, X_test, y_train, y_test)