<div style="display: flex; background-color: #3F579F;">
    <h1 style="margin: auto; font-weight: bold; padding: 30px 30px 0px 30px;" align="center">Consumption needs of buildings - Project 4</h1>
</div>
<div style="display: flex; background-color: #3F579F; margin: auto; padding: 5px 30px 0px 30px;" >
    <h2 style="width: 100%; text-align: center; float: left;" align="center">| Modeling notebook |</h2>
</div>
<div style="display: flex; background-color: #3F579F; margin: auto; padding: 10px 30px 30px 30px;">
    <h3 style="width: 100%; text-align: center; font-size:26px; float: left;" align="center">Data Scientist course - OpenClassrooms</h3>
</div>

<div style="background-color: #506AB9;" >
    <h3 style="margin: auto; padding: 20px; color:#fff; ">1. Libraries and functions</h3>
</div>

<div style="background-color: #6D83C5;" >
    <h4 style="margin: auto; padding: 20px; color:#fff; ">1.1. Libraries and functions</h3>
</div>

In [1]:
## df_analysis
import io
import gc
from math import prod

## General
import pandas as pd
import numpy as np


from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler


<div style="background-color: #6D83C5;" >
    <h4 style="margin: auto; padding: 20px; color:#fff; ">1.2. Functions declaration</h4>
</div>

In [2]:
def df_analysis(df, name_df, *args, **kwargs):
    """
    Method used to analyze on the DataFrame.

    Parameters:
    -----------------
        df (pandas.DataFrame): Dataset to analyze
        name_df (str): Dataset name
        
        *args, **kwargs:
        -----------------
            columns (list): Dataframe keys in list format
            flag (str): Flag to show complete information about the dataset to analyse
                        "complete" shows all information about the dataset

    Returns:
    -----------------
        None. 
        Print the analysis on the Dataset. 
    """
    
    # Getting the variables
    columns = kwargs.get("columns", None)
    type_analysis = kwargs.get("type_analysis", None)
    
    ORDERING_COMPLETE = [
        "name", "type", "records", "unique", "# NaN", "% NaN", "mean", "min", "25%", "50%", "75%", "max", "std"
    ]
    
    # Calculating the memory usage based on dataframe.info()
    buf = io.StringIO()
    df.info(buf=buf)
    memory_usage = buf.getvalue().split('\n')[-2]
    
    if df.empty:
        print("The", name_df, "dataset is empty. Please verify the file.")
    else:
        empty_cols = [col for col in df.columns if df[col].isna().all()] # identifying empty columns
        df_rows_duplicates = df[df.duplicated()] #identifying full duplicates rows
        
        # Creating a dataset based on Type object and records by columns
        type_cols = df.dtypes.apply(lambda x: x.name).to_dict() 
        df_resume = pd.DataFrame(list(type_cols.items()), columns = ["name", "type"])
        df_resume["records"] = list(df.count())
        df_resume["# NaN"] = list(df.isnull().sum())
        df_resume["% NaN"] = list(((df.isnull().sum() / len(df.index))*100).round(2))
        
        print("\nAnalysis of", name_df, "dataset")
        print("--------------------------------------------------------------------")
        print("- Dataset shape:                 ", df.shape[0], "rows and", df.shape[1], "columns")
        print("- Total of NaN values:           ", df.isna().sum().sum())
        print("- Percentage of NaN:             ", round((df.isna().sum().sum() / prod(df.shape)) * 100, 2), "%")
        print("- Total of full duplicates rows: ", df_rows_duplicates.shape[0])
        print("- Total of empty rows:           ", df.shape[0] - df.dropna(axis="rows", how="all").shape[0]) if df.dropna(axis="rows", how="all").shape[0] < df.shape[0] else \
                    print("- Total of empty rows:            0")
        print("- Total of empty columns:        ", len(empty_cols))
        print("  + The empty column is:         ", empty_cols) if len(empty_cols) == 1 else \
                    print("  + The empty column are:         ", empty_cols) if len(empty_cols) >= 1 else None
        print("- Unique indexes:                ", df.index.is_unique)
        
        if columns is not None:
            print("\n- The key(s):", columns, "is not present multiple times in the dataframe.\n  It CAN be used as a primary key.") if df.size == df.drop_duplicates(columns).size else \
                print("\n- The key(s):", columns, "is present multiple times in the dataframe.\n  It CANNOT be used as a primary key.")
        
        if type_analysis == "summarized":
            print("\n")
        
        if type_analysis is None or type_analysis != "summarized":
            pd.set_option("display.max_rows", None) # show full of showing rows
            pd.set_option("display.max_columns", None) # show full of showing cols
            pd.set_option("display.max_colwidth", None) # show full width of showing cols
            pd.set_option("display.float_format", lambda x: "%.5f" % x) # show full content in cell    
            
            if type_analysis is None or type_analysis != "complete":
                print("\n- Type object and records by columns      (",memory_usage,")")
                print("--------------------------------------------------------------------")
            elif type_analysis == "complete":
                df_resume["unique"] = list(df.nunique())
                df_desc = pd.DataFrame(df.describe().T).reset_index()
                df_desc = df_desc.rename(columns={"index": "name"})
                df_resume = df_resume.merge(right=df_desc[["name", "mean", "min", "25%", "50%", "75%", "max", "std"]], on="name", how="left")
                df_resume = df_resume[ORDERING_COMPLETE]
                print("\n- Type object and records by columns      (",memory_usage,")")
                print("--------------------------------------------------------------------")
                
            display(df_resume.sort_values("records", ascending=False))
            
            pd.reset_option("display.max_rows") # reset max of showing rows
            pd.reset_option("display.max_columns") # reset max of showing cols
            pd.reset_option("display.max_colwidth") # reset width of showing cols
            pd.reset_option("display.float_format") # reset show full content in cell
            
        # deleting dataframe to free memory
        if type_analysis == "complete":
            del [[df_resume, df_desc]]
            gc.collect()
            df_resume, df_desc = (pd.DataFrame() for i in range(2))
        else:
            del df_resume
            gc.collect()
            df_resume = pd.DataFrame()

<div style="background-color: #506AB9;" >
    <h3 style="margin: auto; padding: 20px; color:#fff; ">2. Importing files</h3>
</div>

<div style="background-color: #6D83C5;" >
    <h4 style="margin: auto; padding: 20px; color:#fff; ">2.1. Importing and preparing files</h4>
</div>

In [3]:
data = pd.read_csv(r"datasets\df_cleaned_final.csv")

In [4]:
data.head()

Unnamed: 0,NumberofFloors,NumberofBuildings,BuildingAge,PropertyGFATotal,PropertyGFAParking,PropertyGFABuilding(s),Neighborhood,renamed_PropertyType,SiteEnergyUse(kBtu),GHGEmissions(MetricTonsCO2e)
0,12,1,94,88434,0,88434,DOWNTOWN,Hotel,6981428.0,249.43
1,11,1,25,103566,15064,88502,DOWNTOWN,Hotel,8354235.0,263.51
2,41,1,52,956110,196718,759392,DOWNTOWN,Hotel,73130656.0,2061.48
3,10,1,95,61320,0,61320,DOWNTOWN,Hotel,28229320.0,1936.34
4,18,1,41,175580,62000,113580,DOWNTOWN,Hotel,14829099.0,507.7


In [5]:
df_analysis(data, "data", type_analysis="complete")


Analysis of data dataset
--------------------------------------------------------------------
- Dataset shape:                  1656 rows and 10 columns
- Total of NaN values:            0
- Percentage of NaN:              0.0 %
- Total of full duplicates rows:  0
- Total of empty rows:            0
- Total of empty columns:         0
- Unique indexes:                 True

- Type object and records by columns      ( memory usage: 129.5+ KB )
--------------------------------------------------------------------


Unnamed: 0,name,type,records,unique,# NaN,% NaN,mean,min,25%,50%,75%,max,std
0,NumberofFloors,int64,1656,42,0,0.0,4.13285,1.0,1.0,2.0,4.0,99.0,6.53546
1,NumberofBuildings,int64,1656,11,0,0.0,1.13164,1.0,1.0,1.0,1.0,111.0,2.91856
2,BuildingAge,int64,1656,113,0,0.0,59.22645,6.0,32.0,56.0,91.0,121.0,32.65619
3,PropertyGFATotal,int64,1656,1577,0,0.0,119192.86171,11285.0,29398.5,49289.5,105070.75,9320156.0,298038.52446
4,PropertyGFAParking,int64,1656,326,0,0.0,13321.64251,0.0,0.0,0.0,0.0,512608.0,43664.74773
5,PropertyGFABuilding(s),int64,1656,1572,0,0.0,105871.2192,3636.0,28356.0,47484.5,94479.0,9320156.0,284529.39591
6,Neighborhood,object,1656,13,0,0.0,,,,,,,
7,renamed_PropertyType,object,1656,8,0,0.0,,,,,,,
8,SiteEnergyUse(kBtu),float64,1656,1655,0,0.0,8563131.38206,11441.0,1222923.5,2512737.0,7009066.0,873923712.0,30439268.96365
9,GHGEmissions(MetricTonsCO2e),float64,1656,1583,0,0.0,186.12812,0.08,19.5275,48.84,139.35,16870.98,748.43963


<div style="background-color: #506AB9;" >
    <h3 style="margin: auto; padding: 20px; color:#fff; ">3. Splitting data (SiteEnergyUse(kBtu)</h3>
</div>

<div style="background-color: #6D83C5;" >
    <h4 style="margin: auto; padding: 20px; color:#fff; ">3.1. Splitting dataset based on the features and targets</h4>
</div>

In [6]:
data.head()

Unnamed: 0,NumberofFloors,NumberofBuildings,BuildingAge,PropertyGFATotal,PropertyGFAParking,PropertyGFABuilding(s),Neighborhood,renamed_PropertyType,SiteEnergyUse(kBtu),GHGEmissions(MetricTonsCO2e)
0,12,1,94,88434,0,88434,DOWNTOWN,Hotel,6981428.0,249.43
1,11,1,25,103566,15064,88502,DOWNTOWN,Hotel,8354235.0,263.51
2,41,1,52,956110,196718,759392,DOWNTOWN,Hotel,73130656.0,2061.48
3,10,1,95,61320,0,61320,DOWNTOWN,Hotel,28229320.0,1936.34
4,18,1,41,175580,62000,113580,DOWNTOWN,Hotel,14829099.0,507.7


In [7]:
X = data.drop(columns=["SiteEnergyUse(kBtu)", "GHGEmissions(MetricTonsCO2e)"])

In [8]:
y_energy = data.iloc[:,-2]

In [9]:
y_co2 = data.iloc[:,-1]

<div style="background-color: #6D83C5;" >
    <h4 style="margin: auto; padding: 20px; color:#fff; ">3.2. Train/Test split</h4>
</div>

<div class="alert alert-block alert-info">
    At this point, we are going to predict <b>SiteEnergyUse(kBtu)</b>
</div>

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y_energy, test_size=0.3, random_state=0)

In [11]:
print("X_train: {}".format(X_train.shape))
print("y_train: {}".format(y_train.shape))

X_train: (1159, 8)
y_train: (1159,)


In [12]:
X_train.head()

Unnamed: 0,NumberofFloors,NumberofBuildings,BuildingAge,PropertyGFATotal,PropertyGFAParking,PropertyGFABuilding(s),Neighborhood,renamed_PropertyType
593,2,1,55,123000,0,123000,GREATER DUWAMISH,Warehouse
930,1,1,46,28126,0,28126,GREATER DUWAMISH,Warehouse
838,3,1,103,38148,0,38148,DOWNTOWN,Office
649,3,1,79,64630,0,64630,DOWNTOWN,Office
529,5,1,23,110524,0,110524,MAGNOLIA / QUEEN ANNE,Office


<div style="background-color: #6D83C5;" >
    <h4 style="margin: auto; padding: 20px; color:#fff; ">3.3. Identifying types of features</h4>
</div>

In [13]:
NUMERICAL_FEATURES = ["NumberofFloors", "NumberofBuildings", "BuildingAge", "PropertyGFATotal", "PropertyGFAParking", "PropertyGFABuilding(s)"]
CATEGORICAL_FEATURES = ["Neighborhood", "renamed_PropertyType"]

<div style="background-color: #6D83C5;" >
    <h3 style="margin: auto; padding: 20px; color:#fff; ">4. Data preprocessing</h3>
</div>

<div class="alert alert-block alert-info">
     We are going to process the features as follows<br><br>
    <b>Numerical features</b>
    <ul style="list-style-type: square;">
        <li>Transforming all features to log due to all of them do not have a normal distribution</li>
        <li>Normalizing the features through StandardScaler due to all of them do not have a normal distribution</li>
    </ul> 
    <b>Categorical features</b>
    <ul style="list-style-type: square;">
        <li>Transforming all features through OneHotEncoder</li>
    </ul>  
</div>

In [14]:
numerical_pipeline = make_pipeline(FunctionTransformer(np.log1p),
                                   StandardScaler())

categorical_pipeline = make_pipeline(OneHotEncoder())

In [15]:
preprocessor = make_column_transformer(
                (numerical_pipeline, NUMERICAL_FEATURES), 
                (categorical_pipeline, CATEGORICAL_FEATURES))

<div class="alert alert-block alert-info">
    Log transformation of target variable based on the analysis done
</div>

In [16]:
y_train_transformed = y_train.apply(np.log1p)

In [17]:
y_test_transformed = y_test.apply(np.log1p)

<div style="background-color: #6D83C5;" >
    <h3 style="margin: auto; padding: 20px; color:#fff; ">5. Modelisation</h3>
</div>

In [34]:
from sklearn.dummy import DummyRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Ridge, BayesianRidge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, AdaBoostRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

In [35]:
regressors = {
    "Dummy" : DummyRegressor(),
    "Linear" : LinearRegression(),
    "Bayesian Ridge" : BayesianRidge(),
    "Ridge" : Ridge(),
    "Lasso" : Lasso(alpha=1e-4),
    "KNeighbors" : KNeighborsRegressor(n_neighbors=2), 
    "Random Forest" : RandomForestRegressor(),
    "SVR" : SVR(kernel="rbf", gamma="auto", C=8),
    "Elastic Net" : ElasticNet(),
    "Bagging" : BaggingRegressor(),
    "AdaBoostRegressor" : BaggingRegressor(),
    "Extra Tree" : ExtraTreesRegressor(n_estimators=100, n_jobs=-1),
    "Gradient Boosting" : GradientBoostingRegressor(n_estimators=100, random_state=0)
}


In [None]:
# Dictionnaire de regresseurs
regressors = {
    'Dummy': DummyRegressor(),
    'Linear': LinearRegression(),
    'Ridge' : Ridge(),
    'Lasso' : Lasso(),
    'SVR': SVR(kernel="rbf", C=300, gamma=1),
    'Random Forest': RandomForestRegressor(n_estimators=100, n_jobs=-1),    
    
    
    'Elastic Net': ElasticNet(),
    'Extra Tree': ExtraTreesRegressor(n_estimators=100, n_jobs=-1),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state = 42),
}

In [20]:
for regressor in regressors:
    
    print(regressor)
    model = make_pipeline(preprocessor, regressor)
    model.fit(X_train, y_train_transformed)
    y_pred = model.predict(X_test)
    
    print("Test set score : {:.2f}%".format(np.mean(y_pred == y_test_transformed)))
    print("Test set score : {:.2f}%".format(model.score(X_test, y_test_transformed)))
    print("\n")

KNeighborsRegressor(n_neighbors=2)
Test set score : 0.00%
Test set score : 0.50%


LinearRegression()
Test set score : 0.00%
Test set score : 0.65%


Ridge()
Test set score : 0.00%
Test set score : 0.65%


Lasso(alpha=0.0001)
Test set score : 0.00%
Test set score : 0.65%


RandomForestRegressor()
Test set score : 0.00%
Test set score : 0.63%


BaggingRegressor()
Test set score : 0.00%
Test set score : 0.63%


AdaBoostRegressor()
Test set score : 0.00%
Test set score : 0.59%


SVR(C=8, gamma='auto')
Test set score : 0.00%
Test set score : 0.66%




<div style="background-color: #6D83C5;" >
    <h1 style="margin: auto; padding: 20px; color:#fff; ">XXXXXXXXXXXXXXXXXXXXXXXXXXXXX</h1>
</div>

<div class="alert alert-block alert-danger">
    <b>Flago position</b>
</div>

In [21]:
XXXXXXXXX

NameError: name 'XXXXXXXXX' is not defined

<div class="alert alert-block alert-success">
    <b>done</b>
</div>

In [None]:
knn = KNeighborsRegressor(n_neighbors=2)

In [None]:
model = make_pipeline(preprocessor, knn)

In [None]:
model.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)

In [None]:
print("Test set score : {:.2f}%".format(np.mean(y_pred == y_test)))

In [None]:
print("Test set score : {:.2f}%".format(model.score(X_test, y_test)))

<div class="alert alert-block alert-success">
    <b>done</b>
</div>

<div class="alert alert-block alert-warning">
    <b>done</b>
</div>

In [None]:
# from sklearn.preprocessing import FunctionTransformer

In [None]:
numerical_pipeline = make_pipeline(StandardScaler())
categorical_pipeline = make_pipeline(OneHotEncoder())

In [None]:
preprocessor = make_column_transformer(
                (numerical_pipeline, NUMERICAL_FEATURES), 
                (categorical_pipeline, CATEGORICAL_FEATURES))

In [None]:
from sklearn.neighbors import KNeighborsRegressor

In [None]:
knn = KNeighborsRegressor(n_neighbors=2)

In [None]:
model = make_pipeline(preprocessor, knn)

In [None]:
model.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)

In [None]:
print("Test set score : {:.2f}%".format(np.mean(y_pred == y_test)))

In [None]:
print("Test set score : {:.2f}%".format(model.score(X_test, y_test)))

<div class="alert alert-block alert-warning">
    <b>done</b>
</div>

<div style="background-color: #6D83C5;" >
    <h3 style="margin: auto; padding: 20px; color:#fff; ">4. Energy prediction</h3>
</div>

<div style="background-color: #6D83C5;" >
    <h4 style="margin: auto; padding: 20px; color:#fff; ">4.1. Train/Test split</h4>
</div>

<div class="alert alert-block alert-info">
    At this point, we are going to predict <b>SiteEnergyUse(kBtu)</b>
</div>

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y_energy, test_size=0.3, random_state=0)

In [None]:
print("X_train: {}".format(X_train.shape))
print("y_train: {}".format(y_train.shape))

In [None]:
X_train.head()

<div class="alert alert-block alert-danger">
    <b>Flago position</b>
</div>

In [None]:
XXXXXXXXX

In [None]:
y_train

In [None]:
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
# from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# scaler = StandardScaler()

In [None]:
# y_energy = scaler.fit_transform(data[["SiteEnergyUse(kBtu)"]])

In [None]:
numerical_pipeline = make_pipeline(StandardScaler())
categorical_pipeline = make_pipeline(OneHotEncoder())

In [None]:
preprocessor = make_column_transformer(
                (numerical_pipeline, NUMERICAL_FEATURES), 
                (categorical_pipeline, CATEGORICAL_FEATURES))

In [None]:
knn = KNeighborsRegressor(n_neighbors=2)

In [None]:
model = make_pipeline(preprocessor, knn)

In [None]:
model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import f1_score

In [None]:
y_pred = model.predict(X_test)

In [None]:
print("Test set score : {:.2f}%".format(np.mean(y_pred == y_test)))

In [None]:
print("Test set score : {:.2f}%".format(knn.score(X_test, y_test)))