In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 
import warnings
warnings.filterwarnings('ignore')
import gc
from scipy.stats import skew
from scipy.stats import norm
from scipy import stats
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import matplotlib.patches as patches

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import pathlib
import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Any results you write to the current directory are saved as output.

In [None]:
def reduce_mem_usage(df):
    """Downcast the numeric columns of ``df`` in place and report the memory saved.

    * Signed / unsigned integer columns are cast to the narrowest integer type
      of the same signedness whose range contains the column's minimum and
      maximum (bounds inclusive).
    * Float columns are cast to float32 when every value lies strictly inside
      the float32 range, and to float64 otherwise. float16 is never produced
      because the feather format cannot store it.
    * Every other column (bool, object, datetime, timedelta, categorical and
      other extension dtypes) is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype
        # Extension dtypes (category, nullable ints, strings, ...) are not
        # numpy dtypes and cannot be safely downcast here.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind in 'iu':
            c_min = df[col].min()
            c_max = df[col].max()
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            # Pick the first (narrowest) type that holds the whole value range.
            # For an empty column min/max are NaN, every comparison is False
            # and the column keeps its dtype.
            for candidate in candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif col_type.kind == 'f':
            c_min = df[col].min()
            c_max = df[col].max()
            limits = np.finfo(np.float32)
            if c_min > limits.min and c_max < limits.max:
                df[col] = df[col].astype(np.float32)
            else:
                # Out-of-range, infinite or all-NaN columns stay in double precision.
                df[col] = df[col].astype(np.float64)
        # Any other kind (bool, object, datetime, timedelta, ...) is skipped.

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the ratio so a frame reporting zero bytes does not produce NaN.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem > 0 else 0.0
    print('Decreased by {:.1f}%'.format(decrease))
    return df

In [None]:
# Root folder of the ASHRAE competition files.
data_dir = pathlib.Path('../input/ashrae-energy-prediction')

# Building metadata and per-site weather tables.
df_building = pd.read_csv(data_dir / 'building_metadata.csv')
df_weather_train = pd.read_csv(data_dir / 'weather_train.csv')
df_weather_test = pd.read_csv(data_dir / 'weather_test.csv')
# Meter readings (train) and the rows to predict (test).
df_train = pd.read_csv(data_dir / 'train.csv')
df_test = pd.read_csv(data_dir / 'test.csv')

In [None]:
# Downcast every loaded frame, in the same order as before, and rebind the names.
df_building, df_weather_train, df_weather_test, df_train, df_test = [
    reduce_mem_usage(frame)
    for frame in (df_building, df_weather_train, df_weather_test, df_train, df_test)
]

In [None]:
# Parse the timestamp strings and derive calendar features from them.
df_weather_train["datetime"] = pd.to_datetime(df_weather_train["timestamp"])
train_dt = df_weather_train["datetime"].dt
df_weather_train["day"] = train_dt.day
# Series.dt.week was removed in pandas 2.0. Use isocalendar() when it exists
# and fall back to the legacy attribute on older pandas; the cast keeps the
# plain int64 dtype that .dt.week used to return.
df_weather_train["week"] = (
    train_dt.isocalendar().week if hasattr(train_dt, "isocalendar") else train_dt.week
).astype("int64")
df_weather_train["month"] = train_dt.month
df_weather_train["year"] = train_dt.year

In [None]:
# Parse the timestamp strings and derive calendar features from them.
df_weather_test["datetime"] = pd.to_datetime(df_weather_test["timestamp"])
test_dt = df_weather_test["datetime"].dt
df_weather_test["day"] = test_dt.day
# Series.dt.week was removed in pandas 2.0. Use isocalendar() when it exists
# and fall back to the legacy attribute on older pandas; the cast keeps the
# plain int64 dtype that .dt.week used to return.
df_weather_test["week"] = (
    test_dt.isocalendar().week if hasattr(test_dt, "isocalendar") else test_dt.week
).astype("int64")
df_weather_test["month"] = test_dt.month
df_weather_test["year"] = test_dt.year

In [None]:
# Report the (rows, columns) shape of every loaded table.
for label, frame in (
    ('Train Shape:', df_train),
    ('Test Shape:', df_test),
    ('Weather Train Shape:', df_weather_train),
    ('Weather Test Shape:', df_weather_test),
    ('Building Metadata Shape:', df_building),
):
    print(label, frame.shape)

In [None]:
display(df_train.head())
df_train.dtypes

In [None]:
display(df_test.head())
df_test.dtypes

In [None]:
display(df_weather_train.head())
df_weather_train.dtypes

In [None]:
display(df_weather_test.head())
df_weather_test.dtypes

In [None]:
display(df_building.head())
df_building.dtypes

In [None]:
# Attach the building metadata, then the weather of the building's site at
# the same timestamp, to every train row. Left joins keep all meter rows.
df_train = (
    df_train
    .merge(df_building, how='left', on='building_id')
    .merge(df_weather_train, how='left', on=['site_id', 'timestamp'])
)

# Same enrichment for the test rows.
df_test = (
    df_test
    .merge(df_building, how='left', on='building_id')
    .merge(df_weather_test, how='left', on=['site_id', 'timestamp'])
)

# The lookup tables are no longer needed once merged; drop the references.
del df_building
del df_weather_train
del df_weather_test

In [None]:
print('Train Shape:', df_train.shape)
df_train.head()

In [None]:
# Drop the raw timestamp and every calendar column derived from it.
# 'datetime' is dropped too: it is a datetime64 column that the regressors
# fitted on this frame later in the notebook cannot convert to float.
df_train = df_train.drop(['timestamp','datetime','day','week','month','year'],axis=1)

In [None]:
df_train

In [None]:
df_train.dtypes
df_train.dtypes.unique()

In [None]:
df_train.info()

**Missing Value imputation**

In [None]:
df_train.isnull().sum()[df_train.isnull().sum() !=0]


#Les colonnes énumérées ci-dessus ont des valeurs manquantes dans le jeu d'entraînement fusionné (relevés + bâtiments + météo).

In [None]:
# Show the percentage of missing values for each incomplete train column.
# The null counts are computed once (a full scan of the frame is expensive)
# and the percentage uses the actual row count rather than a hard-coded total.
null_counts = df_train.isnull().sum()
missing = null_counts[null_counts != 0]
missing=pd.DataFrame(missing.reset_index())
missing.rename(columns={'index':'features',0:'missing_count'},inplace=True)
missing['missing_count_percentage']=((missing['missing_count'])/len(df_train))*100
plt.figure(figsize=(20,8))
sns.barplot(y=missing['features'],x=missing['missing_count_percentage'])

Les variables year_built et floor_count présentent plus de 60 % de valeurs manquantes ; nous allons donc les supprimer de notre jeu de données, car elles n'apportent pas assez d'information.


1- Supprimer les lignes avec des valeurs manquantes et voir les performances du modèle.
2- Imputer les valeurs manquantes avec la moyenne, la médiane ou éventuellement le mode.

In [None]:
# Look at the spread of the data before imputing the missing values.
# plt.figure() is what sizes the canvas; plt.plot(figsize=...) does not create
# a sized figure. The series is passed as x= so the orientation does not depend
# on how the seaborn version interprets a positional argument.
plt.figure(figsize=(15,10))
sns.boxplot(x=df_train['wind_speed'])

wind_speed semble avoir beaucoup de points aberrants : la médiane est donc un choix raisonnable pour imputer les valeurs manquantes.

In [None]:
df_train['wind_speed'].isna().sum()

In [None]:
# Impute with the median because the column has many outliers.
# The result is assigned back instead of calling fillna(inplace=True) on a
# column selection: that chained in-place form is deprecated and does not
# update the frame under pandas Copy-on-Write.
df_train['wind_speed'] = df_train['wind_speed'].fillna(df_train['wind_speed'].median())
df_test['wind_speed'] = df_test['wind_speed'].fillna(df_test['wind_speed'].median())

In [None]:
# Confirm that no missing values remain, then summarise the column.
# Only a cell's last expression is rendered automatically, so the count
# needs an explicit display() to be shown.
display(df_train['wind_speed'].isna().sum())
df_train['wind_speed'].describe()

In [None]:
sns.boxplot(df_train['wind_direction'])

In [None]:
#imputation par la moyenne serait un moyen pour l'imputation 
df_train['wind_direction'].isna().sum()

In [None]:
# Impute with the mean of each split (the earlier comment said median, but
# the code uses the mean).
# The result is assigned back instead of calling fillna(inplace=True) on a
# column selection: that chained in-place form is deprecated and does not
# update the frame under pandas Copy-on-Write.
df_train['wind_direction'] = df_train['wind_direction'].fillna(df_train['wind_direction'].mean())
df_test['wind_direction'] = df_test['wind_direction'].fillna(df_test['wind_direction'].mean())

In [None]:
# Confirm that no missing values remain, then summarise the column.
# Only a cell's last expression is rendered automatically, so the count
# needs an explicit display() to be shown.
display(df_train['wind_direction'].isna().sum())
df_train['wind_direction'].describe()

In [None]:
sns.boxplot(df_train['sea_level_pressure'])

In [None]:
#imputation par la mediane serait un bon  moyen 
df_train['sea_level_pressure'].isna().sum()

In [None]:
# Impute with the median because the column has many outliers.
# The result is assigned back instead of calling fillna(inplace=True) on a
# column selection: that chained in-place form is deprecated and does not
# update the frame under pandas Copy-on-Write.
df_train['sea_level_pressure'] = df_train['sea_level_pressure'].fillna(df_train['sea_level_pressure'].median())
df_test['sea_level_pressure'] = df_test['sea_level_pressure'].fillna(df_test['sea_level_pressure'].median())

In [None]:
# Confirm that no missing values remain, then summarise the column.
# Only a cell's last expression is rendered automatically, so the count
# needs an explicit display() to be shown.
display(df_train['sea_level_pressure'].isna().sum())
df_train['sea_level_pressure'].describe()

In [None]:
sns.boxplot(df_train['precip_depth_1_hr'])

In [None]:
df_train['precip_depth_1_hr'].isna().sum()

In [None]:
# Impute with the median because the column has many outliers.
# The result is assigned back instead of calling fillna(inplace=True) on a
# column selection: that chained in-place form is deprecated and does not
# update the frame under pandas Copy-on-Write.
df_train['precip_depth_1_hr'] = df_train['precip_depth_1_hr'].fillna(df_train['precip_depth_1_hr'].median())
df_test['precip_depth_1_hr'] = df_test['precip_depth_1_hr'].fillna(df_test['precip_depth_1_hr'].median())

In [None]:
# Confirm that no missing values remain, then summarise the column.
# Only a cell's last expression is rendered automatically, so the count
# needs an explicit display() to be shown.
display(df_train['precip_depth_1_hr'].isna().sum())
df_train['precip_depth_1_hr'].describe()

In [None]:
sns.boxplot(df_train['dew_temperature'])

In [None]:
df_train['dew_temperature'].isna().sum()

In [None]:
# Impute with the median because the column has many outliers.
# The result is assigned back instead of calling fillna(inplace=True) on a
# column selection: that chained in-place form is deprecated and does not
# update the frame under pandas Copy-on-Write.
df_train['dew_temperature'] = df_train['dew_temperature'].fillna(df_train['dew_temperature'].median())
df_test['dew_temperature'] = df_test['dew_temperature'].fillna(df_test['dew_temperature'].median())

In [None]:
# Confirm that no missing values remain, then summarise the column.
# Only a cell's last expression is rendered automatically, so the count
# needs an explicit display() to be shown.
display(df_train['dew_temperature'].isna().sum())
df_train['dew_temperature'].describe()

In [None]:
sns.boxplot(df_train['cloud_coverage'])

In [None]:
#imputation par la moyenne serait un moyen pour l'imputation 
df_train['cloud_coverage'].isna().sum()

In [None]:
# Impute with the mean of each split.
# The result is assigned back instead of calling fillna(inplace=True) on a
# column selection: that chained in-place form is deprecated and does not
# update the frame under pandas Copy-on-Write.
df_train['cloud_coverage'] = df_train['cloud_coverage'].fillna(df_train['cloud_coverage'].mean())
df_test['cloud_coverage'] = df_test['cloud_coverage'].fillna(df_test['cloud_coverage'].mean())

In [None]:
# Confirm that no missing values remain, then summarise the column.
# Only a cell's last expression is rendered automatically, so the count
# needs an explicit display() to be shown.
display(df_train['cloud_coverage'].isna().sum())
df_train['cloud_coverage'].describe()

In [None]:
sns.boxplot(df_train['air_temperature'])

In [None]:
df_train['air_temperature'].isna().sum()

In [None]:
# Impute with the median because the column has many outliers.
# The result is assigned back instead of calling fillna(inplace=True) on a
# column selection: that chained in-place form is deprecated and does not
# update the frame under pandas Copy-on-Write.
df_train['air_temperature'] = df_train['air_temperature'].fillna(df_train['air_temperature'].median())
df_test['air_temperature'] = df_test['air_temperature'].fillna(df_test['air_temperature'].median())

In [None]:
# Confirm that no missing values remain, then summarise the column.
# Only a cell's last expression is rendered automatically, so the count
# needs an explicit display() to be shown.
display(df_train['air_temperature'].isna().sum())
df_train['air_temperature'].describe()

In [None]:
df_train.drop(['year_built','floor_count'],axis=1,inplace=True) 

In [None]:
df_test.drop(['year_built','floor_count'],axis=1,inplace=True)


In [None]:
df_train.isnull().sum()

In [None]:
df_train.head()

In [None]:
df_train.shape,df_test.shape

**Statistique Descriptive**

df_train.describe().plot(kind="area", fontsize=22, figsize=(18,8), table=True, colormap="rainbow")
plt.xlabel('',)
plt.ylabel('value')
plt.title("statistiques générales des variables")

In [None]:
ax = df_train.plot(x='dew_temperature', y='meter_reading', kind='scatter')
ax.set_title("le relevé de compteur par rapport à la temperature du rosé")

In [None]:
# Voir la distribution / histogramme des variables numériques ( )
df_train.loc[:, df_train.columns != 'building_id'].hist(figsize=(30, 20), bins=50);

In [None]:
df_train['primary_use'].value_counts(normalize=True).plot(kind='bar')

**Correlation**

In [None]:
# Pearson correlation heatmap of the numeric features.
# The frame still holds non-numeric columns at this point (e.g. the object
# column 'primary_use'); DataFrame.corr() raises on those with pandas >= 2.0,
# so the numeric/bool columns are selected explicitly.
f, ax = plt.subplots(figsize = (15,10))
plt.title('Pearson Correlation of features')
sns.heatmap(df_train.select_dtypes(include=['number', 'bool']).corr(), linewidths = 0.25, vmax = 1.0, square = True, cmap = 'cubehelix', linecolor = 'k', annot = True)

In [None]:
# Hierarchically clustered view of the same correlation matrix.
# Non-numeric columns are excluded explicitly because DataFrame.corr() raises
# on them with pandas >= 2.0.
sns.set(font = 'monospace')
cmap = sns.diverging_palette(h_neg = 210, h_pos = 350, s = 90, l = 30, as_cmap = True)
sns.clustermap(df_train.select_dtypes(include=['number', 'bool']).corr(), linewidths = 0.5, figsize = (13,13), cmap = cmap)

In [None]:
%%time
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
df_train['meter']= le.fit_transform(df_train['meter']).astype("uint8")
df_test['meter']= le.fit_transform(df_test['meter']).astype("uint8")
df_train['primary_use']= le.fit_transform(df_train['primary_use']).astype("uint8")
df_test['primary_use']= le.fit_transform(df_test['primary_use']).astype("uint8")

In [None]:
df_train

In [None]:
y = df_train['meter_reading']
df_train = df_train.drop(['meter_reading'], axis=1)

In [None]:
df_train

In [None]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(df_train,y,test_size=0.25,random_state=42)

In [None]:
print (X_train.shape)
print (y_train.shape)
print (X_test.shape)
print (y_test.shape)

In [None]:
import xgboost as xgb
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor 
from sklearn.linear_model import SGDRegressor, LinearRegression , Ridge , Lasso


In [None]:
# Metrics used to evaluate the models.
def rmsle(preds,targets, sample_weight = None, multioutput = 'uniform_average'):
    """Root mean squared logarithmic error between predictions and targets.

    Computes ``sqrt(mean((log(1 + preds) - log(1 + targets)) ** 2))``.

    Parameters
    ----------
    preds : array-like
        Predicted values. Negative predictions are clipped to 0 before the
        logarithm, otherwise ``log(1 + pred)`` would be NaN for ``pred < -1``.
    targets : array-like
        True values, same length as ``preds``.
    sample_weight, multioutput
        Accepted for signature compatibility only; both are ignored.

    Returns
    -------
    float
        The RMSLE (NaN for empty input).
    """
    # Plain arrays make the comparison positional even when a pandas Series
    # with a shuffled index is passed in.
    preds = np.clip(np.asarray(preds, dtype=float), 0, None)
    targets = np.asarray(targets, dtype=float)
    return np.sqrt(np.mean((np.log1p(preds) - np.log1p(targets)) ** 2))

def r2(preds, targets) : 
    """Coefficient of determination of ``preds`` with respect to ``targets``.

    Computes ``1 - SS_res / SS_tot`` directly on the values passed in. No
    ``exp`` back-transform is applied: the target used in this notebook is the
    raw ``meter_reading`` (it is never log-transformed), and exponentiating
    raw readings overflows to inf and makes the score NaN.

    Parameters
    ----------
    preds : array-like
        Predicted values.
    targets : array-like
        True values, same length as ``preds``.

    Returns
    -------
    float
        The R² score (1.0 for a perfect fit, 0.0 for a constant prediction
        equal to the mean of ``targets``).
    """
    # Plain arrays make the comparison positional even when a pandas Series
    # with a shuffled index is passed in.
    preds = np.asarray(preds, dtype=float)
    targets = np.asarray(targets, dtype=float)
    ss_res = np.sum((targets - preds) ** 2)
    ss_tot = np.sum((targets - np.mean(targets)) ** 2)
    return 1 - ss_res / ss_tot

In [None]:
def evuluate(model):
    """Plot diagnostics for ``model`` and score it on the notebook's split.

    Reads the notebook-level ``X_train``, ``X_test``, ``y_train`` and
    ``y_test``. Draws two figures for the test split (prediction against true
    value with the identity line, and absolute residual against fitted value),
    then computes the metrics with ``rmsle`` and ``r2``.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.

    Returns
    -------
    tuple
        ``(rmsle_train, rmsle_test, r2_train, r2_test)``.
    """
    preds_train = model.predict(X_train)
    preds_test = model.predict(X_test)
    # Prediction as a function of the true value; the dashed line is y = x.
    plt.figure()
    plt.scatter(y_test, preds_test)  
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=4)
    plt.xlabel('meter_reading')
    plt.ylabel('prediction')
    plt.title('pred = f(meter_reading)')
    # Residuals: the x-axis carries the fitted values, as the axis label and
    # title state (the true values were plotted there before).
    plt.figure()
    plt.scatter(preds_test, abs(preds_test - y_test))  
    plt.xlabel('preds_meter_reading')
    plt.ylabel('risidual')
    plt.title('risidual = f(fitted_meter_reading)')   
    plt.show()
    rmsle_train = rmsle(preds_train, y_train)
    rmsle_test = rmsle(preds_test, y_test)
    r2_train = r2(preds_train, y_train)
    r2_test = r2(preds_test, y_test)   
    return rmsle_train, rmsle_test, r2_train, r2_test

In [None]:
col_names =  ['model name', 'rmsle_train', 'rmsle_test','r2_train', 'r2_test']
results = pd.DataFrame(columns = col_names)
results

In [None]:
# Fit the baseline linear model, evaluate it and record its scores.
linear_model = LinearRegression()
linear_model.fit(X_train,y_train)
rmsle_train, rmsle_test, r2_train, r2_test = evuluate(linear_model)
new_row = {'model name':" linear regression", 'rmsle_train':rmsle_train, 'rmsle_test':rmsle_test, 'r2_train':r2_train, 'r2_test':r2_test}
# DataFrame.append was removed in pandas 2.0; pd.concat with a one-row frame
# is the equivalent that works on every pandas version.
results = pd.concat([results, pd.DataFrame([new_row])], ignore_index=True)
print("{} train train rmsle : {:.3f} test rmsle : {:.3f} train r2 : {:.3f} test r2  : {:.3f}".format(new_row['model name'],new_row['rmsle_train'], new_row['rmsle_test'] ,new_row['r2_train'], new_row['r2_test']))