In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.set_option('display.max_columns', None)

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 14})

# Model
import lightgbm as lgb

# Validation
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

import shap

# Global random seed; reused below for shuffling the data and for LightGBM.
SEED = 42


# Load the raw building dataset and preview its first rows.
ccai = pd.read_csv("../input/phase-ii-widsdatathon2022/ccai/ccai/data.csv")
ccai.head()

In [None]:
# Bar chart of how many rows each building type has, followed by the raw counts.
fig, ax = plt.subplots(figsize=(20, 6))
sns.countplot(x='type', data=ccai, ax=ax)
plt.show()

ccai['type'].value_counts()

We can see that most buildings are of type residential. For the purpose of this analysis, we will only look at residential buildings and exclude other types of buildings.

In [None]:
# Keep only residential buildings, drop the now-constant 'type' column, then
# shuffle the rows reproducibly and rebuild a clean 0..n-1 index.
# Everything is done in one chained expression so the result is a fresh
# DataFrame: the original code called drop(inplace=True) on a filtered slice,
# which mutates a potential view and triggers SettingWithCopyWarning.
ccai = (
    ccai[ccai['type'] == 'Résidentiel']
    .drop(columns='type')
    .sample(frac=1, random_state=SEED)
    .reset_index(drop=True)
)

We are now left with 43125 datapoints.

In [None]:
# Column dtypes and non-null counts of the filtered frame.
ccai.info()

# Close Look At Each Feature

## id
First, let's check if we have repeating `id`s.

In [None]:
# Number of rows sharing each building id, broadcast back onto every row.
# groupby.transform writes the column directly, so re-running this cell simply
# overwrites 'id_count' instead of producing 'id_count_x' / 'id_count_y'
# duplicates the way a repeated merge would.
ccai['id_count'] = ccai.groupby('id')['id'].transform('count')

In [None]:
# Distribution of how often an id occurs, then the number of distinct ids
# that occur more than once.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='id_count', data=ccai, ax=ax)
plt.show()

repeated_rows = ccai['id_count'] > 1
ccai.loc[repeated_rows, 'id'].nunique()

While most of the buildings in the dataframe appear only once, there are 6838 buildings whose `id` shows up more than once.

Let's check if these repeated `id`s are duplicates or if they can be differentiated.

In [None]:
# Rows that are exact duplicates (all columns equal) of an earlier row.
ccai[ccai.duplicated()]

It looks like they are not duplicates. Let's see how these repeated buildings can be differentiated:

In [None]:
# Preview rows whose id occurs more than once, sorted so equal ids are adjacent.
ccai[ccai.id_count > 1].sort_values(by='id').head()

It looks like these repeated buildings have different `address` (house number), `delivery_points`, and `consumption`.

In [None]:
# The id itself carries no signal; 'id_count' is kept because later features
# divide by it.
ccai = ccai.drop(columns=['id'])

## geometry
We can use the `shapely` library to convert the polygon WKT strings to polygons. This library helps visualize the polygons and directly get their area and perimeter.

In [None]:
import shapely.wkt
from shapely.geometry import Polygon

# Render a handful of footprints (chosen by row position) to get a feel for
# the shapes stored in the WKT 'geometry' column.
for row_position in (0, 16, 45, 28935):
    display(shapely.wkt.loads(ccai['geometry'].iloc[row_position]))

In [None]:
# Parse every WKT footprint exactly once and derive all shape features from
# the parsed objects (the original re-parsed each string four times).
footprints = ccai['geometry'].apply(shapely.wkt.loads)

ccai['geometry_area'] = footprints.apply(lambda poly: poly.area)
# .length of a polygon includes interior rings (holes) as well as the exterior.
ccai['geometry_perimeter'] = footprints.apply(lambda poly: poly.length)
# NOTE: despite the column name, this is perimeter divided by area.
ccai['geometry_ratio_area_perimeter'] = ccai['geometry_perimeter'] / ccai['geometry_area']
ccai['geometry_outer_perimeter'] = footprints.apply(lambda poly: poly.exterior.length)
# Total boundary minus the exterior ring = combined length of all holes.
ccai['geometry_inner_perimeter'] = ccai['geometry_perimeter'] - ccai['geometry_outer_perimeter']
ccai['geometry_num_interiors'] = footprints.apply(lambda poly: len(list(poly.interiors)))
# Footprint area shared equally among the rows that carried the same id.
ccai['geometry_area_div_id_counts'] = ccai['geometry_area'] / ccai['id_count']

# The raw 'geometry' column is intentionally kept: later cells render
# footprints from it.

In [None]:
# Area and perimeter distributions, plus their joint relationship.
fig, (area_ax, perimeter_ax, joint_ax) = plt.subplots(nrows=1, ncols=3, figsize=(20, 5))
sns.histplot(data=ccai, x='geometry_area', ax=area_ax)
sns.histplot(data=ccai, x='geometry_perimeter', ax=perimeter_ax)
sns.scatterplot(x='geometry_area', y='geometry_perimeter', data=ccai, ax=joint_ax)
plt.tight_layout()
plt.show()

In [None]:
# Same view as above, but for the footprint area divided by the id count.
fig, (area_ax, shared_area_ax, joint_ax) = plt.subplots(nrows=1, ncols=3, figsize=(20, 5))
sns.histplot(data=ccai, x='geometry_area', ax=area_ax)
sns.histplot(data=ccai, x='geometry_area_div_id_counts', ax=shared_area_ax)
sns.scatterplot(x='geometry_area_div_id_counts', y='geometry_perimeter', data=ccai, ax=joint_ax)
plt.tight_layout()
plt.show()

## address
The `address` could be split into street and house number, but that split is currently commented out; for now we simply drop the column.

In [None]:
# No house-number / street features are derived at the moment, so the raw
# address string is simply removed.
ccai = ccai.drop(columns='address')

## city_name
Let's see how many different cities we have.

In [None]:
# Number of rows per raw city name.
ccai.city_name.value_counts()

There are 1180 unique cities in the dataset. We can see that for some cities we have "ARRONDISSEMENT", which means "district". Let's cluster these to one city name and check again.

In [None]:
# Collapse district-level names: any name containing "ARRONDISSEMENT" is
# reduced to its first space-separated word; other names are kept unchanged.
ccai['city_name_cluster'] = [
    name.split(' ')[0] if 'ARRONDISSEMENT' in name else name
    for name in ccai['city_name']
]
ccai['city_name_cluster'].value_counts()

Now we still have 1138 unique cities.

In [None]:
# Strip the accented capitals that occur in the city names so they can be
# matched against the upper-cased names of the population table.
accent_replacements = {'È': 'E', 'É': 'E', 'Ç': 'C', 'Ô': 'O', 'Â': 'A'}
unaccented = ccai['city_name_cluster']
for accented, plain in accent_replacements.items():
    unaccented = unaccented.str.replace(accented, plain)
ccai['city_name_cluster'] = unaccented

# External population table; its two columns are renamed positionally.
population = pd.read_csv("../input/population-of-cities-in-france/csvData.csv")
population.columns = ['population', 'city_name_cluster']
population['city_name_cluster'] = population['city_name_cluster'].str.upper()

display(population.head())

# Left join: cities without a match keep a missing population.
ccai = ccai.merge(population, how='left', on='city_name_cluster')

According to https://www.investmentmonitor.ai/analysis/largest-cities-france-investment-population, the top 10 largest cities in France are:
1. Paris with a population of 2.19 million (metropolitan population: 12 million)
2. Marseille with a population of 863,000 (metropolitan population: 1.75 million)
3. Lyon with a population of 516,000 (metropolitan population: 2.31 million)
4. Toulouse with a population of 480,000 (metropolitan population: 1.35 million)
5. Nice with a population of 340,000 (metropolitan population: 1 million)
6. Nantes with a population of 309,000 (metropolitan population: 962,000)
7. Montpellier with a population of 285,000 (metropolitan population: 607,000)
8. Strasbourg with a population of 281,000 (metropolitan population: 786,000)
9. Bordeaux with a population of 254,000 (metropolitan population: 1.23 million)
10. Lille with a population of 233,000 (metropolitan population: 1.19 million)

Except for Bordeaux, the top 10 largest French cities can also be found in the dataset.

In [None]:
# Ten city clusters with the highest (mean) merged population.
ccai.groupby('city_name_cluster').population.mean().to_frame().sort_values(by='population', ascending=False).head(10)

There are 313 cities with only one occurrence.

In [None]:
# Distribution of city populations in the external population table.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=population, x='population', ax=ax)
plt.tight_layout()
plt.show()

In [None]:
# Number of rows per city cluster, attached to every row of that cluster.
# Mapping the value counts keeps every row and is safe to re-run, whereas the
# previous default (inner) merge would silently drop rows without a matching
# key and would create suffixed duplicate columns on a second execution.
cluster_sizes = ccai['city_name_cluster'].value_counts()
ccai['city_name_cluster_count'] = ccai['city_name_cluster'].map(cluster_sizes)



In [None]:
# Rows per city versus city population, zoomed in on the dense lower-left region.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x='city_name_cluster_count', y='population', data=ccai, ax=ax)
ax.set(xlim=(0, 500), ylim=(0, 500000))

plt.tight_layout()
plt.show()

In [None]:
from sklearn.linear_model import LinearRegression

# Fit population ~ (rows per city) on the cities whose population is known,
# then use the rounded prediction wherever the population is missing.
known_population = ccai[ccai['population'].notna()]
reg = LinearRegression().fit(
    known_population[['city_name_cluster_count']].values,
    known_population[['population']].values,
)

estimated_population = np.round(
    reg.predict(ccai[['city_name_cluster_count']].values)
).ravel()

population_missing = ccai['population'].isna()
ccai['population'] = np.where(population_missing, estimated_population, ccai['population'])



In [None]:
# Population bin edges; pd.cut with labels=False returns the integer index of
# the (right-inclusive) bin each population falls into.
bins =  [0, 150000, 500000, 1000000, 10000000]
ccai['city_type'] = pd.cut(ccai['population'], bins = bins, labels=False)
#ccai['city_type'] = pd.cut(ccai['population'], bins = bins, labels=[f'city_type_{b}' for b in bins[:-1]])

# Rows per population bin.
ccai['city_type'].value_counts()

## consumption

In [None]:
# Distribution of the prediction target.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=ccai, x='consumption', ax=ax)
plt.tight_layout()
plt.show()

## delivery_points

In [None]:
# Distribution of the number of delivery points per row.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=ccai, x='delivery_points', ax=ax)
plt.tight_layout()
plt.show()

## coords_eobs

In [None]:
# Number of distinct coordinate strings.
ccai['coords_eobs'].nunique()

In [None]:
# Rows per distinct coordinate string.
ccai['coords_eobs'].value_counts()

In [None]:
# 'coords_eobs' is a string: drop its first and last character, split on
# ", ", and read the first part as latitude and the second as longitude.
coordinate_parts = ccai['coords_eobs'].apply(lambda text: text[1:-1].split(', '))
ccai['coords_eobs_lat'] = coordinate_parts.apply(lambda parts: float(parts[0]))
ccai['coords_eobs_long'] = coordinate_parts.apply(lambda parts: float(parts[1]))

In [None]:
pip install basemap

In [None]:
from mpl_toolkits.basemap import Basemap
# Map window bounded by longitude -10..13 and latitude 38..55.
fig = plt.figure(figsize=(18, 18))
m = Basemap(projection='mill',
            lat_0=50, lon_0=3,
            llcrnrlon=-10, llcrnrlat=38,
            urcrnrlon=13, urcrnrlat=55
           )
m.drawcoastlines()
# latlon=True: the inputs are longitude/latitude values, not projected map coordinates.
m.scatter(ccai['coords_eobs_long'].values, ccai['coords_eobs_lat'].values, latlon=True, s=1, c='blue')
plt.show()

In [None]:
# Mean coordinate of all rows belonging to the same city, repeated on every
# row of that city.
city_centroid = ccai.groupby('city_name')[['coords_eobs_lat', 'coords_eobs_long']].transform('mean')
ccai['city_name_lat'] = city_centroid['coords_eobs_lat']
ccai['city_name_long'] = city_centroid['coords_eobs_long']

## qq_dict

In [None]:
import ast

# 'qq_dict' holds a dict literal as text, indexed by the integers 1..12.
# Parse each string once (the original re-parsed it for every one of the 12
# columns) and then expand the entries into columns qq_1 .. qq_12.
qq_parsed = ccai['qq_dict'].apply(ast.literal_eval)
for i in range(1, 13):
    # Bind the loop value as a default argument so each lambda reads its own key.
    ccai[f'qq_{i}'] = qq_parsed.apply(lambda monthly, key=i: monthly[key])
ccai = ccai.drop(columns='qq_dict')

In [None]:
# One histogram per qq_1 .. qq_12 column, all on a common x-range.
fig, axes = plt.subplots(nrows=12, ncols=1, figsize=(8, 40))
for month, month_ax in zip(range(1, 13), axes):
    column = f'qq_{month}'
    sns.histplot(ccai[column], ax=month_ax)
    month_ax.set_xlim([0, 10000])
    month_ax.set_title(column)
plt.tight_layout()
plt.show()

We can see that we have some peaks for the value 0 for all `qq_dict` columns. Let's investigate it a little bit:

In [None]:
# All twelve qq columns for the rows whose qq_1 value is exactly 0.
ccai[ccai.qq_1 == 0][[f'qq_{i}' for i in range(1,13)]]

It looks like for some datapoints we simply do not have any meaningful `qq_dict` values. We will create a new feature `qq_sum` to check if we have such a datapoint.

In [None]:
# Row-wise total of qq_1 .. qq_12; a total of 0 flags rows without usable values.
ccai['qq_sum'] = ccai[[f'qq_{i}' for i in range(1,13)]].sum(axis=1)

In [None]:
# Rows whose twelve values sum to 0 carry no information: mark all twelve as
# missing, then discard the helper column.
all_zero = ccai['qq_sum'] == 0
for month in range(1, 13):
    column = f'qq_{month}'
    ccai[column] = np.where(all_zero, np.nan, ccai[column])
ccai = ccai.drop(columns='qq_sum')

In [None]:
# Same twelve histograms as before, now that all-zero rows are set to missing.
fig, axes = plt.subplots(nrows=12, ncols=1, figsize=(8, 40))
for month, month_ax in zip(range(1, 13), axes):
    column = f'qq_{month}'
    sns.histplot(ccai[column], ax=month_ax)
    month_ax.set_xlim([0, 10000])
    month_ax.set_title(column)
plt.tight_layout()
plt.show()

In [None]:
# Fill missing qq values with the mean of the same column over all rows of
# the same city (values that are present are left untouched).
qq_columns = [f'qq_{i}' for i in range(1, 13)]
city_qq_mean = ccai.groupby('city_name')[qq_columns].transform('mean')
ccai[qq_columns] = ccai[qq_columns].fillna(city_qq_mean)

Let's also create new features:

In [None]:
# Row-wise summary statistics over the twelve qq columns.
qq_all = ccai[[f'qq_{i}' for i in range(1, 13)]]
ccai['qq_mean'] = qq_all.mean(axis=1)
ccai['qq_min'] = qq_all.min(axis=1)
ccai['qq_max'] = qq_all.max(axis=1)
ccai['qq_range'] = ccai['qq_max'] - ccai['qq_min']

# Heating season means the period from October through April.
heating_columns = [f'qq_{i}' for i in [10, 11, 12, 1, 2, 3, 4]]
ccai['qq_heating'] = ccai[heating_columns].mean(axis=1)

## height
We can see that for `height`, we have a peak around the value 0. This seems to be an implausible value since 
> `height`: Building height (ground to lowest roof point)

We will replace 0 values with NaN.

In [None]:
# Raw height distribution (before zero heights are treated as missing).
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(data=ccai, x='height', ax=ax)
plt.show()

In [None]:
# A height of exactly 0 is implausible: treat it as missing.
ccai['height'] = ccai['height'].astype(float).mask(ccai['height'] == 0)

In [None]:
# Height distribution after zero heights were replaced by missing values.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(data=ccai, x='height', ax=ax)
plt.show()

## age
For the column `age`, we can see in the histogram that it is of data type object and represents a date in the form of a string. We will convert it to a number.

In [None]:
# Counts per raw 'age' value (still stored as date-like strings here).
fig, ax = plt.subplots(figsize=(20, 5))
sns.countplot(data=ccai, x='age', ax=ax)
plt.tight_layout()
plt.show()

In [None]:
def _leading_year(value):
    """Return the integer before the first '-' of a date string; pass NaN through."""
    if value != value:  # NaN is the only value that differs from itself
        return value
    return int(value.split('-')[0])


ccai['age'] = ccai['age'].apply(_leading_year)

In [None]:
# Distribution of the numeric year extracted from 'age'.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=ccai, x='age', ax=ax)
plt.tight_layout()
plt.show()

## floors

In [None]:
# Counts per number of floors as delivered.
fig, ax = plt.subplots(figsize=(15, 6))
sns.countplot(data=ccai, x='floors', ax=ax)
plt.show()

0 floors seems to be an invalid value. Let's change it to 1 where the number of floors is 0.

In [None]:
# Replace a floor count of 0 by 1, then show the counts again.
ccai['floors'] = ccai['floors'].mask(ccai['floors'] == 0, 1)

fig, ax = plt.subplots(figsize=(15, 6))
sns.countplot(data=ccai, x='floors', ax=ax)
plt.show()

## alt_prec
> `alt_prec`: Altimetric precision of the building height

In [None]:
# Counts per raw altimetric-precision value (the 9999 placeholder is still present here).
sns.countplot(x=ccai['alt_prec'])

In [None]:
# 9999 is replaced by a missing value.
ccai['alt_prec'] = ccai['alt_prec'].astype(float).mask(ccai['alt_prec'] == 9999)

In [None]:
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 5))
sns.kdeplot(ccai[ccai.alt_prec == 1]['height'], label='1')
sns.kdeplot(ccai[ccai.alt_prec == 1.5]['height'], label='1.5')
sns.kdeplot(ccai[ccai.alt_prec == 2.5]['height'], label='2.5')
sns.kdeplot(ccai[ccai.alt_prec.isna()]['height'], label='9999')
plt.legend()
plt.show()

In [None]:
#ccai['height_plus_alt_prec'] = ccai['height'] + ccai['alt_prec'] 
#ccai['height_minus_alt_prec'] = ccai['height'] - ccai['alt_prec'] 

In [None]:
# Counts per altimetric-precision value after 9999 was replaced by NaN.
sns.countplot(x=ccai['alt_prec'])

## wall_mat and roof_mat
`wall_mat` and `roof_mat` seem to be the materials in an encoded format. We will convert them to an object format since materials don't have a numerical order.

In [None]:
def _code_as_text(value):
    """Turn a numeric material code into its integer string; pass NaN through."""
    if value != value:  # NaN
        return value
    return str(int(value))


for material_column in ('wall_mat', 'roof_mat'):
    ccai[material_column] = ccai[material_column].apply(_code_as_text)

In [None]:
# Count plot per material code, with the bars ordered numerically.
# The order is built from the non-missing codes only. The previous version
# sorted the unique values and blindly dropped the last one, assuming it was
# NaN, which removes the largest real code whenever a column has no missing
# values.
for material_column in ('wall_mat', 'roof_mat'):
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(15, 6))
    order = sorted(ccai[material_column].dropna().unique(), key=int)
    sns.countplot(data=ccai, x=material_column, order=order, ax=ax)
    plt.show()

We can see that `wall_mat` and `roof_mat` have peaks at values that are multiples of 10. This suggests that `wall_mat` and `roof_mat` consist of a main category and a subcategory. Let's split those and create new features:

In [None]:
def _tens_digit_group(code):
    """Integer part of code / 10 as a string; pass NaN through."""
    if code != code:  # NaN
        return code
    return str(int(float(code) / 10))


for material_column in ('wall_mat', 'roof_mat'):
    ccai[f'{material_column}_main'] = ccai[material_column].apply(_tens_digit_group)

In [None]:
# Count plot per main material category, bars ordered numerically.
# Only non-missing categories are used for the order; the previous version
# dropped the last sorted value on the assumption that it was NaN, which loses
# the largest real category when nothing is missing.
for main_column in ('wall_mat_main', 'roof_mat_main'):
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 6))
    order = sorted(ccai[main_column].dropna().unique(), key=int)
    sns.countplot(data=ccai, x=main_column, order=order, ax=ax)
    plt.show()

In [None]:
# Combined roof/wall labels of the form "<roof>_<wall>". A missing component
# is rendered as the text 'nan', exactly as str() prints it.
ccai['mat'] = [
    f'{roof}_{wall}' for roof, wall in zip(ccai['roof_mat'], ccai['wall_mat'])
]
ccai['mat_main'] = [
    f'{roof}_{wall}' for roof, wall in zip(ccai['roof_mat_main'], ccai['wall_mat_main'])
]

We now have looked at each column in the original dataset and modified it to a format we can work with or created new features from it.

# Missing Values
Next, we will check the percentage of missing values.

In [None]:
# Percentage of missing entries per column; only columns with at least one
# missing value are shown.
missing_values = (ccai.isna().sum(axis=0)/len(ccai)*100)
missing_values = missing_values[missing_values>0]
missing_values

## Filling Missing Values for Height and Floors

We can see that there is a strong correlation between the height of a building and the number of floors.

In [None]:
# Floors versus height, both axes limited to 0..70.
fig, ax = plt.subplots(figsize=(6, 6))
sns.scatterplot(x='floors', y='height', data=ccai, ax=ax)
ax.set(xlim=(0, 70), ylim=(0, 70))
plt.show()

We will use a simple linear regression model to fill the missing values for height and floors.

In [None]:
# Flag rows where either value is missing so the imputed points can be
# highlighted in the next plot.
ccai['missing_vals'] = ccai.floors.isna() | ccai.height.isna()

from sklearn.linear_model import LinearRegression

# Both regressions are fitted on the rows where height and floors are observed.
observed = ccai[ccai.height.notna() & ccai.floors.notna()]

# --- floors from height ---------------------------------------------------
floors_model = LinearRegression().fit(observed[['height']].values, observed[['floors']].values)
needs_floors = ccai['floors'].isna() & ccai['height'].notna()
if needs_floors.any():
    # Predict only for the rows that are actually filled (no -999 sentinel
    # inputs needed). Rounded predictions are floored at 1 because this
    # notebook treats 0 floors as invalid and replaces it by 1.
    predicted_floors = np.round(
        floors_model.predict(ccai.loc[needs_floors, ['height']].values)
    ).ravel()
    ccai.loc[needs_floors, 'floors'] = np.maximum(predicted_floors, 1)

# --- height from floors ---------------------------------------------------
height_model = LinearRegression().fit(observed[['floors']].values, observed[['height']].values)
needs_height = ccai['height'].isna() & ccai['floors'].notna()
if needs_height.any():
    ccai.loc[needs_height, 'height'] = np.round(
        height_model.predict(ccai.loc[needs_height, ['floors']].values)
    ).ravel()

In [None]:
# Same scatter as before the imputation, coloured by whether the row had a
# missing height or floor count; the helper flag is removed afterwards.
fig, ax = plt.subplots(figsize=(6, 6))
sns.scatterplot(x='floors', y='height', hue='missing_vals', data=ccai, ax=ax)
ax.set(xlim=(0, 70), ylim=(0, 70))
plt.show()

ccai = ccai.drop(columns='missing_vals')

In [None]:
# Re-check the percentage of missing entries per column after the imputation.
missing_values = (ccai.isna().sum(axis=0)/len(ccai)*100)
missing_values = missing_values[missing_values>0]
missing_values

# Feature Engineering

In [None]:
# Footprint area multiplied by the number of floors.
ccai['area_times_floor'] = ccai['geometry_area'] * ccai['floors']
# The same quantity divided by the number of rows that shared the building id.
ccai['area_times_floor_div_id_counts'] = ccai['area_times_floor'] / ccai['id_count']

# Footprint area multiplied by building height.
ccai['volume'] = ccai['geometry_area'] * ccai['height']

In [None]:
#import itertools
# Quartile index of the footprint area (stored as object) and index of the
# floor-count bin defined by the edges below.
ccai['geometry_area_bins'] = pd.qcut(ccai['geometry_area'], q = 4, labels=False).astype(object)
ccai['floors_bins'] = pd.cut(ccai['floors'], bins = [0, 1, 2, 4, 6, 100], labels=False)

#bins =  [0, 1700, 1800, 1900, 1920, 1940, 1960, 1980, 2000,]
#ccai['age_bins'] = pd.cut(ccai['age'], bins = bins, labels=[f'age_{b}' for b in bins[:-1]])

# Bin columns that are concatenated into the building class label.
variables = ['floors_bins', 'geometry_area_bins',]

# The string literal below is disabled code kept for reference; it is never
# executed (and refers to a name `columns` that this cell does not define).
"""
i = 0
for L in range(2, len(columns)+1):
    for subset in itertools.combinations(columns, L):
        variables = list(subset)
        #variables = [item for sublist in variables for item in sublist]
        print(f"comb_{i}_{'_'.join(variables)}")
        ccai[f"comb_{i}_{'_'.join(variables)}"] = ccai[variables].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)
        i = i + 1
"""
# Label = the bin indices joined with '_', one label per row.
ccai["building_class"] = ccai[variables].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)

# A label containing 'nan' means at least one bin was missing: set the whole label to NaN.
ccai["building_class"] = np.where(ccai["building_class"].str.contains('nan'), np.nan, ccai["building_class"] )
# Drop bin columns
bin_cols = [#'age_bins',
            'geometry_area_bins', 'floors_bins']

ccai.drop(bin_cols, axis = 1, inplace = True)


In [None]:
# Number of distinct building classes.
ccai.building_class.nunique()

In [None]:
# Rows per building class.
ccai.building_class.value_counts()

In [None]:
# Show up to five example footprints for each of the (up to) five most
# frequent building classes. The value counts are computed once, and slicing
# / head() are used so the cell also works when there are fewer than five
# classes or fewer than five rows in a class (the original indexed
# positions 0..4 unconditionally).
most_common_classes = ccai['building_class'].value_counts().index[:5]
for c in most_common_classes:
    print(c)

    for geom in ccai.loc[ccai['building_class'] == c, 'geometry'].head(5):
        display(shapely.wkt.loads(geom))


# Relationships

In [None]:
# Pairwise correlation of the numeric columns.
# The frame still contains text columns (city names, material codes, WKT
# geometry, ...). Newer pandas versions no longer skip those silently inside
# DataFrame.corr and raise instead, so the numeric/boolean columns are
# selected explicitly; this works the same way on every pandas version.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(20, 20))
numeric_columns = ccai.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_columns.corr(), annot=True, vmin=-1, vmax=1, fmt = '.1f', cmap='coolwarm', ax=ax)
plt.show()

In [None]:
# Rows per (roof main category, wall main category) pair as a matrix:
# roof categories as rows, wall categories as columns.
pair_counts = (
    ccai.groupby(['roof_mat_main', 'wall_mat_main'])['roof_mat']
    .count()
    .unstack('wall_mat_main')
)

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(pair_counts, annot=True, fmt='5.0f', vmin=0, vmax=3000, cmap='Blues', ax=ax)
plt.show()

In [None]:
# Mean consumption per (roof main category, wall main category) pair as a
# matrix: roof categories as rows, wall categories as columns.
pair_consumption = (
    ccai.groupby(['roof_mat_main', 'wall_mat_main'])['consumption']
    .mean()
    .unstack('wall_mat_main')
)

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(pair_consumption, annot=True, fmt='5.0f', vmin=0, vmax=200, cmap='Blues', ax=ax)
plt.show()

# Geographical

In [None]:
# Names of the ten largest cities listed earlier in this notebook (upper case,
# without accents, matching the format of 'city_name_cluster').
top_10_cities = ['PARIS', 'MARSEILLE', 'LYON', 'TOULOUSE', 'NICE', 'NANTES', 'MONTPELLIER', 'STRASBOURG', 'BORDEAUX', 'LILLE']

# Mean coordinates and population per top-10 city that is present in the data;
# the result is indexed by city name and reused by the map plots below.
cities = ccai[ccai.city_name_cluster.isin(top_10_cities)].groupby('city_name_cluster')[['city_name_lat', 'city_name_long', 'population']].mean()
cities.sort_values(by='population', ascending=False)

In [None]:
# 2x2 grid of maps: every row of the data is drawn at its coordinates and
# coloured by one qq statistic; the large cities are overlaid in black and
# labelled with their name.
# `cities` is indexed by city name, so its rows are walked with iterrows()
# instead of looking values up by integer position through Series[...], which
# is deprecated for non-integer indexes.
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(20, 20))
qq_statistics = ['qq_mean', 'qq_range', 'qq_min', 'qq_max']

for panel, statistic in zip(ax.flat, qq_statistics):
    sns.scatterplot(data=ccai, x='coords_eobs_long', y='coords_eobs_lat', hue=statistic, palette='coolwarm', ax=panel)
    sns.scatterplot(data=cities, x='city_name_long', y='city_name_lat', color='black', ax=panel)

    for city_label, city in cities.iterrows():
        # Shift the label slightly to the right of the marker.
        panel.text(city['city_name_long'] + 0.2, city['city_name_lat'], city_label,
                   horizontalalignment='left', size='medium', color='black')

plt.show()

In [None]:
"""
from math import radians, cos, sin, asin, sqrt
def haversine(lon1, lat1, lon2, lat2):
    
    #Function copied from https://stackoverflow.com/questions/4913349/haversine-formula-in-python-bearing-and-distance-between-two-gps-points
    #Calculate the great circle distance between two points 
    #on the earth (specified in decimal degrees)
    

    R = 6372.8 # Earth radius in kilometers

    dLat = radians(lat2 - lat1)
    dLon = radians(lon2 - lon1)
    lat1 = radians(lat1)
    lat2 = radians(lat2)

    a = sin(dLat/2)**2 + cos(lat1)*cos(lat2)*sin(dLon/2)**2
    c = 2*asin(sqrt(a))

    return round(R * c, 0)


for city in cities.index:

    distance_to_large_city = ccai.groupby('city_name')[['city_name_lat', 'city_name_long']].mean().apply(lambda x: haversine(x[1], x[0], cities.loc[city].city_name_long, cities.loc[city].city_name_lat), axis=1).to_frame()
    distance_to_large_city.columns = [f'distance_to_{city}']
    distance_to_large_city
    ccai = ccai.merge(distance_to_large_city, on ='city_name', how='left')
    
    temp = ccai.groupby('city_name')[['city_name_long', 'city_name_lat', f'distance_to_{city}']].mean()

    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 10))
    sns.scatterplot(data=temp, x='city_name_long', y='city_name_lat', hue=f'distance_to_{city}', palette='Blues_r')

    sns.scatterplot(data=cities, x='city_name_long', y='city_name_lat', color='black')

    for line in range(0,cities.shape[0]):
         plt.text(cities.city_name_long[line]+0.2, cities.city_name_lat[line], cities.index[line], horizontalalignment='left', size='medium', color='black')#, weight='semibold')

    plt.show()
"""

In [None]:
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler


# Cluster the rows on their standardised (longitude, latitude) coordinates.
X = StandardScaler().fit_transform(ccai[['coords_eobs_long', 'coords_eobs_lat']].values)

db = DBSCAN(eps=0.15, min_samples=50).fit(X)
labels = db.labels_

# Boolean mask of the points DBSCAN reports as core samples.
core_samples_mask = np.zeros_like(labels, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

# The label -1 marks noise and is not counted as a cluster.
unique_labels = set(labels)
n_clusters_ = len(unique_labels) - int(-1 in unique_labels)
n_noise_ = int((labels == -1).sum())

print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)

# One colour per label; noise is always drawn in black.
colors = [plt.cm.Spectral(position) for position in np.linspace(0, 1, len(unique_labels))]
fig, ax = plt.subplots(figsize=(10, 10))

for label, colour in zip(unique_labels, colors):
    face_colour = (0, 0, 0, 1) if label == -1 else tuple(colour)
    in_cluster = labels == label

    # Core samples are drawn large, the remaining members small.
    for selection, marker_size in ((core_samples_mask, 14), (~core_samples_mask, 6)):
        points = X[in_cluster & selection]
        plt.plot(
            points[:, 0],
            points[:, 1],
            "o",
            markerfacecolor=face_colour,
            markeredgecolor="k",
            markersize=marker_size,
        )

plt.title("Estimated number of clusters: %d" % n_clusters_)
plt.show()

# Cluster label as a string so it is treated as a categorical feature later.
ccai['neighbourhood_id'] = labels.astype(str)

# Predict Consumption

In [None]:
target = 'consumption'
id_col = 'id'

from sklearn.model_selection import train_test_split

# Hold out 10% of the rows. The shared SEED constant is used instead of a
# second hard-coded 42 so the whole notebook is controlled by one seed.
train, test = train_test_split(ccai, test_size=0.1, random_state=SEED)

# Re-bind instead of mutating in place: the split outputs are selections of
# `ccai`, and in-place edits on them trigger SettingWithCopyWarning.
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

# Keep the hold-out targets aside and remove them from the test features.
y_test = test[target]
test = test.drop(columns=target)

In [None]:
# Raw text columns that are neither encoded nor used as model features.
exclude_cols = ['geometry', 'coords_eobs'] 

In [None]:
# Text (object dtype) columns are replaced by statistics of the target.
categorical_features = [
    c for c in ccai.columns
    if ccai[c].dtype == object and c not in exclude_cols
]
print(categorical_features)

# Build the encoded table in its own frame. Previously `ccai` itself was
# overwritten here, which removed the text columns (e.g. 'city_name') and
# blanked the target of the test rows, so the analysis cells further down
# that still read `ccai` could no longer run.
encoded = pd.concat([train, test], axis=0).reset_index(drop=True)

for feature in categorical_features:
    # The test rows have no target after the concat, so count / mean / std
    # are computed from the training rows only.
    stats = encoded.groupby(feature)[target].agg(['count', 'mean', 'std'])
    stats.columns = [f'{feature}_{c}' for c in stats.columns]
    if feature == 'neighbourhood_id':
        # "-1" is the DBSCAN noise label, not a real neighbourhood.
        stats = stats[stats.index != "-1"].copy()
    # Categories with at most one target value get no mean / std.
    too_rare = stats[f'{feature}_count'] <= 1
    stats.loc[too_rare, [f'{feature}_mean', f'{feature}_std']] = np.nan
    encoded = (
        encoded
        .merge(stats[[f'{feature}_mean', f'{feature}_std', f'{feature}_count']], on=feature, how='left')
        .drop(columns=feature)
    )

# Split back by position: the first len(train) rows are the training rows.
n_train = len(train)
train = encoded.iloc[:n_train]
test = encoded.iloc[n_train:]

In [None]:

# Every column except the target, the id and the excluded raw columns.
non_feature_columns = {target, id_col, *exclude_cols}
features = [column for column in train.columns if column not in non_feature_columns]
display(features)

In [None]:
# Training features / target and a buffer for the out-of-fold predictions.
X = train[features]
y = train[target]
y_oof_pred = np.zeros(len(y))

# Hold-out features as a plain array and a buffer for the test predictions,
# which the cross-validation loop accumulates into.
X_test = test[features].values
y_test_pred = np.zeros(len(X_test))


In [None]:
# Dtypes and non-null counts of the training feature matrix.
X.info()

In [None]:
# Percentage of missing entries per training feature. The share is taken
# relative to the rows of X itself; dividing by len(ccai), which also counts
# the hold-out rows, understated every percentage.
missing_values = X.isna().mean() * 100
missing_values = missing_values[missing_values > 0]
missing_values

In [None]:

# Hyper-parameters are the same for every fold, so they are defined once.
params_lgb = {
    'objective' : 'regression',
    'metric': 'rmse',
    'force_col_wise': True,
    'seed' : SEED,
    'num_iterations' : 15000,
    'early_stopping_rounds' : 250,
    'learning_rate' : 0.05,
    'max_depth' : 8,
    'num_leaves' : 32,
    'feature_fraction': 0.9,
    'lambda_l1' : 0.1,
    'lambda_l2' : 0.1,
    'verbose' : -1
}

num_folds = 5
kf = KFold(n_splits = num_folds)

# Reset the accumulators so that re-running this cell does not add a second
# set of fold predictions on top of the previous run.
error = 0
y_oof_pred = np.zeros(len(y))
y_test_pred = np.zeros(len(X_test))

for i, (train_index, val_index) in enumerate(kf.split(X, y)):
    print(f"Fold {i+1}")

    # KFold yields positions, hence positional (.iloc) selection.
    X_train = X.iloc[train_index].values
    X_val = X.iloc[val_index].values
    y_train = y.iloc[train_index].values
    y_val = y.iloc[val_index].values

    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_val, y_val)

    # Progress logging goes through the log_evaluation callback; the
    # `verbose_eval` argument is no longer accepted by recent LightGBM releases.
    model = lgb.train(params_lgb,
                      lgb_train,
                      valid_sets = [lgb_train, lgb_eval],
                      callbacks = [lgb.log_evaluation(period=500)],
                     )

    y_val_pred = model.predict(X_val)
    # Sum of the per-fold test predictions; averaged after the loop.
    y_test_pred += model.predict(X_test)

    rmse_fold = np.sqrt(mean_squared_error(y_val, y_val_pred))
    error += rmse_fold
    y_oof_pred[val_index] = y_val_pred


# Average of the per-fold RMSEs versus the RMSE over all out-of-fold predictions.
print(f"Mean RMSE: {error/num_folds}")
print(f"OOF RMSE: {np.sqrt(mean_squared_error(y, y_oof_pred))}")


# Average the fold models' predictions on the hold-out set and compare the
# predicted distribution with the true train / test target distributions.
y_test_pred = y_test_pred / num_folds
sns.kdeplot(train[target], label='y_train')
sns.kdeplot(y_test, label='y_test')
sns.kdeplot(y_test_pred, label='y_test_pred')
plt.legend()
plt.show()
print(f"Test RMSE: {np.sqrt(mean_squared_error(y_test, y_test_pred))}")


> `importance_type` (str, optional (default="split")) – How the importance is calculated. If “split”, result contains numbers of times the feature is used in a model. If “gain”, result contains total gains of splits which use the feature.

In [None]:
# Adapted from https://www.kaggle.com/usharengaraju/wids2022-lgbm-starter-w-b
# Split-based importance of the model trained in the last fold: the 30 most
# important features that were used at least once.
split_importance = model.feature_importance(importance_type='split')
ranked_pairs = sorted(zip(split_importance, features), reverse=True)
feature_imp = pd.DataFrame(ranked_pairs, columns=['Value', 'Feature'])
feature_imp = feature_imp[feature_imp['Value'] != 0]
top_features = feature_imp.sort_values(by="Value", ascending=False).head(30)

plt.figure(figsize=(16, 16))
sns.barplot(data=top_features, x="Value", y="Feature", palette='Blues')
plt.title('LightGBM Feature Importance')
plt.tight_layout()
plt.show()

In [None]:
# Adapted from https://www.kaggle.com/usharengaraju/wids2022-lgbm-starter-w-b
# Gain-based importance of the model trained in the last fold: the 30
# features with the largest non-zero total gain.
gain_importance = model.feature_importance(importance_type='gain')
ranked_pairs = sorted(zip(gain_importance, features), reverse=True)
feature_imp = pd.DataFrame(ranked_pairs, columns=['Value', 'Feature'])
feature_imp = feature_imp[feature_imp['Value'] != 0]
top_features = feature_imp.sort_values(by="Value", ascending=False).head(30)

plt.figure(figsize=(16, 16))
sns.barplot(data=top_features, x="Value", y="Feature", palette='Blues')
plt.title('LightGBM Feature Importance')
plt.tight_layout()
plt.show()

In [None]:
# Create Explainer and get shap_values
# `model` is the booster left over from the last cross-validation fold; the
# values are computed for the full training feature frame X.

explainer = shap.TreeExplainer(model)
shap_values = explainer(X)
# Bar chart of the SHAP values, limited to 50 features.
shap.plots.bar(shap_values, max_display=50)


# Analysis

In [None]:
"""for col in [ 'delivery_points', #'type', 
            'age', 'height', 
       'floors',  'geometry_area', 'wall_mat', 'roof_mat',
       'wall_mat_main', 'roof_mat_main', 'qq_mean', 'qq_min', 'qq_max', 
           'area_per_floor']:
    if (ccai[col].dtype == 'object') | (ccai[col].nunique() <30):
        fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 6))
        sns.boxplot(data=ccai, x=col, y='consumption')
        plt.show()
    else:
        fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 6))
        sns.scatterplot(data=ccai, x=col, y='consumption',  marker='.')
        plt.show()"""

In [None]:
# Consumption distribution for each number of delivery points.
fig, ax = plt.subplots(figsize=(20, 6))
sns.boxplot(x='delivery_points', y='consumption', data=ccai, ax=ax)
plt.show()

In [None]:
# Mean consumption per city drawn at the city's mean coordinates, with the
# large cities overlaid in black and labelled with their name.
city_consumption = ccai.groupby('city_name')[['city_name_long', 'city_name_lat', 'consumption']].mean()

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 10))
sns.scatterplot(data=city_consumption, x='city_name_long', y='city_name_lat', hue='consumption', palette='coolwarm', ax=ax)

sns.scatterplot(data=cities, x='city_name_long', y='city_name_lat', color='black', ax=ax)

# `cities` is indexed by city name, so iterate over its rows instead of
# looking values up by integer position through Series[...], which is
# deprecated for non-integer indexes.
for city_label, city in cities.iterrows():
    ax.text(city['city_name_long'] + 0.2, city['city_name_lat'], city_label,
            horizontalalignment='left', size='medium', color='black')

plt.show()

In [None]:
# Bin edges for the construction year; each bin is labelled 'age_<left edge>'.
bins =  [0, 1700, 1800, 1850, 1860, 1870, 1880, 1890, 1900, 1910, 1920, 1930, 1940, 1950, 1960, 1970, 1980, 1990, 2000, 2010, 2020]
ccai['age_bins'] = pd.cut(ccai['age'], bins = bins, labels=[f'age_{b}' for b in bins[:-1]])#.astype(object)

# Left: rows per age bin. Right: consumption distribution per age bin.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(30, 6))
sns.countplot(x=ccai.age_bins, ax=ax[0])
sns.boxplot(data=ccai, x='age_bins', y='consumption', ax=ax[1])
plt.tight_layout()
plt.show()

# WORK IN PROGRESS