In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for file_name in names_in_folder:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


# **Sex and Age vs Thermal Comfort**
In this notebook we will be exploring how different backgrounds can affect how people are thermally affected.
There are four main kinds of building used in the studies for the ASHRAE thermal data set
1. Schools
    - young children to college age children
2. Office Buildings
    - adults
3. Nursing homes
    - elderly people 
4. Family homes
    - all ages (will need some extra work)
    
 We will be focusing on nursing homes and family homes to see if there is any difference in how age and sex determine thermal sensation. If there is a difference, we will then explore which populations the buildings in these data sets cater to most heavily.

# **Cleaning Data**

Before exploring the data, let's make it a little simpler to parse through. I will be removing all columns in Fahrenheit so we are left with Celsius.

In [None]:
# Read the ASHRAE CSV once; `raw_data` stays untouched so later cells can start over from it.
raw_data = pd.read_csv('/kaggle/input/ashrae-global-thermal-comfort-database-ii/ashrae_db2.01.csv')

# Columns to discard: the data contributor plus every imperial-unit (F / fpm) measurement.
drop_col = [
    'Data contributor',
    'Operative temperature (F)',
    'Radiant temperature (F)',
    'Globe temperature (F)',
    'Outdoor monthly air temperature (F)',
    'Velocity_l (fpm)',
    'Velocity_m (fpm)',
    'Velocity_h (fpm)',
    'Tg_l (F)',
    'Tg_m (F)',
    'Tg_h (F)',
    'Ta_l (F)',
    'Ta_m (F)',
    'Ta_h (F)',
    'Air temperature (F)',
    'Air velocity (fpm)',
]

# Work on a copy so the drop never touches `raw_data`.
data = raw_data.copy().drop(columns=drop_col)
data.head()

The following code renames certain columns to make them clearer.

In [None]:
# Give the terse source column names more descriptive labels.
# DataFrame.rename silently skips keys that are not present, so a misspelled key is a no-op.
column_renames = {
    'PMV': 'Predicted Mean Vote',
    'PPD': 'Predicted Percentage Disatisfied',
    'SET': 'Standard Effective Temp',
    # NOTE(review): later cells still select a column named 'Clo', so this key appears not to
    # match anything. Left unchanged because correcting it would rename a column that
    # downstream cells rely on.
    'CLO': 'Clothing Insulation',
    'Ta_h (C)': 'tempfloor_high (C)',
    'Ta_m (C)': 'tempfloor_med (C)',
    'Ta_l (C)': 'tempfloor_low (C)',
    'Tg_h (C)': 'globetemp_high (C)',
    'Tg_m (C)': 'globetemp_med (C)',
    'Tg_l (C)': 'globetemp_low (C)',
    # The imperial velocity columns dropped when the data was loaded are spelled
    # 'Velocity_* (fpm)' with a capital V, so the metric ones presumably are too.
    # Both spellings are mapped so the rename applies whichever one the file uses.
    'Velocity_h (m/s)': 'velocity_high (m/s)',
    'Velocity_m (m/s)': 'velocity_med (m/s)',
    'Velocity_l (m/s)': 'velocity_low (m/s)',
    'velocity_h (m/s)': 'velocity_high (m/s)',
    'velocity_m (m/s)': 'velocity_med (m/s)',
    'velocity_l (m/s)': 'velocity_low (m/s)',
    # 'startegy' is the spelling used by the source column names; it must stay as-is to match.
    'Cooling startegy_building level': 'cooling_strategy_building',
    'Cooling startegy_operation mode for MM buildings': 'cooling_strategy_for_mm_buildings',
    'Building type': 'Building_type',
}
data = data.rename(columns=column_renames)
data.columns

# Distribution of variables
Now that we have less data to work with, it's good to get a sense right off the bat on how building type is distributed. 

In [None]:
import seaborn as sns

# Switch on seaborn's default theme, then chart how many records each building type has.
sns.set()
building_type_counts = data['Building_type'].value_counts()
building_type_counts.plot.barh(figsize=(20, 6))

Multifamily Housing and Senior centers make up a small percentage of these buildings, so it's good to expect a small dataframe when we separate them later.
We should also directly see the sex distribution to check if it aligns with building types.

In [None]:
import seaborn as sns

# Same horizontal bar chart as for building type, this time counting records per value of 'Sex'.
sns.set()
sex_counts = data['Sex'].value_counts()
sex_counts.plot.barh(figsize=(30, 10))

There are more data from men than women. 

# Handling Missing Values

Now we are going to start cleaning the data, and making two different data frames, one about Senior Centers, and one about Multifamily housing. We will be cleaning those data in slightly different ways to try and make the most comprehensive, but complete data frames for both. 

In [None]:
import missingno as msno

# Missing-value matrix for the numeric columns of the full data set.
# The trailing semicolon hides the text repr of the returned object.
all_numeric = data.select_dtypes(include='number')
msno.matrix(all_numeric);

Lots of numerical data is missing! Let's check the categorical data.

In [None]:
# Missing-value matrix for the object-dtype (categorical) columns of the full data set.
all_objects = data.select_dtypes(include='O')
msno.matrix(all_objects);

# Segmenting missing data based off of type of building 
I will be focusing on Retirement Facilities, and Multifamily homes 

In [None]:
# Boolean row masks for the two building types this notebook studies.
is_senior_center = data['Building_type'] == 'Senior center'
is_multifamily = data['Building_type'] == 'Multifamily housing'

data_retirement = data[is_senior_center]
data_family = data[is_multifamily]

In [None]:
# Missing-value matrix: numeric columns of the senior-center subset.
retirement_numeric = data_retirement.select_dtypes(include='number')
msno.matrix(retirement_numeric)

In [None]:
# Missing-value matrix: object-dtype columns of the senior-center subset.
retirement_objects = data_retirement.select_dtypes(include='O')
msno.matrix(retirement_objects)

In [None]:
# Missing-value matrix: numeric columns of the multifamily-housing subset.
family_numeric = data_family.select_dtypes(include='number')
msno.matrix(family_numeric)

In [None]:
# Missing-value matrix: object-dtype columns of the multifamily-housing subset.
family_objects = data_family.select_dtypes(include='O')
msno.matrix(family_objects)

In [None]:
# NOTE(review): this cell repeats the previous one (object-dtype columns of the
# multifamily-housing subset) and draws the same figure a second time.
family_objects_again = data_family.select_dtypes(include='O')
msno.matrix(family_objects_again)

So it appears that large swaths of data are missing, more so in the family homes data set than in the retirement data set. For the retirement data we drop the columns we do not need; since the family data has many columns with nothing in them, there it's easier to keep what we want than to drop.

In [None]:
# Columns removed from the senior-center subset before its incomplete rows are dropped.
# Not to be confused with `drop_col` (singular), the imperial-unit list used when loading.
drop_cols = [
    # acceptability votes
    'Thermal sensation acceptability', 'Air movement acceptability',
    # activity history
    'activity_10', 'activity_20', 'activity_30', 'activity_60',
    # globe temperature columns (already renamed)
    'globetemp_high (C)', 'globetemp_med (C)', 'globetemp_low (C)',
    # subject body measurements ('«' is part of the source column name)
    'Subject«s height (cm)', 'Subject«s weight (kg)',
    # room controls
    'Blind (curtain)', 'Door',
    # air movement / humidity votes
    'Air movement preference', 'Humidity preference', 'Humidity sensation',
    # provenance
    'Publication (Citation)', 'Database',
]
            

In [None]:
# Remove the listed columns, then discard every row that still has a missing value.
# `data_Retirement` (capital R) is the cleaned frame; `data_retirement` is the unfiltered subset.
data_Retirement = data_retirement.drop(columns=drop_cols).dropna()

# Re-draw the missing-value matrix for the numeric columns of the cleaned frame.
retirement_clean_numeric = data_Retirement.select_dtypes(include='number')
msno.matrix(retirement_clean_numeric)

In [None]:
# Missing-value matrix: object-dtype columns of the cleaned senior-center frame.
retirement_clean_objects = data_Retirement.select_dtypes(include='O')
msno.matrix(retirement_clean_objects)

In [None]:
# Columns retained for the multifamily-housing subset; the order here is the
# column order of the resulting frame.
keep_col_1 = [
    # survey year, sensation vote, clothing and metabolic rate
    'Year', 'Thermal sensation', 'Clo', 'Met',
    # indoor and outdoor measurements
    'Air temperature (C)', 'Relative humidity (%)', 'Air velocity (m/s)',
    'Outdoor monthly air temperature (C)',
    # when and where
    'Season', 'Climate', 'City', 'Country',
    # building description
    'Building_type', 'cooling_strategy_building',
    # remaining votes and subject sex
    'Thermal preference', 'Thermal sensation acceptability', 'Sex',
]

In [None]:
# Keep only the selected columns, then discard every row that still has a missing value.
# `data_Family` (capital F) is the cleaned frame; `data_family` is the unfiltered subset.
data_Family = data_family[keep_col_1].dropna()

# Re-draw the missing-value matrix for the numeric columns of the cleaned frame.
family_clean_numeric = data_Family.select_dtypes(include='number')
msno.matrix(family_clean_numeric)

In [None]:
# Missing-value matrix: object-dtype columns of the cleaned multifamily-housing frame.
family_clean_objects = data_Family.select_dtypes(include='O')
msno.matrix(family_clean_objects)

# Categorical Variables

Here we will be changing categorical variables into something we can use as numerical variables. This works by splitting each categorical variable into its categories and assigning 0 for false and 1 for true.

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Shared one-hot encoder, applied below to each column with categorical data.
# It returns a dense array, and handle_unknown='ignore' keeps it from raising on
# categories that were not seen during fit.
#
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output`, and the old
# keyword was removed in 1.4. Try the current spelling first and fall back to the
# old one so the cell runs on both old and new installations.
try:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    # Older scikit-learn: `sparse_output` is not a known constructor argument.
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

In [None]:
# Categorical columns of the cleaned senior-center frame that get one-hot encoded.
object_cols_retirement = [
    'Season',
    'Koppen climate classification',
    'Climate',
    'City',
    'Country',
    'Building_type',
    'cooling_strategy_building',
    'Heating strategy_building level',
    'Sex',
    'Thermal preference',
    'Thermal comfort',
]


In [None]:
# One-hot encode the categorical columns of the cleaned senior-center frame.
encoded_retirement = OH_encoder.fit_transform(data_Retirement[object_cols_retirement])

# `get_feature_names` was removed in scikit-learn 1.2; `get_feature_names_out`
# (available since 1.0) replaces it. Use whichever this installation provides.
if hasattr(OH_encoder, 'get_feature_names_out'):
    column_name_retirement = OH_encoder.get_feature_names_out(object_cols_retirement)
else:
    column_name_retirement = OH_encoder.get_feature_names(object_cols_retirement)

# Reuse the source frame's index so the encoded columns line up row-for-row when
# they are concatenated back onto the remaining columns.
OH_cols_retirement = pd.DataFrame(
    encoded_retirement,
    index=data_Retirement.index,
    columns=column_name_retirement,
)

In [None]:
# Display the one-hot encoded categorical columns for the senior-center frame.
OH_cols_retirement

In [None]:
# Replace the original categorical columns with their one-hot encoded versions:
# take everything except those columns, then append the encoded ones side by side.
other_data_retirement = data_Retirement.drop(columns=object_cols_retirement)
OH_data_retirement = pd.concat(
    [other_data_retirement, OH_cols_retirement],
    axis='columns',
)
OH_data_retirement

And now the same for family homes 

In [None]:
# Categorical columns of the cleaned multifamily-housing frame that get one-hot encoded.
object_cols_family = [
    'Season',
    'Climate',
    'City',
    'Country',
    'Building_type',
    'cooling_strategy_building',
    'Thermal preference',
    'Sex',
]

In [None]:
# One-hot encode the categorical columns of the cleaned multifamily-housing frame.
# fit_transform re-fits the shared encoder on this frame's categories.
encoded_family = OH_encoder.fit_transform(data_Family[object_cols_family])

# `get_feature_names` was removed in scikit-learn 1.2; `get_feature_names_out`
# (available since 1.0) replaces it. Use whichever this installation provides.
if hasattr(OH_encoder, 'get_feature_names_out'):
    column_name_family = OH_encoder.get_feature_names_out(object_cols_family)
else:
    column_name_family = OH_encoder.get_feature_names(object_cols_family)

# Reuse the source frame's index so the encoded columns line up row-for-row when
# they are concatenated back onto the remaining columns.
OH_cols_family = pd.DataFrame(
    encoded_family,
    index=data_Family.index,
    columns=column_name_family,
)

In [None]:
# Display the one-hot encoded categorical columns for the multifamily-housing frame.
OH_cols_family

In [None]:
# Replace the original categorical columns with their one-hot encoded versions:
# take everything except those columns, then append the encoded ones side by side.
other_data_family = data_Family.drop(columns=object_cols_family)
OH_data_family = pd.concat(
    [other_data_family, OH_cols_family],
    axis='columns',
)
OH_data_family

# Group By 

Now we will be able to see if there are any correlations between sex and thermal sensation in these buildings. 
We will do this by grouping those two variables, and then checking for general correlations. 

In [None]:
# List the column labels of the encoded senior-center frame (the groupby below uses 'Sex_Female').
OH_data_retirement.columns

In [None]:
# Every numeric column except the sensation vote, which is one of the grouping keys.
numeric_retirement = OH_data_retirement.select_dtypes(include='number')
columns_retirement = numeric_retirement.drop(columns='Thermal sensation').columns

# Mean of each remaining numeric column for every (Sex_Female, Thermal sensation) pair.
data_grouped_retirement = (
    OH_data_retirement
    .groupby(['Sex_Female', 'Thermal sensation'])[columns_retirement]
    .mean()
)

In [None]:
# Display the per-group means for the senior-center frame.
data_grouped_retirement

In [None]:
# List the column labels of the encoded multifamily-housing frame (the groupby below uses 'Sex_Female').
OH_data_family.columns

In [None]:
# Every numeric column except the sensation vote, which is one of the grouping keys.
numeric_family = OH_data_family.select_dtypes(include='number')
columns_family = numeric_family.drop(columns='Thermal sensation').columns

# Mean of each remaining numeric column for every (Sex_Female, Thermal sensation) pair.
data_grouped_family = (
    OH_data_family
    .groupby(['Sex_Female', 'Thermal sensation'])[columns_family]
    .mean()
)

In [None]:
# Display the per-group means for the multifamily-housing frame.
data_grouped_family

In [None]:
# Ten columns most positively correlated with the 'Sex_Female' indicator.
# Restrict to numeric columns first: from pandas 2.0 on, DataFrame.corr() no longer
# drops non-numeric columns silently and raises on them instead. Only the categorical
# columns listed for encoding were removed from this frame, so other object columns
# may remain. select_dtypes works on every pandas version.
(
    OH_data_retirement
    .select_dtypes(include='number')
    .corr()['Sex_Female']
    .sort_values(ascending=False)
    .head(10)
)

In [None]:
# Ten columns most positively correlated with the 'Sex_Female' indicator.
# Restrict to numeric columns first: from pandas 2.0 on, DataFrame.corr() raises on
# non-numeric columns instead of dropping them. This matches the select_dtypes
# filter the groupby cells already apply.
(
    OH_data_family
    .select_dtypes(include='number')
    .corr()['Sex_Female']
    .sort_values(ascending=False)
    .head(10)
)

There are absolutely no strong correlations between sex and any one column, which makes sense, considering that our data reports sex not as a sliding scale but as a 0 or 1.

In [None]:
# Distribution of the thermal sensation vote for each value of the one-hot
# 'Sex_Female' column, multifamily-housing frame.
violin_1 = sns.violinplot(
    data=OH_data_family,
    x="Sex_Female",
    y="Thermal sensation",
)

Overall it appears that men (0) tended to be more comfortable in the space, while women (1) had a wider range of thermal sensations; however, the two groups are still pretty comparable.

In [None]:
# Distribution of the thermal sensation vote for each value of the one-hot
# 'Sex_Female' column, senior-center frame.
violin_2 = sns.violinplot(
    data=OH_data_retirement,
    x="Sex_Female",
    y="Thermal sensation",
)

Here the opposite appears: the groups are still comparable overall, but men had a wider range of thermal sensations in retirement homes.

# Acknowledgements

This notebook was based on the format laid out by Ethan Go.