In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    file_paths = [os.path.join(root_dir, file_name) for file_name in file_names]
    for file_path in file_paths:
        print(file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Applying column drops like Ethan's EDA to simplify data

In [None]:
# Read the database CSV once; raw_data is left untouched so the original
# columns can always be recovered.
raw_data = pd.read_csv('/kaggle/input/ashrae-global-thermal-comfort-database-ii/ashrae_db2.01.csv')

# Columns removed to simplify the frame: three descriptive/metadata columns
# followed by the (F) / (fpm) measurement columns
# (presumably duplicates of the metric-unit columns that are kept - TODO confirm).
drop_col = [
    'Climate',
    'Publication (Citation)',
    'Data contributor',
    'Operative temperature (F)',
    'Radiant temperature (F)',
    'Globe temperature (F)',
    'Outdoor monthly air temperature (F)',
    'Velocity_l (fpm)',
    'Velocity_m (fpm)',
    'Velocity_h (fpm)',
    'Tg_l (F)',
    'Tg_m (F)',
    'Tg_h (F)',
    'Ta_l (F)',
    'Ta_m (F)',
    'Ta_h (F)',
    'Air temperature (F)',
    'Air velocity (fpm)',
]

# Work on a copy of the raw frame with the listed columns removed.
data = raw_data.copy().drop(columns=drop_col)
data.head()

Exploring what is in the data set:
* What (what type of building is this, what is its purpose)
* When (what time of year are the data points from, when was the building constructed?)
* Where (where is the building located, what is the climate like around it?)

In [None]:
import seaborn as sns

# Apply seaborn's default plot styling to the figures drawn from here on.
sns.set()

# Horizontal bar chart: number of rows recorded per season.
season_counts = data['Season'].value_counts()
season_counts.plot.barh(figsize=(20, 6))

In [None]:
# Horizontal bar chart: number of rows per Koppen climate classification.
koppen_counts = data['Koppen climate classification'].value_counts()
koppen_counts.plot.barh(figsize=(20, 6))

In [None]:
# Histogram of the 'Year' column; the Axes is kept in `ax`.
year_values = data['Year']
ax = year_values.plot.hist()

In [None]:
# Histogram of the 'Age' column; the Axes is kept in `ax`.
age_values = data['Age']
ax = age_values.plot.hist()

In [None]:
# Horizontal bar chart: number of rows per building type.
building_type_counts = data['Building type'].value_counts()
building_type_counts.plot.barh(figsize=(20, 6))

In [None]:
# Give the abbreviated measurement columns more descriptive names.
#
# DataFrame.rename silently ignores keys that are not present, so a key with
# the wrong case does nothing and raises no error. The velocity keys therefore
# use the capitalised 'Velocity_*' spelling, matching the 'Velocity_* (fpm)'
# names in the drop list of the loading cell.
#
# 'Clo' is deliberately NOT renamed: the imputation cells further down read
# data_no_na['Clo'], so renaming it here would break them.
column_renames = {
    'PMV': 'Predicted Mean Vote',
    'PPD': 'Predicted Percentage Dissatisfied',
    'SET': 'Standard Effective Temp',
    'Ta_h (C)': 'tempfloor_high (C)',
    'Ta_m (C)': 'tempfloor_med (C)',
    'Ta_l (C)': 'tempfloor_low (C)',
    'Tg_h (C)': 'globetemp_high (C)',
    'Tg_m (C)': 'globetemp_med (C)',
    'Tg_l (C)': 'globetemp_low (C)',
    'Velocity_h (m/s)': 'velocity_high (m/s)',
    'Velocity_m (m/s)': 'velocity_med (m/s)',
    'Velocity_l (m/s)': 'velocity_low (m/s)',
}
data = data.rename(columns=column_renames)
data.columns

So we know:
* Most of the data is from winter and summer
* There is a range of climates in these locations
* Most data is from the 90's, 2010's
* Mostly office space

Questions:
* Does the age column refer to the building?
* What variables are measuring the effectiveness of these air-conditioners?

For the second question, I want to first look at the "Thermal sensation" column, checking to see what factors correlate most with the sensation numbers.

In [None]:
# Correlate numeric columns only: the frame still holds string columns
# (Season, City, ...), and DataFrame.corr() raises on those in pandas >= 2.0.
numeric_corr = data.select_dtypes(include='number').corr()

# Ten largest correlations with 'Thermal sensation'.
# The first row is the column's correlation with itself (1.0).
numeric_corr['Thermal sensation'].sort_values(ascending=False).head(10)
#positive correlation

In [None]:
# Correlate numeric columns only: the frame still holds string columns
# (Season, City, ...), and DataFrame.corr() raises on those in pandas >= 2.0.
numeric_corr = data.select_dtypes(include='number').corr()

# Ten smallest (most negative) correlations with 'Thermal sensation'.
numeric_corr['Thermal sensation'].sort_values().head(10)
#negative correlation

Potential Next Step:
* These column names are vague, could we replace them with more descriptive ones?
* Ethan found the column descriptions for his EDA, perhaps next step should be to rename these columns, then run this again
* This will also make it easier to understand what factors best represent air conditioner effectiveness

In [None]:
# Row count for each distinct Koppen climate classification.
koppen_column = data['Koppen climate classification']
koppen_column.value_counts()

In [None]:
# Distinct Koppen climate classification codes present in the data.
koppen_column = data['Koppen climate classification']
koppen_column.unique()

## Decreasing the amount of categories

### After doing some research online, I realized climate can be generalized into 5 distinct categories.







1. Tropical
2. Dry
3. Temperate
4. Continental
5. Polar

### This is good, as it allows us to simplify the columns even further

In [None]:
# Bucket every distinct Koppen code by its leading letter (A-E).
# The five lists keep their original names because the cells below use them.
tropical_A = []
dry_B = []
temperate_C = []
continental_D = []
polar_E = []

koppen_groups = {
    'A': tropical_A,
    'B': dry_B,
    'C': temperate_C,
    'D': continental_D,
    'E': polar_E,
}

# dropna(): a missing classification is a float NaN, and indexing it with
# climate[0] raises TypeError. Slicing with [:1] also tolerates an empty string.
for climate in data['Koppen climate classification'].dropna().unique():
    group = koppen_groups.get(str(climate)[:1])
    # Codes that start with any other character are left ungrouped.
    if group is not None:
        group.append(climate)

In [None]:
# Show which Koppen codes landed in each group, one list per line.
for koppen_codes in (tropical_A, dry_B, temperate_C, continental_D, polar_E):
    print(koppen_codes)

### Now, we can convert each of the unique values to their generalized forms

In [None]:
# Rows whose Koppen code is in the tropical group get Climate = 'Tropical'.
is_tropical = data['Koppen climate classification'].isin(tropical_A)
data.loc[is_tropical, 'Climate'] = 'Tropical'

In [None]:
# Rows whose Koppen code is in the dry group get Climate = 'Dry'.
is_dry = data['Koppen climate classification'].isin(dry_B)
data.loc[is_dry, 'Climate'] = 'Dry'

In [None]:
# Rows whose Koppen code is in the temperate group get Climate = 'Temperate'.
is_temperate = data['Koppen climate classification'].isin(temperate_C)
data.loc[is_temperate, 'Climate'] = 'Temperate'

In [None]:
# Rows whose Koppen code is in the continental group get Climate = 'Continental'.
is_continental = data['Koppen climate classification'].isin(continental_D)
data.loc[is_continental, 'Climate'] = 'Continental'

In [None]:
# Rows whose Koppen code is in the polar group get Climate = 'Polar'.
is_polar = data['Koppen climate classification'].isin(polar_E)
data.loc[is_polar, 'Climate'] = 'Polar'

In [None]:
# Row count for each generalized climate label assigned above.
climate_labels = data['Climate']
climate_labels.value_counts()

In [None]:
# Number of rows whose Koppen code is in the tropical group.
tropical_mask = data['Koppen climate classification'].isin(tropical_A)
tropical_mask.sum()

In [None]:
# Number of rows whose Koppen code is in the dry group.
dry_mask = data['Koppen climate classification'].isin(dry_B)
dry_mask.sum()

In [None]:
# Number of rows whose Koppen code is in the temperate group.
temperate_mask = data['Koppen climate classification'].isin(temperate_C)
temperate_mask.sum()

In [None]:
# Number of rows whose Koppen code is in the continental group.
continental_mask = data['Koppen climate classification'].isin(continental_D)
continental_mask.sum()

### Here we were able to define a new, generalized column for Climate! Compared to the original Koppen classification, this grouping is much coarser

### Now, we can look at the other categorical columns to see whether we can change/simplify any of them

In [None]:
# Display the current column labels to look for other columns worth simplifying.
data.columns

### Some column names are misspelled, so I fixed them here in the code

In [None]:
# The source column names spell "strategy" as "startegy"; replace them with
# correctly spelled, underscore-separated names.
cooling_column_fixes = {
    'Cooling startegy_building level': 'Cooling_strategy_building_level',
    'Cooling startegy_operation mode for MM buildings': 'Cooling_strategy_operation_mode_for_MM',
}
data = data.rename(columns=cooling_column_fixes)

In [None]:
# Labels of every column that is not of a numeric dtype.
non_numeric = data.select_dtypes(exclude='number')
non_numeric.columns

In [None]:
# Distinct values of the 'City' column.
city_column = data['City']
city_column.unique()

### Simplifying based on latitude and longitude might be too hard for the following dataset. I also believe that the variable 'Climate' should be able to capture that relationship

### After exploring some of the data, it seems there isn't much left to do with either the categorical or the numerical variables, other than fixing missing values

In [None]:
# Labels of every column with a numeric dtype.
numeric_only = data.select_dtypes(include='number')
numeric_only.columns

In [None]:
import missingno as msno

# Visualise where values are missing across the numeric columns.
numeric_frame = data.select_dtypes(include='number')
msno.matrix(numeric_frame);

## Looking at numerical columns to explore how to fix missing values
## I am going to first look at the columns with not a lot of missing data

I think we can replace missing values with mean

In [None]:
# All missing-value handling below happens on data_no_na, an independent
# copy, so `data` itself keeps its missing values.
data_no_na = data.copy(deep=True)

In [None]:
# Compare the mean and median of 'Age' before choosing a fill value.
age_column = data_no_na['Age']
for label, stat in (('mean: ', age_column.mean()), ('median: ', age_column.median())):
    print(label + str(stat))

In [None]:
# Summary statistics for 'Age'. Read from data_no_na like the neighbouring
# imputation cells; at this point it is still an unmodified copy of `data`,
# so the numbers are the same.
data_no_na['Age'].describe()

In [None]:
# Replace missing 'Age' values with the column mean.
age_mean = data_no_na['Age'].mean()
data_no_na['Age'] = data_no_na['Age'].fillna(age_mean)

## Looks like we were able to replace all the missing age values with mean. Some might argue the median would be a better choice and this is up to the researchers

## The next few parts are me working on other columns:
### As a rule of thumb, if the mean and median are close, then I would use the mean; otherwise, the median. This is up to the researcher's judgement


## Thermal Sensation

In [None]:
# Compare the mean and median of 'Thermal sensation' before choosing a fill value.
sensation_column = data_no_na['Thermal sensation']
for label, stat in (('mean: ', sensation_column.mean()), ('median: ', sensation_column.median())):
    print(label + str(stat))

In [None]:
# Summary statistics for 'Thermal sensation'.
sensation_column = data_no_na['Thermal sensation']
sensation_column.describe()

In [None]:
# Replace missing 'Thermal sensation' values with the column mean.
sensation_mean = data_no_na['Thermal sensation'].mean()
data_no_na['Thermal sensation'] = data_no_na['Thermal sensation'].fillna(sensation_mean)

## CLO

In [None]:
# Compare the mean and median of 'Clo' before choosing a fill value.
clo_column = data_no_na['Clo']
for label, stat in (('mean: ', clo_column.mean()), ('median: ', clo_column.median())):
    print(label + str(stat))

In [None]:
# Summary statistics for 'Clo'.
clo_column = data_no_na['Clo']
clo_column.describe()

## Since median and mean are close enough, I will use the mean

In [None]:
# Replace missing 'Clo' values with the column mean.
clo_mean = data_no_na['Clo'].mean()
data_no_na['Clo'] = data_no_na['Clo'].fillna(clo_mean)

## MET

In [None]:
# Compare the mean and median of 'Met' before choosing a fill value.
met_column = data_no_na['Met']
for label, stat in (('mean: ', met_column.mean()), ('median: ', met_column.median())):
    print(label + str(stat))

In [None]:
# Summary statistics for 'Met'.
met_column = data_no_na['Met']
met_column.describe()

In [None]:
# Replace missing 'Met' values with the column mean.
met_mean = data_no_na['Met'].mean()
data_no_na['Met'] = data_no_na['Met'].fillna(met_mean)

## Air Temperature (C)

In [None]:
# Compare the mean and median of 'Air temperature (C)' before choosing a fill value.
air_temp_column = data_no_na['Air temperature (C)']
for label, stat in (('mean: ', air_temp_column.mean()), ('median: ', air_temp_column.median())):
    print(label + str(stat))

In [None]:
# Summary statistics for 'Air temperature (C)'.
air_temp_column = data_no_na['Air temperature (C)']
air_temp_column.describe()

In [None]:
# Replace missing 'Air temperature (C)' values with the column mean.
air_temp_mean = data_no_na['Air temperature (C)'].mean()
data_no_na['Air temperature (C)'] = data_no_na['Air temperature (C)'].fillna(air_temp_mean)

## Relative Humidity

In [None]:
# Compare the mean and median of 'Relative humidity (%)' before choosing a fill value.
humidity_column = data_no_na['Relative humidity (%)']
for label, stat in (('mean: ', humidity_column.mean()), ('median: ', humidity_column.median())):
    print(label + str(stat))

In [None]:
# Summary statistics for 'Relative humidity (%)'.
humidity_column = data_no_na['Relative humidity (%)']
humidity_column.describe()

In [None]:
# Replace missing 'Relative humidity (%)' values with the column mean.
humidity_mean = data_no_na['Relative humidity (%)'].mean()
data_no_na['Relative humidity (%)'] = data_no_na['Relative humidity (%)'].fillna(humidity_mean)

## Air Velocity

In [None]:
# Compare the mean and median of 'Air velocity (m/s)' before choosing a fill value.
velocity_column = data_no_na['Air velocity (m/s)']
for label, stat in (('mean: ', velocity_column.mean()), ('median: ', velocity_column.median())):
    print(label + str(stat))

In [None]:
# Summary statistics for 'Air velocity (m/s)'.
velocity_column = data_no_na['Air velocity (m/s)']
velocity_column.describe()

### This data has some outliers! But it might also be because some areas in the States have tornadoes or very high-speed wind

In [None]:
# Replace missing 'Air velocity (m/s)' values with the column median
# (this column is filled with the median, not the mean, unlike the others).
velocity_median = data_no_na['Air velocity (m/s)'].median()
data_no_na['Air velocity (m/s)'] = data_no_na['Air velocity (m/s)'].fillna(velocity_median)

## Outdoor monthly air temperature 

In [None]:
# Compare the mean and median of 'Outdoor monthly air temperature (C)' before choosing a fill value.
outdoor_temp_column = data_no_na['Outdoor monthly air temperature (C)']
for label, stat in (('mean: ', outdoor_temp_column.mean()), ('median: ', outdoor_temp_column.median())):
    print(label + str(stat))

In [None]:
# Summary statistics for 'Outdoor monthly air temperature (C)'.
outdoor_temp_column = data_no_na['Outdoor monthly air temperature (C)']
outdoor_temp_column.describe()

In [None]:
# Replace missing 'Outdoor monthly air temperature (C)' values with the column mean.
outdoor_temp_mean = data_no_na['Outdoor monthly air temperature (C)'].mean()
data_no_na['Outdoor monthly air temperature (C)'] = data_no_na['Outdoor monthly air temperature (C)'].fillna(outdoor_temp_mean)

In [None]:
# Re-draw the missingness matrix for the numeric columns after the fills above.
numeric_filled = data_no_na.select_dtypes(include='number')
msno.matrix(numeric_filled);

## Here we have filled in the missing data for the numerical variables that didn't have many missing values

# Recategorization of variables

## After looking at the labels, I realized that some of the numerical columns are not really numerical variables

In [None]:
# Distinct values (and their row counts) of 'Thermal sensation acceptability'.
thermal_acceptability = data_no_na['Thermal sensation acceptability']
thermal_acceptability.value_counts()

In [None]:
# Distinct values (and their row counts) of 'Air movement acceptability'.
air_movement_acceptability = data_no_na['Air movement acceptability']
air_movement_acceptability.value_counts()

## Now, converting these two types

In [None]:
# Store 'Air movement acceptability' as a categorical column.
air_movement_col = 'Air movement acceptability'
data_no_na[air_movement_col] = data_no_na[air_movement_col].astype('category')

In [None]:
# Store 'Thermal sensation acceptability' as a categorical column.
thermal_acceptability_col = 'Thermal sensation acceptability'
data_no_na[thermal_acceptability_col] = data_no_na[thermal_acceptability_col].astype('category')

In [None]:
# Labels of the columns that are still numeric after the category conversions.
remaining_numeric = data_no_na.select_dtypes(include='number')
remaining_numeric.columns

# Data segmentation

## For the remaining columns, I think it's important to be careful about filling in missing data. I recommend segmenting the data by the specific question being asked, dropping any unnecessary columns, and working with that subset, instead of filling the gaps with the mean or median. Filling across the whole dataset would lose information

In [None]:
# Visualise where values are missing across the object-dtype columns.
object_frame = data_no_na.select_dtypes(include='object')
msno.matrix(object_frame);