# The Weather Dataset

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Load the weather data from CSV and print its column / dtype summary
df = pd.read_csv(filepath_or_buffer='input/weather.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 5 columns):
Outlook        14 non-null object
Temperature    14 non-null object
Humidity       14 non-null object
Windy          14 non-null bool
Play           14 non-null object
dtypes: bool(1), object(4)
memory usage: 534.0+ bytes


In [4]:
# Report the dataset's dimensions (rows = instances, columns = attributes)
# followed by the column names.
instance_count = len(df.index)
attr_count = len(df.columns)
print('Number of instances: %d' % instance_count)
print('Number of attributes: %d' % attr_count)
print('Column names: %s' % ', '.join(df.columns))

Number of instances: 14
Number of attributes: 5
Column names: Outlook, Temperature, Humidity, Windy, Play


In [5]:
# Display five randomly drawn rows (unseeded, so the rows differ between runs)
df.sample(n=5)

Unnamed: 0,Outlook,Temperature,Humidity,Windy,Play
11,overcast,mild,high,True,yes
1,sunny,hot,high,True,no
13,rainy,mild,high,True,no
9,rainy,mild,normal,False,yes
8,sunny,cool,normal,False,yes


In [6]:
# Summary statistics of df; for this frame the output below lists
# count / unique / top / freq for each column.
df.describe()

Unnamed: 0,Outlook,Temperature,Humidity,Windy,Play
count,14,14,14,14,14
unique,3,3,2,2,2
top,rainy,mild,normal,False,yes
freq,5,6,7,8,9


# Encode target class

In [7]:
# Encode the target: 'yes' -> 1, any other value -> -1, stored in a new
# 'Class' column that replaces the original 'Play' column.
#
# `drop(columns=...)` is used instead of the positional axis form
# `drop('Play', 1)`: the positional `axis` argument was deprecated in
# pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
# `assign` builds a new frame rather than adding the column to the existing
# object in place before rebinding the name.
df = (
    df
    .assign(Class=df['Play'].apply(lambda x: 1 if x == 'yes' else -1))
    .drop(columns='Play')
)

In [8]:
# Display five randomly drawn rows to check the new 'Class' column
df.sample(n=5)

Unnamed: 0,Outlook,Temperature,Humidity,Windy,Class
10,sunny,mild,normal,True,1
13,rainy,mild,high,True,-1
3,rainy,mild,high,False,1
7,sunny,mild,high,False,-1
5,rainy,cool,normal,True,-1


# Encode attributes

## One-hot encoding

In [9]:
# Name of the label column; every other column is treated as an input variable
target = 'Class'
variables = df.columns[~df.columns.isin([target])]

In [14]:
# One-hot encode every input variable.
# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.get_dummies.html
#
# Each variable is expanded into indicator columns named '<variable>_<value>'.
one_hot_parts = [pd.get_dummies(df[v], prefix=v) for v in variables]

# Drop the original (un-encoded) columns and append the indicator columns in
# one concat, keeping the remaining columns (the target) first.
# `drop(columns=...)` replaces the positional axis form `drop(v, 1)`, which
# raises a TypeError from pandas 2.0 onwards.
weather_binary = pd.concat(
    [df.drop(columns=list(variables))] + one_hot_parts,
    axis=1,
)

# Sanity check: encoding must not add or lose rows
assert len(weather_binary) == len(df)

In [19]:
# Show the first five rows of the one-hot encoded frame
weather_binary.head(n=5)

Unnamed: 0,Class,Outlook_overcast,Outlook_rainy,Outlook_sunny,Temperature_cool,Temperature_hot,Temperature_mild,Humidity_high,Humidity_normal,Windy_False,Windy_True
0,-1,0,0,1,0,1,0,1,0,1,0
1,-1,0,0,1,0,1,0,1,0,0,1
2,1,1,0,0,0,1,0,1,0,1,0
3,1,0,1,0,0,0,1,1,0,1,0
4,1,0,1,0,1,0,0,0,1,1,0


## Label encoding

In [18]:
from sklearn import preprocessing

# Columns to encode: every column except the target
numerical_variables = df.columns[df.columns!=target]

# Fitted encoders, keyed by column name, plus the encoded values per column
label_encoders = {}
weather_dict = {}

for column in numerical_variables:
    encoder = preprocessing.LabelEncoder()
    # fit + transform in one step: learn the column's labels and encode them
    weather_dict[column] = encoder.fit_transform(df[column])
    label_encoders[column] = encoder

# Assemble the encoded columns into a frame and carry the target over unchanged
weather_numerical = pd.DataFrame(weather_dict)
weather_numerical['Class'] = df['Class']

weather_numerical.head(5)

Unnamed: 0,Humidity,Outlook,Temperature,Windy,Class
0,0,2,1,0,-1
1,0,2,1,1,-1
2,0,0,1,0,1
3,0,1,2,0,1
4,1,1,0,0,1
