In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input mount and echo the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Visualising The Data Structure

The first task in any machine learning pipeline is to take a quick look at the data.

In [None]:
# Location of the Jena weather archive inside the Kaggle input mount.
dirname = "/kaggle/input/weather-archive-jena"
filename = os.path.join(dirname, "jena_climate_2009_2016.csv")

# Load the whole archive and print the column dtypes and non-null counts.
jena = pd.read_csv(filename)
jena.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
jena.describe()

In [None]:
# Peek at the first ten rows to see the raw column layout and the sampling interval.
jena.head(10)

Right off the bat, the data looks quite interesting. It seems that it has been recorded every 10 minutes starting from Jan 01, 2009. The definitions of the attributes are as follows:

1. p - Air pressure (mbar)
2. T - Air temperature (°C)
3. Tpot - Potential temperature (K; 0 °C = 273.15 K)
4. rh - Relative humidity (%)
5. VPmax, VPact, VPdef - Vapor pressure (maximum/saturation, actual, deficit) (mbar)
6. sh - Specific humidity (g/kg)
7. H2OC - Water vapor concentration (mmol/mol)
8. rho - Air density (g/m**3)
9. wv, max. wv - Wind velocity (average, maximum) (m/s)
10. wd - Wind direction (degrees)

Additionally there is no missing data.


In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# One histogram per numeric column (50 bins each) to eyeball distributions and outliers.
jena.hist(bins=50, figsize=(20, 15))
plt.show()

### Initial remarks

* The data seems to be very tightly packed for most of the attributes and behaves very smoothly. 

* For the wind velocity wv, both the maximum and the average values are clustered around 0 but spread out down to about -10000 m/s. We need to check whether this is a data anomaly; this measurement will need special consideration at a later stage.

* The data is also quite tail-heavy in some cases, for example the relative humidity.

# Problem Statement

We create a fictive problem statement to work with the dataset: 

**What effect do the various attributes in the Jena data have on the temperature? Or in other words, is it possible to predict the temperature once we know the remaining attributes?**

### Create a test set

In order to avoid a bias due to data snooping, the first task is to create a test and training set. 

We are going to assume that the sample set is large enough and we can split the set simply by using random sampling.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
# NOTE(review): this is a purely random split of time-ordered readings, so neighbouring
# measurements can end up on both sides of the split — consider a chronological split.
train_set, test_set = train_test_split(jena, test_size=0.2, random_state=42)

## Visualising the Data

Now that we have split the data, we are going to visualise the training set further in order to gain more insight into it.

In [None]:
# Explore a copy of the training set so exploratory columns never modify train_set itself.
# NOTE: this rebinds `jena`, which until this cell referred to the full, unsplit dataset.
jena = train_set.copy()

In [None]:
# Count how often each timestamp occurs; counts above 1 reveal duplicated measurements.
jena["Date Time"].value_counts()

The time of day should have an effect on the temperature. However, it turns out that the timestamps are not unique: there are several dates for which two measurements are available.

Options:

1. If there are not too many duplicates, delete the data and fill in the median
2. Delete the date time data.

It would be sensible to see if the time attribute actually does have any effect on the temperature. If this is the case, then we will clean the data accordingly. 

#### Additional insights:

1. Additionally, it is expected that the month of the year will also have an effect on the temperature. 
2. It would be interesting to see if the temperature does increase with each passing year. 

In [None]:
from datetime import datetime

# Parse the timestamps with an explicit day-first format instead of letting pandas guess.
# The Jena archive writes dates as DD.MM.YYYY HH:MM:SS; without a format, ambiguous dates
# such as 01.02.2009 can be read month-first, which would corrupt every derived
# month/season/day-of-year feature. A strict format also raises on unexpected input
# rather than silently mis-parsing it.
jena["Date Time"] = pd.to_datetime(jena["Date Time"], format="%d.%m.%Y %H:%M:%S")


In [None]:
# Break the timestamp into calendar components so each can be correlated with temperature.
jena = jena.assign(
    Hour=lambda frame: frame["Date Time"].dt.hour,
    Year=lambda frame: frame["Date Time"].dt.year,
    Month=lambda frame: frame["Date Time"].dt.month,
    Day_of_Year=lambda frame: frame["Date Time"].dt.dayofyear,
)

jena.info()


In [None]:
# Re-check the summary statistics now that the derived time columns are present.
jena.describe()

In [None]:
# Correlate numeric columns only. The frame also holds the "Date Time" column, which
# cannot be cast to float; recent pandas versions no longer drop such columns silently
# in DataFrame.corr() and raise instead.
correlation_matrix = jena.select_dtypes(include="number").corr()
# Rank every attribute by its linear correlation with the temperature.
correlation_matrix["T (degC)"].sort_values(ascending=False)

* Since T and Tpot are basically the same quantity, they have a strong correlation.

* Interestingly, the month of the year, the year and the hour do not show strong correlations with the temperature. They are nonetheless positive correlations.

In [None]:
from pandas.plotting import scatter_matrix

# Pairwise scatter plots (histograms on the diagonal) for the target and a few selected attributes.
selected_attributes = ["T (degC)", "VPmax (mbar)", "Month", "rho (g/m**3)"]
scatter_matrix(jena[selected_attributes], figsize=(15, 10))

## Creating new attributes

The following data may be interesting:

1. Seasons of the year


In [None]:
# Season encoding derived from the calendar month:
# 0 - winter (Dec-Feb), 1 - spring (Mar-May), 2 - summer (Jun-Aug), 3 - autumn (Sep-Nov)
# `Month % 12` maps December to 0, so integer division by 3 yields the season index
# (equivalent to the longer form ((Month % 12 + 3) // 3) - 1).
jena["Season"] = jena["Month"] % 12 // 3
# Correlate numeric columns only: the "Date Time" column cannot be cast to float and
# makes DataFrame.corr() raise on recent pandas versions.
correlation_matrix1 = jena.select_dtypes(include="number").corr()
correlation_matrix1["T (degC)"].sort_values(ascending=False)

We are able to find a positive correlation with the season of the year. Specifically, the temperature tends to increase as we move from winter (0) to autumn (3).

## Preparing the data

Since the data visualisation step has been completed, we will now create a pipeline for data cleaning. 
The data can then "directly" be fed into the ML algorithm.

## Custom Transformer for DateTimeData

Since no strong correlations are seen to the date-time data, we will simply create a season transformation to account for seasons of the year. Finally we can use the one-hot encoding on the season data

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that picks columns out of a DataFrame and hands on their raw values.

    Parameters
    ----------
    attribute_names
        Column label(s) used to index the incoming frame in ``transform``.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer unchanged."""
        return self

    def transform(self, X):
        """Index ``X`` by the configured column names and return the ``.values`` array."""
        selected = X[self.attribute_names]
        return selected.values

In [None]:
# Split the training set into the model inputs and the temperature target.
# Both are copies, so later edits to either never write back into train_set.
jena_labels_train = train_set["T (degC)"].copy()
jena_train = train_set.drop(columns="T (degC)").copy()

In [None]:
from sklearn.pipeline import Pipeline 
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion

# Create a categorical pipeline

cat_attribs = ["Season"]

# Season index from the calendar month: 0 - winter, 1 - spring, 2 - summer, 3 - autumn.
# The timestamps are parsed with an explicit day-first format (DD.MM.YYYY HH:MM:SS);
# letting pandas guess can read ambiguous dates such as 01.02.2009 month-first and
# assign the wrong season. `month % 12 // 3` equals ((month % 12 + 3) // 3) - 1.
jena_train["Season"] = (
    pd.to_datetime(jena_train["Date Time"], format="%d.%m.%Y %H:%M:%S").dt.month % 12 // 3
)

# Select the season column and one-hot encode it.
cat_pipeline = Pipeline([
                        ("selector", DataFrameSelector(cat_attribs)),
                        ("cat-encoder", OneHotEncoder())
                        ])

# creating a numerical pipeline

# Every column except the raw timestamp and the categorical attributes. "Season" has
# already been added to the frame above, so it must be excluded here; otherwise it would
# be fed to the model twice (once scaled as a number, once one-hot encoded).
# NOTE(review): "Tpot (K)" stays among the features although it is essentially the
# target expressed on another scale — confirm that this is intended.
num_attribs = [
    column for column in jena_train.columns
    if column != "Date Time" and column not in cat_attribs
]

# Select the numeric columns and standardise them to zero mean / unit variance.
num_pipeline = Pipeline([
                            ("selector", DataFrameSelector(num_attribs)),
                            ("feature-scaling", StandardScaler())
                        ])

# Run both branches on the same frame and concatenate their outputs column-wise.
full_pipeline = FeatureUnion(transformer_list = [
                            ("num_pipeline", num_pipeline),
                            ("cat_pipeline", cat_pipeline)
                        ])
# transformed dataset
jena_train_prepared = full_pipeline.fit_transform(jena_train)

## Training and Evaluating

In [None]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares baseline on the prepared training features.
lin_reg = LinearRegression()
lin_reg.fit(jena_train_prepared, jena_labels_train)

In [None]:
# Sanity check: compare predictions with the true labels on the first five training rows.

some_data = jena_train.head(5)
some_labels = jena_labels_train.head(5)

# Reuse the already-fitted pipeline (transform only, no refit) before predicting.
some_data_prepared = full_pipeline.transform(some_data)
sample_predictions = lin_reg.predict(some_data_prepared)
print("Predictions: ", sample_predictions)
print("Labels:", list(some_labels))

The model performs shockingly well.

In [None]:
from sklearn.metrics import mean_squared_error

# Root-mean-squared error of the linear model, in the units of the temperature label.
# NOTE: the predictions are made on the same data the model was fitted on, so this is a
# training error and says nothing yet about performance on the held-out test set.
jena_predictions = lin_reg.predict(jena_train_prepared)
lin_mse = mean_squared_error(jena_labels_train, jena_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse