In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sb

%matplotlib inline
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

# Loading the data

In [None]:
# Read the air-quality export that ships with this kernel.
data = pd.read_csv('../input/AirQualityUCI_req.csv')

# Index every row by its calendar day as an ISO 'YYYY-MM-DD' string (the source
# dates are day-first), so that rows from the same day share one index label.
data.index = pd.DatetimeIndex(data.Date, dayfirst=True).strftime('%Y-%m-%d')

# `columns=` replaces the positional `axis=1` argument, which pandas 2.x rejects.
data = data.drop(columns=['Date'])
cols = data.columns

# Blank out every non-positive reading, then carry the last valid value forward.
# NOTE(review): presumably this targets a negative "missing" sentinel in the file;
# it also discards any genuine readings <= 0 - confirm that is intended.
data = data.where(data[cols] > 0)

# `.ffill()` is the supported spelling of the deprecated `fillna(method='ffill')`.
data = data.ffill()
data.head()

# Getting the temperature values.

In [None]:
# Keep only the temperature column (as a one-column frame).
temperature = data[['T']]

# Rows sharing the same index label (one label per day) are averaged together.
temperature_by_day = temperature.groupby(temperature.index).mean()

# Flat 1-D array of daily means, in index order.
t_values = temperature_by_day['T'].values
len(t_values)

> **Framing the time series problem as a classification problem**

- We try to predict the trend of the data based on the values. We only predict the direction of temperature based on the history of data points.
- We convert the dataset to classes: 1 for an upward trend (the next value is greater than or equal to the previous one) and 0 for a downward trend.
- The window size is configurable. Tuning it changes the length of history that is used as features.

In [None]:
def make_data(data_array, window=7):
    """Turn a 1-D series into a sliding-window up/down classification table.

    Row ``i`` holds the ``window`` consecutive values
    ``data_array[i:i+window]`` as features ``t0 .. t{window-1}`` plus a
    ``Class`` label describing the value that follows the window: 1 when
    ``data_array[i+window] >= data_array[i+window-1]`` (rise or no change),
    0 otherwise.

    Parameters
    ----------
    data_array : array-like
        Observations in chronological order; flattened to 1-D.
    window : int, default 7
        Number of past values used as features. Must be at least 1.

    Returns
    -------
    dataframe : pandas.DataFrame
        ``len(data_array) - window`` rows (zero rows if the series is too
        short) with columns ``t0 .. t{window-1}`` and an integer ``Class``.
    feature_cols : list of str
        The feature column names, i.e. every column except ``Class``.

    Raises
    ------
    ValueError
        If ``window`` is smaller than 1. A window of 0 would otherwise
        compare the first target against ``data_array[-1]``.

    Notes
    -----
    Prints ``<rows> <upward> <downward>`` as a quick class-balance check.
    """
    if window < 1:
        raise ValueError("window must be a positive integer, got {0!r}".format(window))

    values = np.ravel(data_array)
    feature_cols = ['t' + str(i) for i in range(window)]
    n_rows = max(values.size - window, 0)

    # Broadcasting row starts against in-window offsets yields an
    # (n_rows, window) index grid, so all windows are gathered in one step.
    starts = np.arange(n_rows)[:, None]
    offsets = np.arange(window)[None, :]
    windows = values[starts + offsets]

    # Label each window by comparing the value right after it with its last value.
    following = values[window:window + n_rows]
    last_seen = values[window - 1:window - 1 + n_rows]
    labels = (following >= last_seen).astype(int)

    inc_count = int(labels.sum())
    dec_count = n_rows - inc_count
    print(n_rows, inc_count, dec_count)

    dataframe = pd.DataFrame(windows, columns=feature_cols)
    dataframe['Class'] = labels
    return dataframe, feature_cols


In [None]:
# Build the supervised table from the daily temperatures using the default
# 7-value history window, then preview the first rows.
transformed_data, feature_cols = make_data(t_values, window=7)
transformed_data.head(5)

> **Using this dataframe to create features and classes.**

In [None]:
# Feature matrix: one row per window, one column per lagged value.
features = transformed_data.loc[:, feature_cols].values

# Labels are kept as an (n, 1) column; later cells flatten them as needed.
classes = transformed_data['Class'].values.reshape(-1, 1)

print(features.shape, classes.shape)

> This notebook approaches the problem in two ways:
- Without Dimensionality Reduction
- With Dimensionality Reduction

We compare different classifiers in each approach.

**Without Dimensionality Reduction**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
import itertools

In [None]:
# Hold out 33% of the windows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    features,
    classes,
    test_size=0.33,
    random_state=42,
)

# `classes` is an (n, 1) column, so flatten both label splits to 1-D.
Y_train, Y_test = Y_train.ravel(), Y_test.ravel()

print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

> Logistic Regression

In [None]:
# Linear baseline on the raw window features (`fit` returns the estimator itself).
model = LogisticRegression(solver='liblinear').fit(X_train, Y_train)
y_pred = model.predict(X_test)

# Score the predictions on the held-out split.
acc_lr = accuracy_score(Y_test, y_pred)
cm_lr = confusion_matrix(Y_test, y_pred)

print("Accuracy of classification : {0} %".format(100 * acc_lr))
print(cm_lr)

>  Multi Layer Perceptron

In [None]:
# One hidden layer of 90 units. `(90,)` is an actual one-element tuple, whereas
# the previous `(90)` was just the integer 90 in parentheses.
# `random_state` pins the weight initialisation and shuffling so that
# Restart & Run All reproduces the same accuracy.
# NOTE: per the scikit-learn docs `learning_rate` is only used by solver='sgd';
# it is kept here for parity with the original configuration.
model = MLPClassifier(hidden_layer_sizes=(90,), learning_rate='constant', random_state=42)
model.fit(X_train, Y_train)

# Evaluate on the held-out split.
y_pred = model.predict(X_test)
cm_mlp = confusion_matrix(Y_test, y_pred)
acc_mlp = accuracy_score(Y_test, y_pred)
print("Accuracy of classification : {0} %".format(acc_mlp*100))
print(cm_mlp)

> Naive Bayes Classifier

In [None]:
# Gaussian Naive Bayes on the raw window features (`fit` returns the estimator).
NB_model = GaussianNB().fit(X_train, Y_train)
y_pred = NB_model.predict(X_test)

# Score the predictions on the held-out split.
cm_nb = confusion_matrix(Y_test, y_pred)
acc_nb = accuracy_score(Y_test, y_pred)

print("Accuracy of classification : {0} %".format(100 * acc_nb))
print(cm_nb)

> Decision Tree Classifier

In [None]:
# Decision tree on the raw window features. The seed fixes the estimator's
# internal randomness so that re-running the notebook gives the same tree
# and the same accuracy.
DT_model = DecisionTreeClassifier(random_state=42)
DT_model.fit(X_train, Y_train)

# Evaluate on the held-out split.
y_pred = DT_model.predict(X_test)
acc_dt = accuracy_score(Y_test, y_pred)
cm_dt = confusion_matrix(Y_test, y_pred)
print("Accuracy of classification : {0} %".format(acc_dt*100))
print(cm_dt)

In [None]:
# One bar per classifier. The Decision Tree bar must use `acc_dt`; it previously
# repeated `acc_nb`, so the tree's result was never shown.
classifier_names = ['Logistic Regression', 'MLP', 'Naive Bayes', 'Decision Tree']

# Accuracies are fractions in [0, 1]; scale them so the bars match the '%' axis label.
accuracies_pct = [100 * acc for acc in (acc_lr, acc_mlp, acc_nb, acc_dt)]

sb.barplot(x=accuracies_pct, y=classifier_names)
plt.title('Without Dimensionality Reduction')
plt.xlabel('Accuracy in %')
plt.ylabel('Classifiers');

**With Dimensionality Reduction**

> We can use two types of Embedding techniques:
-  t-SNE
- Multidimensional Scaling (MDS)

In [None]:
from sklearn.manifold import TSNE, MDS

In [None]:
# Project the window features to 2-D with t-SNE. The embedding is stochastic,
# so the seed is fixed to make the picture reproducible across runs.
features_embedded_tsne = TSNE(n_components=2, random_state=42).fit_transform(features)

# Colour each point by its class so any separation is visible.
plt.scatter(features_embedded_tsne[:,0], features_embedded_tsne[:,1],c=classes.reshape(-1))
plt.title('t-SNE embedding of the window features')
plt.xlabel('Component 1')
plt.ylabel('Component 2');

In [None]:
# Project the window features to 2-D with MDS. Later cells train classifiers on
# this embedding, so the seed is fixed to keep those results reproducible.
# NOTE(review): the embedding is fitted on all rows before the train/test split,
# so test rows influence the coordinates - confirm this is acceptable.
features_embedded_mds = MDS(n_components=2, random_state=42).fit_transform(features)

# Colour each point by its class so any separation is visible.
plt.scatter(features_embedded_mds[:,0], features_embedded_mds[:,1],c=classes.reshape(-1))
plt.title('MDS embedding of the window features')
plt.xlabel('Component 1')
plt.ylabel('Component 2');

We will use MDS embedding for classification due to the distribution of the data across x and y axes.

In [None]:
# Same 67/33 split and seed as before, but on the 2-D MDS coordinates.
X_train, X_test, Y_train, Y_test = train_test_split(
    features_embedded_mds,
    classes,
    test_size=0.33,
    random_state=42,
)

# `classes` is an (n, 1) column, so flatten both label splits to 1-D.
Y_train, Y_test = Y_train.ravel(), Y_test.ravel()

print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

> Logistic Regression

In [None]:
# Logistic regression on the MDS-embedded features (`fit` returns the estimator).
LRmodel = LogisticRegression(solver='liblinear').fit(X_train, Y_train)
y_pred = LRmodel.predict(X_test)

# Score the predictions on the held-out split.
acc_lr1 = accuracy_score(Y_test, y_pred)
cm_lr1 = confusion_matrix(Y_test, y_pred)

print("Accuracy of classification : {0} %".format(100 * acc_lr1))
print(cm_lr1)

> Multi Layer Perceptron

In [None]:
# One hidden layer of 90 units on the MDS-embedded features. `(90,)` is an actual
# one-element tuple, whereas the previous `(90)` was just the integer 90.
# `random_state` pins the weight initialisation and shuffling so that
# Restart & Run All reproduces the same accuracy.
# NOTE: per the scikit-learn docs `learning_rate` is only used by solver='sgd';
# it is kept here for parity with the original configuration.
MLPmodel = MLPClassifier(hidden_layer_sizes=(90,), learning_rate='constant', random_state=42)
MLPmodel.fit(X_train, Y_train)

# Evaluate on the held-out split.
y_pred = MLPmodel.predict(X_test)
cm_mlp1 = confusion_matrix(Y_test, y_pred)
acc_mlp1 = accuracy_score(Y_test, y_pred)
print("Accuracy of classification : {0} %".format(acc_mlp1*100))
print(cm_mlp1)

> Naive Bayes Classifier

In [None]:
# Gaussian Naive Bayes on the MDS-embedded features (`fit` returns the estimator).
NB_model = GaussianNB().fit(X_train, Y_train)
y_pred = NB_model.predict(X_test)

# Score the predictions on the held-out split.
cm_nb1 = confusion_matrix(Y_test, y_pred)
acc_nb1 = accuracy_score(Y_test, y_pred)

print("Accuracy of classification : {0} %".format(100 * acc_nb1))
print(cm_nb1)

> Decision Tree Classifier

In [None]:
# Decision tree on the MDS-embedded features. The seed fixes the estimator's
# internal randomness so that re-running the notebook gives the same tree
# and the same accuracy.
DT_model = DecisionTreeClassifier(random_state=42)
DT_model.fit(X_train, Y_train)

# Evaluate on the held-out split.
y_pred = DT_model.predict(X_test)
acc_dt1 = accuracy_score(Y_test, y_pred)
cm_dt1 = confusion_matrix(Y_test, y_pred)
print("Accuracy of classification : {0} %".format(acc_dt1*100))
print(cm_dt1)

> Comparing the models 

In [None]:
# One bar per classifier. The Decision Tree bar must use `acc_dt1`; it previously
# repeated `acc_nb1`, so the tree's result was never shown.
classifier_names = ['Logistic Regression', 'MLP', 'Naive Bayes', 'Decision Tree']

# Accuracies are fractions in [0, 1]; scale them so the bars match the '%' axis label.
accuracies_pct = [100 * acc for acc in (acc_lr1, acc_mlp1, acc_nb1, acc_dt1)]

sb.barplot(x=accuracies_pct, y=classifier_names)
plt.title('With Dimensionality Reduction')
plt.xlabel('Accuracy in %')
plt.ylabel('Classifiers');

> Comparing each model with and without Dimensionality reduction

In [None]:
# (panel title, accuracy with MDS, accuracy without MDS) for every classifier.
comparisons = [
    ('Logistic Regression', acc_lr1, acc_lr),
    ('MLP', acc_mlp1, acc_mlp),
    ('Naive Bayes', acc_nb1, acc_nb),
    ('Decision Tree', acc_dt1, acc_dt),
]

# One panel per classifier. The grid is sized from the list, so no empty fifth
# slot is left over. A shared y-axis fixed to [0, 1] makes bar heights
# comparable across panels.
fig, axes = plt.subplots(1, len(comparisons), figsize=(20, 5), sharey=True)
for ax, (panel_title, acc_with, acc_without) in zip(axes, comparisons):
    sb.barplot(x=['With MDS', 'Without MDS'], y=[acc_with, acc_without], ax=ax)
    ax.set_title(panel_title)
    ax.set_ylim(0, 1)
axes[0].set_ylabel('Accuracy')
fig.tight_layout();

> From the above plots, it is clear that the dimensionality reduction has very little impact on the classification.
- Moreover, the classifiers are performing worse than a random guess of the trend. This is because of the limited amount of data available.
- Further, this approach can be tested with a larger amount of data to get better results.

*All values in the plots provided are averaged over various runs of the classifiers.*