In [48]:
# importing packages

import pandas as pd
from pandas_profiling import ProfileReport
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
from tqdm import tqdm
import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_score, GridSearchCV

%matplotlib inline

In [4]:
# loading and reading dataset

features = pd.read_csv("../datasets/data_X.csv")
features["date_time"] = pd.to_datetime(features["date_time"])
features.head()

Unnamed: 0,date_time,T_data_1_1,T_data_1_2,T_data_1_3,T_data_2_1,T_data_2_2,T_data_2_3,T_data_3_1,T_data_3_2,T_data_3_3,T_data_4_1,T_data_4_2,T_data_4_3,T_data_5_1,T_data_5_2,T_data_5_3,H_data,AH_data
0,2015-01-01 00:00:00,212,210,211,347,353,347,474,473,481,346,348,355,241,241,243,167.85,9.22
1,2015-01-01 00:01:00,212,211,211,346,352,346,475,473,481,349,348,355,241,241,243,162.51,9.22
2,2015-01-01 00:02:00,212,211,211,345,352,346,476,473,481,352,349,355,242,241,242,164.99,9.22
3,2015-01-01 00:03:00,213,211,211,344,351,346,477,473,481,355,349,355,242,241,242,167.34,9.22
4,2015-01-01 00:04:00,213,211,211,343,350,346,478,473,482,358,349,355,243,241,242,163.04,9.22


In [5]:
# target data

target = pd.read_csv("../datasets/data_Y.csv", parse_dates=True)
target.head()

Unnamed: 0,date_time,quality
0,2015-01-04 00:05:00,392
1,2015-01-04 01:05:00,384
2,2015-01-04 02:05:00,393
3,2015-01-04 03:05:00,399
4,2015-01-04 04:05:00,400


In [6]:
# the features shapes and the target shapes mismatch 
# the features have records of every minute whereas the target data has records of every hour
# we will transpose the data in such a way that we will use the minutes data as features in out data set 
# that way we will have features data with the interval of every hour.
print(f" Feature Shape: {features.shape}, Target Shape: {target.shape}")

 Feature Shape: (2103841, 18), Target Shape: (29184, 2)


In [13]:
# extract hour data from the date_time column we are keeping the month year and day so that the hour data stays unique and does not repeat.
# we will use the hour data to capture the records taken every minute so that we can transpose it and use it as features.
features["hour_data"] = features["date_time"].apply(lambda x: x.strftime("%Y-%m-%d-%H"))

In [7]:
def create_hourly_processing_features(features: pd.DataFrame) -> pd.DataFrame:
    training_data = pd.DataFrame() # initializing the training dataframe

    # refrenced from https://www.kaggle.com/alexkaggle95/production-quality-prediction-mae-6-954

    # iterating through every hour in hour data
    for hour in tqdm(features.hour_data.unique()):
        hour_data = features.loc[features.hour_data == hour] # getting the features set of the current hour
        ah = list(hour_data["AH_data"].unique())[0] # getting AH_data since it will be unique throuhgout the hour.
        hour_data = hour_data.iloc[:,1:] # slicing the hour data and removing the date_time feature

        # reindexing the data based on hour_data and AH_data
        # stackig the 60x16 data into 960,1 dataframe  
        hour_data = pd.DataFrame(hour_data.set_index(["hour_data", "AH_data"]).stack())
        hour_data = hour_data.reset_index() # now reset the index

        # after transposing the data there are two rows level_2 and 0 
        # level_2 contains columns names which are not useful as a row for the data so we will drop that 
        hour_data = hour_data[["level_2", 0]].T.drop('level_2')
        hour_data["hour_data"] = hour # here we will assign the hour
        hour_data["AH_data"] = ah # here we will assig the AH_data

        training_data = pd.concat([training_data, hour_data]) # after the row is created we can add it the dataframe initialized at the beginning
        
    return training_data

In [7]:
# store the preprocessed_features
# this will take a long time depending on your machine
processed_features = create_hourly_processing_features(features)
print(processed_features)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 35065/35065 [7:46:43<00:00,  1.25it/s]

        0      1      2      3      4      5      6      7      8      9  ...  \
0   212.0  210.0  211.0  347.0  353.0  347.0  474.0  473.0  481.0  346.0  ...   
0   210.0  213.0  212.0  299.0  300.0  346.0  523.0  473.0  490.0  330.0  ...   
0   213.0  193.0  212.0  285.0  262.0  363.0  527.0  481.0  490.0  366.0  ...   
0   248.0  207.0  212.0  303.0  271.0  331.0  486.0  488.0  482.0  347.0  ...   
0   232.0  229.0  239.0  355.0  152.0  349.0  433.0  467.0  481.0  333.0  ...   
..    ...    ...    ...    ...    ...    ...    ...    ...    ...    ...  ...   
0   264.0  260.0  257.0  377.0  363.0  378.0  452.0  449.0  447.0  334.0  ...   
0   257.0  274.0  255.0  373.0  368.0  365.0  482.0  449.0  484.0  328.0  ...   
0   322.0  286.0  256.0  352.0  343.0  349.0  481.0  449.0  496.0  325.0  ...   
0   271.0  252.0  262.0  357.0  359.0  360.0  481.0  449.0  491.0  325.0  ...   
0   271.0  261.0  265.0  353.0  359.0  353.0  481.0  449.0  491.0  325.0  ...   

      952    953    954    




In [8]:
# save the preprocessed features in a csv file for a later use
new_features = pd.read_csv("preprocessed_features.csv")
new_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,952,953,954,955,956,957,958,959,hour_data,AH_data
0,212.0,210.0,211.0,347.0,353.0,347.0,474.0,473.0,481.0,346.0,...,490.0,331.0,356.0,346.0,244.0,240.0,238.0,164.3,2015-01-01-00,9.22
1,210.0,213.0,212.0,299.0,300.0,346.0,523.0,473.0,490.0,330.0,...,490.0,366.0,351.0,342.0,239.0,237.0,238.0,163.35,2015-01-01-01,7.82
2,213.0,193.0,212.0,285.0,262.0,363.0,527.0,481.0,490.0,366.0,...,483.0,347.0,344.0,346.0,235.0,235.0,238.0,165.59,2015-01-01-02,6.03
3,248.0,207.0,212.0,303.0,271.0,331.0,486.0,488.0,482.0,347.0,...,481.0,333.0,361.0,354.0,239.0,241.0,238.0,164.89,2015-01-01-03,8.36
4,232.0,229.0,239.0,355.0,152.0,349.0,433.0,467.0,481.0,333.0,...,486.0,340.0,342.0,353.0,238.0,246.0,240.0,164.68,2015-01-01-04,7.02


In [9]:
new_features.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,952,953,954,955,956,957,958,959,hour_data,AH_data
35060,264.0,260.0,257.0,377.0,363.0,378.0,452.0,449.0,447.0,334.0,...,484.0,328.0,321.0,323.0,277.0,280.0,278.0,186.07,2018-12-31-20,6.38
35061,257.0,274.0,255.0,373.0,368.0,365.0,482.0,449.0,484.0,328.0,...,496.0,325.0,327.0,326.0,277.0,277.0,280.0,164.91,2018-12-31-21,8.37
35062,322.0,286.0,256.0,352.0,343.0,349.0,481.0,449.0,496.0,325.0,...,491.0,325.0,332.0,328.0,277.0,276.0,280.0,158.21,2018-12-31-22,6.13
35063,271.0,252.0,262.0,357.0,359.0,360.0,481.0,449.0,491.0,325.0,...,491.0,325.0,328.0,328.0,277.0,276.0,280.0,162.96,2018-12-31-23,8.44
35064,271.0,261.0,265.0,353.0,359.0,353.0,481.0,449.0,491.0,325.0,...,,,,,,,,,2019-01-01-00,7.35


In [10]:
new_features = new_features.dropna() # droppig the NaN values

In [17]:
target['train_date'] = pd.to_datetime(target['date_time']) - datetime.timedelta(minutes=5) # shifting the time 5 mins back
target['train_date'] =  pd.to_datetime(target['train_date']) 
target['train_date'] = target['train_date'].apply(lambda x: x.strftime("%d-%m-%Y-%H")) # reformattig the datetime

In [18]:
target.shape

(29184, 3)

In [21]:
new_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,952,953,954,955,956,957,958,959,hour_data,AH_data
0,212.0,210.0,211.0,347.0,353.0,347.0,474.0,473.0,481.0,346.0,...,490.0,331.0,356.0,346.0,244.0,240.0,238.0,164.3,2015-01-01-00,9.22
1,210.0,213.0,212.0,299.0,300.0,346.0,523.0,473.0,490.0,330.0,...,490.0,366.0,351.0,342.0,239.0,237.0,238.0,163.35,2015-01-01-01,7.82
2,213.0,193.0,212.0,285.0,262.0,363.0,527.0,481.0,490.0,366.0,...,483.0,347.0,344.0,346.0,235.0,235.0,238.0,165.59,2015-01-01-02,6.03
3,248.0,207.0,212.0,303.0,271.0,331.0,486.0,488.0,482.0,347.0,...,481.0,333.0,361.0,354.0,239.0,241.0,238.0,164.89,2015-01-01-03,8.36
4,232.0,229.0,239.0,355.0,152.0,349.0,433.0,467.0,481.0,333.0,...,486.0,340.0,342.0,353.0,238.0,246.0,240.0,164.68,2015-01-01-04,7.02


In [23]:
df = new_features.join(target, how='inner') # merging the features and target data

In [24]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,955,956,957,958,959,hour_data,AH_data,date_time,quality,train_date
0,212.0,210.0,211.0,347.0,353.0,347.0,474.0,473.0,481.0,346.0,...,346.0,244.0,240.0,238.0,164.3,2015-01-01-00,9.22,2015-01-04 00:05:00,392,04-01-2015-00
1,210.0,213.0,212.0,299.0,300.0,346.0,523.0,473.0,490.0,330.0,...,342.0,239.0,237.0,238.0,163.35,2015-01-01-01,7.82,2015-01-04 01:05:00,384,04-01-2015-01
2,213.0,193.0,212.0,285.0,262.0,363.0,527.0,481.0,490.0,366.0,...,346.0,235.0,235.0,238.0,165.59,2015-01-01-02,6.03,2015-01-04 02:05:00,393,04-01-2015-02
3,248.0,207.0,212.0,303.0,271.0,331.0,486.0,488.0,482.0,347.0,...,354.0,239.0,241.0,238.0,164.89,2015-01-01-03,8.36,2015-01-04 03:05:00,399,04-01-2015-03
4,232.0,229.0,239.0,355.0,152.0,349.0,433.0,467.0,481.0,333.0,...,353.0,238.0,246.0,240.0,164.68,2015-01-01-04,7.02,2015-01-04 04:05:00,400,04-01-2015-04


In [25]:
df = df.drop(['hour_data', 'date_time','train_date'], axis=1) # drop the date features

In [26]:
# split the features and target

features = df.iloc[:, :-1]
target = df.iloc[:,-1]

In [27]:
ss = StandardScaler() # initialize the standard scaler class
scaled_features = ss.fit_transform(features) # scale the features
pca = PCA(random_state=42, n_components=60) # initialize a PCA class with 60 componets
decomposed_features = pca.fit_transform(scaled_features) # fit the PCA model in the features and transform

In [28]:
decomposed_features.shape

(29184, 60)

In [29]:
pca.n_components_

60

In [30]:
pca.explained_variance_ratio_

array([1.46247076e-01, 1.39350519e-01, 1.27672909e-01, 1.09703913e-01,
       1.04819577e-01, 5.71637567e-02, 3.66155780e-02, 3.39659607e-02,
       3.27575057e-02, 3.22669340e-02, 2.66549538e-02, 2.43521323e-02,
       1.99667575e-02, 1.88599313e-02, 1.85699686e-02, 1.71901831e-02,
       3.90204786e-03, 3.75948497e-03, 3.43489192e-03, 3.35896611e-03,
       3.30495889e-03, 3.14434037e-03, 2.97421104e-03, 2.70492486e-03,
       2.66239653e-03, 2.62456419e-03, 2.53778939e-03, 2.39264274e-03,
       1.94795813e-03, 1.93061210e-03, 1.86292467e-03, 1.80679518e-03,
       1.03994206e-03, 6.35365834e-04, 4.68723237e-04, 3.97358663e-04,
       3.68615274e-04, 3.66575795e-04, 3.52676712e-04, 3.43636234e-04,
       3.23051061e-04, 2.96454228e-04, 2.89286818e-04, 2.85614267e-04,
       2.55933940e-04, 2.42085388e-04, 2.26047310e-04, 2.14817895e-04,
       2.10602147e-04, 2.06914935e-04, 1.85713391e-04, 1.28445100e-04,
       9.67241448e-05, 7.26796522e-05, 7.04201766e-05, 6.67222766e-05,
      

In [31]:
decomposed_features.shape, target.shape # now the decomposed features have new shapes   

((29184, 60), (29184,))

In [32]:
X_train, X_test, y_train, y_test = train_test_split(decomposed_features, target, test_size=0.25, random_state=42) # split the data into training and testing data

In [33]:
X_train.shape, X_test.shape, y_test.shape, y_test.shape

((21888, 60), (7296, 60), (7296,), (7296,))

In [34]:
lr = LinearRegression() # initialize the linear regression class
lr.fit(X_train, y_train) # fit the model into lr class

LinearRegression()

In [94]:
train_preds = lr.predict(X_train) # predict for train data
test_preds = lr.predict(X_test) # predict for test data

In [95]:
train_msr_score = mean_squared_error(y_train, train_preds) 
test_msr_score = mean_squared_error(y_test, test_preds)

In [96]:
print(f'mean squared error for train data: {train_msr_score} and test data: {test_msr_score}')

mean squared error for train data: 2122.4042191669328 and test data: 2187.1524155610377


In [97]:
train_mae_score = mean_absolute_error(y_train, train_preds)
test_mae_score = mean_absolute_error(y_test, test_preds)

In [98]:
print(f'mean absolute error for train data: {train_mae_score} and test data: {test_mae_score}')

mean absolute error for train data: 37.65547461409968 and test data: 38.108767623636076


In [46]:
cv_score = cross_val_score(lr, X_train, y_train, cv=5)

In [47]:
print(cv_score)

[-0.00200626 -0.00341348 -0.0043438  -0.00495611 -0.00527662]
