### Predict the humidity and display the following results: MSE, RMSE

# Import Libraries & Data


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
##Preprocessing Imports

from sklearn import preprocessing

## Visualization Imports
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
import seaborn as sns

##Model Building Imports
from sklearn.linear_model import LinearRegression,Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split

## Metrics Import
from sklearn.metrics import mean_squared_error,max_error,confusion_matrix

## Feature Selection Imports
from sklearn.feature_selection import SelectKBest, f_regression,mutual_info_regression

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load Dataset 
> Use the Date Column 'Formatted Date' as the index column

In [None]:
# Load the hourly weather CSV indexed by the 'Formatted Date' column;
# the sentinel value 9999.99 is parsed as NaN.
weather_hourly = pd.read_csv('../input/weather-dataset/weatherHistory.csv',
                             index_col=['Formatted Date'],
                             na_values=['9999.99'])
# Strip the '+0200' offset text from the index labels. The pattern is a raw
# string and regex=True is explicit: newer pandas treats the pattern as a
# literal by default, in which case the escaped '\+' would never match.
weather_hourly.index = weather_hourly.index.str.replace(r'\+0200', '', regex=True)
# NOTE(review): the offset is discarded and the result is labelled UTC, and the
# remaining text is assumed to match the format below - confirm both are intended.
weather_hourly.index = pd.to_datetime(weather_hourly.index,
                                      format="%Y-%m-%d %H:%M:%S", utc=True)
weather_hourly.head(5)

# Data Exploration & Visualization

##### Description of the dataset

In [None]:
# Summary statistics of the hourly data (displayed as the cell output).
weather_hourly.describe()

#### Shape of the dataset

In [None]:
# (rows, columns) of the hourly data.
weather_hourly.shape

#### Features of the Dataset

In [None]:
class color:
    """ANSI escape sequences used to colour / emphasise printed text."""

    # Foreground colours
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'

    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Appended after styled text by the print calls in this notebook
    END = '\033[0m'
    
# Split the column labels by dtype: 'object' columns are treated as
# categorical, everything else as continuous.
categorical_features = weather_hourly.select_dtypes(include='object').columns
continous_features = weather_hourly.select_dtypes(exclude='object').columns

# Print every column, then the two groups, each under a bold heading.
print(f'{color.BOLD} Features of the dataset{color.END}')
print(f'{color.BLUE} {weather_hourly.columns} {color.END}')

print(f'{color.BOLD} Categorical features of the dataset {color.END}')
print(f'{color.BLUE} {categorical_features} {color.END}')

print(f'{color.BOLD} Continous features of the dataset {color.END}')
print(f' {color.BLUE} {continous_features} {color.END}')



## Plot Hourly Humidity

In [None]:
# Scatter plot of every hourly humidity reading against its timestamp.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(x=weather_hourly.index.values,
           y=weather_hourly['Humidity'],
           color='purple')
ax.set_xlabel('Date')
ax.set_ylabel('Humidity')
ax.set_title('Hourly Humidity')
plt.show()

## Correlation matrix for hourly data

In [None]:
# Drop the `Loud Cover` column as it has only 0s.
weather_hourly_1 = weather_hourly.drop(labels='Loud Cover', axis=1)
# Correlate the numeric columns only: the hourly frame still holds text
# columns, and newer pandas raises when corr() meets non-numeric data.
weather_hourly_1.select_dtypes(include='number').corr().style.background_gradient(cmap='Blues')

* Resample Hourly Humidity to Daily Humidity
* Plot Daily Humidity

In [None]:
# Daily means of the numeric columns. Text columns cannot be averaged, so they
# are excluded up front; newer pandas raises instead of silently dropping them.
weather_daily = weather_hourly.select_dtypes(include='number').resample('D').mean()
fig, ax = plt.subplots(figsize=(10,10))

# Scatter plot of the daily mean humidity against the date.
ax.scatter(weather_daily.index.values,
          weather_daily['Humidity'], color='purple')
ax.set(xlabel='Date',ylabel='Humidity',title='Daily Humidity')
plt.show()

## Correlation matrix Daily Sample

In [None]:
# Correlation heat-map of the daily data without the `Loud Cover` column.
weather_daily_1 = weather_daily.drop(columns='Loud Cover')
daily_corr = weather_daily_1.corr()
daily_corr.style.background_gradient(cmap='Blues')

* Resample Daily Humidity to Weekly Humidity
* Plot Weekly Humidity

In [None]:
# Weekly means computed from the daily means.
weather_weekly = weather_daily.resample('W').mean()

# Scatter plot of the weekly mean humidity against the date.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(x=weather_weekly.index.values,
           y=weather_weekly['Humidity'],
           color='purple')
ax.set_xlabel('Date')
ax.set_ylabel('Humidity')
ax.set_title('Weekly Humidity')
plt.show()

## Correlation matrix Weekly Sample

In [None]:
# Correlation heat-map of the weekly data without the `Loud Cover` column.
weather_weekly_1 = weather_weekly.drop(columns='Loud Cover')
weekly_corr = weather_weekly_1.corr()
weekly_corr.style.background_gradient(cmap='Blues')

* Resample Weekly Humidity to Monthly Humidity
* Plot Monthly Humidity

In [None]:
# Monthly means computed from the weekly means.
weather_monthly = weather_weekly.resample('M').mean()

# Scatter plot of the monthly mean humidity against the date.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(x=weather_monthly.index.values,
           y=weather_monthly['Humidity'],
           color='purple')
ax.set_xlabel('Date')
ax.set_ylabel('Humidity')
ax.set_title('Monthly Humidity')
plt.show()

### Correlation Analysis Monthly Sample

In [None]:
# Correlation heat-map of the monthly data, all columns included.
weather_monthly.corr().style.background_gradient(cmap="Blues")

##### As we can see, the column `Loud Cover` has all zeros, so let's drop the column and then visualize the correlation matrix

In [None]:
# Correlation heat-map of the monthly data without the `Loud Cover` column.
weather_monthly_1 = weather_monthly.drop(columns='Loud Cover')
monthly_corr = weather_monthly_1.corr()
monthly_corr.style.background_gradient(cmap='Blues')

# Data PreProcessing & Cleanup


### Hourly Dataset

#### Check data quality

In [None]:
# Number of missing values in each column of the hourly data.
weather_hourly.isnull().sum()

> The dataset is pretty clean, with only one column, 'Precip Type', having null values.
> As part of the 1st iteration, let's drop this feature and proceed. In a later part we will include this feature and check whether it helps give a better prediction of Humidity.

In [None]:
def min_max(input_df):
    """Return a two-row DataFrame with the per-column maximum and minimum.

    Only numeric columns are considered. Row 0 holds the maxima and row 1 the
    minima; both rows carry the index label 0, as no re-indexing is applied.

    Args:
        input_df: DataFrame to summarise.

    Returns:
        DataFrame with one column per numeric column of ``input_df``.
    """
    max_row = pd.DataFrame(input_df.max(numeric_only=True)).transpose()
    min_row = pd.DataFrame(input_df.min(numeric_only=True)).transpose()
    # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
    return pd.concat([max_row, min_row])

# Show the maximum and minimum of each numeric column of the hourly data.
min_max(weather_hourly)

In [None]:
# Column labels of the non-object (continuous) features selected earlier.
weather_hourly[continous_features].columns

### Plotting the distribution

In [None]:
# Horizontal 100-bin histograms of the hourly data, without grid or legend.
# The trailing semicolon suppresses the notebook's echo of the returned value.
hist = weather_hourly.hist(grid=False,
        legend=False,
        figsize=(15, 8),
        bins=100,
        orientation='horizontal',
        color='blue');


_Steps Performed_
* Drop the `Precip Type` column
* Drop the target column __Humidity__
* The numerical (continuous) features have values on different scales, e.g. __Apparent Temperature__ values range between *39.34444* and *-27.7166* while __Wind Speed__ values range between *359* and *0*. We use `StandardScaler` to standardize them
* The categorical columns are label encoded using `LabelEncoder`

In [None]:
class DataWrangler_Approach_A():
    """Pre-process a DataFrame: drop null columns, scale numbers, encode text.

    The DataFrame handed to the constructor is stored by reference and every
    step modifies it in place, so the caller's object changes as well.
    """

    def __init__(self, df):
        self.df = df

    def _get_numerical_columns(self):
        """Return the labels of the float64 / int64 columns."""
        numeric_part = self.df.select_dtypes(include=['float64', 'int64'])
        return numeric_part.columns

    def _get_categorical_columns(self):
        """Return the labels of the object-dtype columns."""
        object_part = self.df.select_dtypes(include=['object'])
        return object_part.columns

    def _remove_columns(self, labels):
        """Drop the given columns in place."""
        self.df.drop(columns=labels, inplace=True)

    def _drop_rows_with_null(self):
        """Drop, in place, every COLUMN that contains a null value.

        Despite the method name, axis=1 removes columns rather than rows.
        """
        self.df.dropna(axis=1, inplace=True)

    def _scale_numerical_features(self):
        """Standardise the numerical columns with StandardScaler.

        The numerical columns are removed and then re-assigned, so they end up
        after the remaining columns in the frame.
        """
        num_cols = self._get_numerical_columns()
        unscaled = self.df.loc[:, num_cols]
        self.df.drop(labels=num_cols, axis=1, inplace=True)
        self.df[num_cols] = preprocessing.StandardScaler().fit_transform(unscaled)

    def _label_encode_categories(self):
        """Replace each object column by its LabelEncoder integer codes."""
        encoder = preprocessing.LabelEncoder()
        cat_cols = self._get_categorical_columns()
        self.df[cat_cols] = self.df.loc[:, cat_cols].apply(encoder.fit_transform)

    def perform_wrangling(self):
        """Run all steps and return ``(processed_df, True)``.

        The second element is a flag that is always True.
        """
        for step in (self._drop_rows_with_null,
                     self._scale_numerical_features,
                     self._label_encode_categories):
            step()
        return (self.df, True)

In [None]:
# Reload the raw hourly data so the wrangler works on a fresh frame.
raw_df = pd.read_csv('../input/weather-dataset/weatherHistory.csv',
                             index_col=['Formatted Date'],
                             na_values=['9999.99'])
# Raw-string pattern with regex=True made explicit: newer pandas treats the
# pattern as a literal by default, in which case '\+' would never match.
raw_df.index = raw_df.index.str.replace(r'\+0200', '', regex=True)
raw_df.index = pd.to_datetime(raw_df.index, format="%Y-%m-%d %H:%M:%S", utc=True)
# Capture the target before wrangling so that it keeps its original scale.
y = raw_df['Humidity']
raw_df.drop(labels=['Loud Cover'],axis=1,inplace=True)
# The wrangler modifies raw_df in place and returns that same frame.
(pre_processed_df, dropna) = DataWrangler_Approach_A(raw_df).perform_wrangling()
pre_processed_df.head()

In [None]:
# Correlation heat-map of the pre-processed hourly data.
pre_processed_df.corr().style.background_gradient(cmap='Blues')

In [None]:
# Remove the target ('Humidity') so that only predictors remain in the frame.
pre_processed_df.drop(columns=['Humidity'], inplace=True)

In [None]:
def select_feature(feat_sel_func,k,**kwargs):
    """Keep the ``k`` best features according to ``feat_sel_func``.

    The selector is fitted on the training data only and then applied to both
    the training and the test features. The selector's ``pvalues_`` attribute
    is printed.

    Args:
        feat_sel_func: score function handed to ``SelectKBest``.
        k: number of features to keep.
        **kwargs: ``X_train``, ``y_train`` and ``X_test``.

    Returns:
        Tuple ``(X_train_selected, X_test_selected, fitted_selector)``.
    """
    train_features = kwargs.get('X_train')
    test_features = kwargs.get('X_test')

    selector = SelectKBest(score_func=feat_sel_func, k=k)
    selector.fit(train_features, kwargs.get('y_train'))

    train_selected = selector.transform(train_features)
    test_selected = selector.transform(test_features)
    print(selector.pvalues_)
    return train_selected, test_selected, selector

In [None]:
def select_feature_freg(**kwargs):
    """Select the best features with ``f_regression`` and plot their scores.

    Feature names are taken from the ``X_train`` argument itself (falling back
    to positional names when it has no ``columns``), and the print flag is read
    from the ``print_score`` argument. Neither depends on notebook globals, so
    the function labels any dataset passed to it correctly.

    Keyword Args:
        X_train, y_train: training features and target.
        X_test: test features, reduced with the same selector.
        feature_count: number of features to keep (required).
        print_score: when true, print each selected feature with its score.
            Defaults to False.

    Returns:
        Tuple ``(X_train_selected, X_test_selected)``.
    """
    feat_count = kwargs['feature_count']
    train_features = kwargs.get('X_train')

    X_train_fs, X_test_fs, fs = select_feature(f_regression,feat_count,
                                               X_test=kwargs.get('X_test'),
                                               X_train=train_features,
                                               y_train=kwargs.get('y_train'))
    selected = fs.get_support(indices=True)
    column_names = getattr(train_features, 'columns', None)
    if column_names is None:
        features = [f'feature_{index}' for index in selected]
    else:
        features = [column_names[index] for index in selected]
    # Scores are looked up by the selected column positions so that every
    # name is paired with its own score.
    scores = [fs.scores_[index] for index in selected]
    if kwargs.get('print_score', False):
        for feature, score in zip(features, scores):
            print(f'Feature {feature} score : {score}')
    fig, ax = plt.subplots(figsize=(8,4))
    ax.barh(features, scores, height=0.4)
    plt.tight_layout()
    plt.show()
    return X_train_fs, X_test_fs

In [None]:
def select_feature_mutualinfo(**kwargs):
    """Select the best features with ``mutual_info_regression`` and plot scores.

    Feature names are taken from the ``X_train`` argument itself (falling back
    to positional names when it has no ``columns``), and the print flag is read
    from the ``print_score`` argument. Neither depends on notebook globals, so
    the function labels any dataset passed to it correctly.

    Keyword Args:
        X_train, y_train: training features and target.
        X_test: test features, reduced with the same selector.
        feature_count: number of features to keep (required).
        print_score: when true, print each selected feature with its score.
            Defaults to False.

    Returns:
        Tuple ``(X_train_selected, X_test_selected)``.
    """
    feat_count = kwargs['feature_count']
    train_features = kwargs.get('X_train')

    X_train_fs, X_test_fs, fs = select_feature(mutual_info_regression,feat_count,
                                               X_test=kwargs.get('X_test'),
                                               X_train=train_features,
                                               y_train=kwargs.get('y_train'))
    selected = fs.get_support(indices=True)
    column_names = getattr(train_features, 'columns', None)
    if column_names is None:
        features = [f'feature_{index}' for index in selected]
    else:
        features = [column_names[index] for index in selected]
    # Scores are looked up by the selected column positions so that every
    # name is paired with its own score.
    scores = [fs.scores_[index] for index in selected]
    if kwargs.get('print_score', False):
        for feature, score in zip(features, scores):
            print(f'Feature {feature} score : {score}')
    fig, ax = plt.subplots(figsize=(8,4))
    ax.barh(features, scores, height=0.4)
    plt.tight_layout()
    plt.show()
    return X_train_fs,X_test_fs

In [None]:
def perform_linearreg(**kwargs):
    """Fit a ``LinearRegression`` model and print its evaluation metrics.

    Prints the dataset shapes, the estimator's ``score`` on the test set (for
    scikit-learn regressors this is the R^2 coefficient, although the label
    reads "Accuracy Score"), the MSE and the RMSE.

    Keyword Args:
        X_train, y_train: training features and target.
        X_test, y_test: held-out features and target.

    Returns:
        The fitted estimator, so callers that assign the result get the model.
    """
    X_train = kwargs['X_train']
    y_train = kwargs['y_train']
    X_test = kwargs['X_test']
    y_test = kwargs['y_test']
    reg = LinearRegression().fit(X_train, y_train)
    score = reg.score(X_test, y_test)
    y_pred = reg.predict(X_test)
    # Compute the MSE once; the RMSE is its square root. This avoids the
    # `squared=False` argument, which newer scikit-learn no longer accepts.
    mse = mean_squared_error(y_test, y_pred)
    print(f'Training dataset shape: {X_train.shape}')
    print(f'Test dataset shape: {X_test.shape}')
    print(f'{color.BOLD}Accuracy Score {color.END} {score:.4f}')
    print(f'{color.BOLD}Mean squared error:{color.END} {mse:.4f}')
    print(f'{color.BOLD}Root Mean squared error:{color.END} {mse ** 0.5:.4f}')
    return reg
In [None]:
def perform_ridgereg(**kwargs):
    """Fit a ``Ridge`` model (alpha=1.0, SVD solver) and print its metrics.

    Prints the dataset shapes, the estimator's ``score`` on the test set (for
    scikit-learn regressors this is the R^2 coefficient, although the label
    reads "Accuracy Score"), the MSE and the RMSE.

    Keyword Args:
        X_train, y_train: training features and target.
        X_test, y_test: held-out features and target.

    Returns:
        The fitted estimator.
    """
    X_train = kwargs['X_train']
    y_train = kwargs['y_train']
    X_test = kwargs['X_test']
    y_test = kwargs['y_test']
    r_reg = Ridge(alpha=1.0,solver='svd').fit(X_train, y_train)
    score = r_reg.score(X_test, y_test)
    y_pred = r_reg.predict(X_test)
    # Compute the MSE once; the RMSE is its square root. This avoids the
    # `squared=False` argument, which newer scikit-learn no longer accepts.
    mse = mean_squared_error(y_test, y_pred)
    print(f'Training dataset shape: {X_train.shape}')
    print(f'Test dataset shape: {X_test.shape}')
    print(f'{color.BOLD}Accuracy Score {color.END} {score:.4f}')
    print(f'{color.BOLD}Mean squared error:{color.END} {mse:.4f}')
    print(f'{color.BOLD}Root Mean squared error:{color.END} {mse ** 0.5:.4f}')
    return r_reg

In [None]:
def perform_decisiontreereg(**kwargs):
    """Fit a ``DecisionTreeRegressor`` (random_state=0) and print its metrics.

    Prints the dataset shapes, the estimator's ``score`` on the test set (for
    scikit-learn regressors this is the R^2 coefficient, although the label
    reads "Accuracy Score"), the MSE and the RMSE.

    Keyword Args:
        X_train, y_train: training features and target.
        X_test, y_test: held-out features and target.

    Returns:
        The fitted estimator.
    """
    X_train = kwargs['X_train']
    y_train = kwargs['y_train']
    X_test = kwargs['X_test']
    y_test = kwargs['y_test']
    dtree_reg = DecisionTreeRegressor(random_state=0).fit(X_train, y_train)
    score = dtree_reg.score(X_test, y_test)
    y_pred = dtree_reg.predict(X_test)
    # Compute the MSE once; the RMSE is its square root. This avoids the
    # `squared=False` argument, which newer scikit-learn no longer accepts.
    mse = mean_squared_error(y_test, y_pred)
    print(f'Training dataset shape: {X_train.shape}')
    print(f'Test dataset shape: {X_test.shape}')
    print(f'{color.BOLD}Accuracy Score {color.END} {score:.4f}')
    print(f'{color.BOLD}Mean squared error:{color.END} {mse:.4f}')
    print(f'{color.BOLD}Root Mean squared error:{color.END} {mse ** 0.5:.4f}')
    return dtree_reg

In [None]:
def perform_randomforestreg(**kwargs):
    """Fit a 100-tree ``RandomForestRegressor`` (random_state=0) and print metrics.

    Prints the dataset shapes, the estimator's ``score`` on the test set (for
    scikit-learn regressors this is the R^2 coefficient, although the label
    reads "Accuracy Score"), the MSE and the RMSE.

    Keyword Args:
        X_train, y_train: training features and target.
        X_test, y_test: held-out features and target.

    Returns:
        The fitted estimator.
    """
    X_train = kwargs['X_train']
    y_train = kwargs['y_train']
    X_test = kwargs['X_test']
    y_test = kwargs['y_test']
    random_reg = RandomForestRegressor(n_estimators = 100, random_state=0).fit(X_train, y_train)
    score = random_reg.score(X_test, y_test)
    y_pred = random_reg.predict(X_test)
    # Compute the MSE once; the RMSE is its square root. This avoids the
    # `squared=False` argument, which newer scikit-learn no longer accepts.
    mse = mean_squared_error(y_test, y_pred)
    print(f'Training dataset shape: {X_train.shape}')
    print(f'Test dataset shape: {X_test.shape}')
    print(f'{color.BOLD}Accuracy Score {color.END} {score:.4f}')
    print(f'{color.BOLD}Mean squared error:{color.END} {mse:.4f}')
    print(f'{color.BOLD}Root Mean squared error:{color.END} {mse ** 0.5:.4f}')
    return random_reg

# Hourly Data Feature Selection

## Feature Selection
* Train Test Split
* Linear Reg with all features
* f_regression feature selection
* Linear regression with K best features
* mutual_info_regression feature selection

In [None]:
# Split the pre-processed hourly features and the unscaled humidity target;
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(pre_processed_df,y,random_state=42)

# Keep the 6 best features according to f_regression.
freg_feature_count = 6
print_score = False
print(f'{color.BLUE}Select {freg_feature_count} Best features using f_regression{color.END}')
X_train_freg, X_test_freg = select_feature_freg(X_train=X_train,y_train=y_train,
                                            X_test=X_test,feature_count=freg_feature_count,
                                            print_score=print_score)
# Keep the 6 best features according to mutual_info_regression.
minfo_feature_count = 6
print_score = False
print(f'{color.BLUE}Select {minfo_feature_count} Best features using mutual_info_regression{color.END}')
X_train_mreg, X_test_mreg = select_feature_mutualinfo(X_train=X_train,y_train=y_train,
                                            X_test=X_test,feature_count=minfo_feature_count,
                                            print_score=print_score)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import f_regression, mutual_info_regression

# Reload the raw hourly data for an F-test comparison on unscaled features.
raw_hourly = pd.read_csv('../input/weather-dataset/weatherHistory.csv',
                             index_col=['Formatted Date'],
                             na_values=['9999.99'])
# Raw-string pattern with regex=True made explicit: newer pandas treats the
# pattern as a literal by default, in which case '\+' would never match.
raw_hourly.index = raw_hourly.index.str.replace(r'\+0200', '', regex=True)
raw_hourly.index = pd.to_datetime(raw_hourly.index, format="%Y-%m-%d %H:%M:%S", utc=True)
# Keep only the numeric predictors: drop the text columns and `Loud Cover`.
raw_hourly.drop(labels=['Summary', 'Precip Type', 'Daily Summary','Loud Cover'],axis=1,inplace=True)
y = raw_hourly['Humidity']
raw_hourly.drop(labels='Humidity',axis=1, inplace=True)
# NOTE: this rebinds X_train / X_test / y_train / y_test for the cells below.
X_train, X_test, y_train, y_test = train_test_split(raw_hourly, y,random_state=42)

# F-statistics and mutual information, each normalised by its maximum.
f_test, _ = f_regression(X_train, y_train)
f_test /= np.max(f_test)

# NOTE(review): `mi` is computed but not shown in the plot titles below.
mi = mutual_info_regression(X_train, y_train)
mi /= np.max(mi)

# One scatter plot per feature (first six columns) against the target.
plt.figure(figsize=(25, 15))
for i in range(6):
    plt.subplot(3, 2, i + 1)
    col = X_train.iloc[:,i]
    plt.scatter(col, y_train, edgecolor='black', s=20)
    plt.xlabel(f'{col.name}', fontsize=14)
    # Only the left-hand column of the 3x2 grid gets a y-axis label.
    if i % 2 == 0:
        plt.ylabel("$y$", fontsize=14)
    plt.title("F-test={:.2f}".format(f_test[i]),
              fontsize=16)

plt.tight_layout()
plt.show()

# Hourly Data Model Building

### Linear Regression

In [None]:
# Linear regression on, in order: the current X_train / X_test split, the
# f_regression selection and the mutual_info_regression selection.
l_reg_allfeat = perform_linearreg(X_train=X_train,X_test=X_test,y_train=y_train,y_test=y_test)
l_reg_freg = perform_linearreg(X_train=X_train_freg,X_test=X_test_freg,y_train=y_train,y_test=y_test)
l_reg_minfo = perform_linearreg(X_train=X_train_mreg,X_test=X_test_mreg,y_train=y_train,y_test=y_test)


### Random Forest Regression

In [None]:
# Random forest on the same three hourly feature sets; each call prints its
# own metrics and returns the fitted model.
print(f'{color.BLUE} RandomForest Regression using all features {color.END}')
random_reg_allfeat = perform_randomforestreg(X_train=X_train,X_test=X_test,
                                             y_train=y_train,y_test=y_test)
print(f'{color.BLUE} RandomForest Regression using 6 top features selected using `f_regression` {color.END}')
random_reg_freg = perform_randomforestreg(X_train=X_train_freg,X_test=X_test_freg,
                              y_train=y_train,y_test=y_test)
print(f'{color.BLUE} RandomForest Regression using 6 top features selected using `mutual_info_regression` {color.END}')
random_reg_minfo = perform_randomforestreg(X_train=X_train_mreg,X_test=X_test_mreg,
                                y_train=y_train,y_test=y_test)


### DecisionTree Regression

In [None]:
# Decision tree on the same three hourly feature sets; each call prints its
# own metrics and returns the fitted model.
print(f'{color.BLUE} DecisionTree Regression using all features {color.END}')
dtree_reg_allfeat = perform_decisiontreereg(X_train=X_train,
                                            X_test=X_test,
                                             y_train=y_train,
                                            y_test=y_test)
# Importance of each input feature in the all-features tree.
print(dtree_reg_allfeat.feature_importances_)
print(f'{color.BLUE} DecisionTree Regression using 6 top features selected using `f_regression` {color.END}')
dtree_reg_freg = perform_decisiontreereg(X_train=X_train_freg,
                                         X_test=X_test_freg,
                                         y_train=y_train,
                                         y_test=y_test)
print(f'{color.BLUE} DecisionTree Regression using 6 top features selected using `mutual_info_regression` {color.END}')
dtree_reg_minfo = perform_decisiontreereg(X_train=X_train_mreg,
                                          X_test=X_test_mreg,
                                          y_train=y_train,
                                          y_test=y_test)


# Daily Data Feature Selection

In [None]:
# Target: daily mean humidity, kept on its original scale.
y_daily = weather_daily['Humidity']
# Wrangle a separate frame that excludes the target, so that 'Humidity' cannot
# leak into the features, and the all-zero `Loud Cover` column, which the
# hourly pipeline drops as well. drop() returns a new frame, so weather_daily
# itself is left untouched by the in-place wrangling steps.
daily_features = weather_daily.drop(labels=['Humidity', 'Loud Cover'], axis=1)
(pre_processed_daily_df, dropna) = DataWrangler_Approach_A(daily_features).perform_wrangling()
pre_processed_daily_df.head()

## Feature Selection

In [None]:
# Split the pre-processed daily features and the daily humidity target;
# the fixed random_state makes the split reproducible.
X_train_daily, X_test_daily, y_train_daily, y_test_daily = train_test_split(
    pre_processed_daily_df,
    y_daily,random_state=42)
print(f'Shape of input X_train = {X_train_daily.shape} y_train = {y_train_daily.shape} X_test = {X_test_daily.shape} y_test = {y_test_daily.shape}')

# f_regression selection. The message reports the daily count that is actually
# passed to the selector, not the hourly one.
freg_feature_count_daily = 6
print_score_daily = False
print(f'{color.BLUE}Select {freg_feature_count_daily} Best features using f_regression{color.END}')
X_train_freg_daily, X_test_freg_daily = select_feature_freg(X_train=X_train_daily,
                                                            y_train=y_train_daily,
                                                            X_test=X_test_daily,
                                                            feature_count=freg_feature_count_daily,
                                                            print_score=print_score_daily)

print(f'Shape of output X_train = {X_train_freg_daily.shape} y_train = {y_train_daily.shape} X_test = {X_test_freg_daily.shape} y_test = {y_test_daily.shape}')

# mutual_info_regression selection, driven by its own daily count.
minfo_feature_count_daily = 6
print_score_daily = False
print(f'{color.BLUE}Select {minfo_feature_count_daily} Best features using mutual_info_regression{color.END}')
X_train_mreg_daily, X_test_mreg_daily = select_feature_mutualinfo(X_train=X_train_daily,
                                                                  y_train=y_train_daily,
                                                                  X_test=X_test_daily,
                                                                  feature_count=minfo_feature_count_daily,
                                                                  print_score=print_score_daily)
print(f'Shape of output X_train = {X_train_mreg_daily.shape} y_train = {y_train_daily.shape} X_test = {X_test_mreg_daily.shape} y_test = {y_test_daily.shape}')

# Daily Data Model Building

#### Linear Regression

In [None]:

# Linear regression on the daily data: all features, then the f_regression
# selection, then the mutual_info_regression selection.
l_reg_daily_allfeat = perform_linearreg(X_train=X_train_daily,
                                        X_test=X_test_daily,
                                        y_train=y_train_daily,
                                        y_test=y_test_daily)

l_reg_daily_freg = perform_linearreg(X_train=X_train_freg_daily,
                                     X_test=X_test_freg_daily,
                                     y_train=y_train_daily,
                                     y_test=y_test_daily)
l_reg_daily_minfo = perform_linearreg(X_train=X_train_mreg_daily,
                                      X_test=X_test_mreg_daily,
                                      y_train=y_train_daily,
                                      y_test=y_test_daily)


#### Random Forest Regression

In [None]:
print(f'{color.BLUE} RandomForest Regression using all features {color.END}')
random_reg_allfeat = perform_randomforestreg(X_train=X_train_daily,
                                        X_test=X_test_daily,
                                        y_train=y_train_daily,
                                        y_test=y_test_daily)
print(f'{color.BLUE} RandomForest Regression using 6 top features selected using `f_regression` {color.END}')
random_reg_freg = perform_randomforestreg(X_train=X_train_freg_daily,
                                     X_test=X_test_freg_daily,
                                     y_train=y_train_daily,
                                     y_test=y_test_daily)
print(f'{color.BLUE} RandomForest Regression using 6 top features selected using `mutual_info_regression` {color.END}')
random_reg_minfo = perform_randomforestreg(X_train=X_train_mreg_daily,
                                      X_test=X_test_mreg_daily,
                                      y_train=y_train_daily,
                                      y_test=y_test_daily)


**From the above experiment it is evident that by resampling the data to a daily dataset we are losing a lot of information, and hence the model accuracy goes down to zero, which makes the model unusable**