# Use case: Weather in Szeged 2006-2016
- This dataset contains weather records for Szeged from 2006 to 2016. The goal is to predict the apparent temperature from the measured temperature and humidity.
<!-- And Examine Is there a relationship between humidity-temperature and humidity-apparent temperature. -->

In [None]:
# Import required modules into program
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import scipy.stats as stats

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings("ignore")

import os
# Print the full path of every file found under the Kaggle input directory
for dirname, _, filenames in os.walk('/kaggle/input'):
    for file_path in (os.path.join(dirname, name) for name in filenames):
        print(file_path)

In [None]:
# Load the weather history CSV into a DataFrame and report its dimensions
df = pd.read_csv(filepath_or_buffer="/kaggle/input/szeged-weather/weatherHistory.csv")
print(f"df shape:\t {df.shape}")

In [None]:
# Preview the first rows of the raw data
df.head()

## Drop Duplicate Values

In [None]:
# Remove duplicate rows, then renumber the index from 0 without keeping the old labels
df = df.drop_duplicates().reset_index(drop=True)
df.shape

## Creating a new DataFrame with the required features

In [None]:
# Keep only the three columns used in this analysis
required_columns = ['Temperature (C)', 'Apparent Temperature (C)', 'Humidity']
df2 = df[required_columns]
df2.head()

In [None]:
# Check the structure of df2: print its shape, then the per-column summary from info()
print(f"df2 shape:\t {df2.shape}\n")
df2.info()

In [None]:
# Summary statistics for every column of df2
df2.describe(include='all')

## Finding Missing Values

In [None]:
# Collect the columns that contain at least one missing value
features_na = [column for column in df2.columns if df2[column].isnull().sum() > 0]

if not features_na:
    print("No any missing value found")
# Report the percentage of missing entries per affected column
for feature in features_na:
    print("{}: {} %".format(feature, np.round(df2[feature].isnull().mean()*100, 4)))

## Find Outliers

In [None]:
# Boxplot Numerical Features
numerical_features = [feature for feature in df2.columns if (df2[feature].dtypes != 'O')]

plt.figure(figsize=(15,45), facecolor='black')
plotnumber =1
for numerical_feature in numerical_features:
    ax = plt.subplot(12, 1, plotnumber)
    sns.boxplot(df2[numerical_feature])
    plt.xlabel(numerical_feature, color="white", size=15)
    plotnumber += 1
plt.show()

## Remove Outliers

In [None]:
# Work on a copy so df2 stays unfiltered; draw_boxplots compares df2 ("Before") with df3 ("After")
df3 = df2.copy()

In [None]:
def draw_boxplots(feature,min_val,max_val):
    """Show side-by-side box plots of ``feature`` before and after filtering.

    The left panel ("Before") is drawn from the module-level ``df2`` and the
    right panel ("After") from the module-level ``df3``.

    Args:
        feature: Name of a column present in both ``df2`` and ``df3``.
        min_val: Lower bound the caller filtered with. Not used for drawing;
            kept so existing calls keep working.
        max_val: Upper bound the caller filtered with. Not used for drawing;
            kept so existing calls keep working.
    """
    # Size the figure that actually holds the two panels; a separate
    # plt.figure(...) call here would only open an extra, empty figure.
    fig, axes = plt.subplots(1, 2, figsize=(15, 10))
    sns.boxplot(df2[feature], orient='h', ax=axes[0])
    axes[0].title.set_text("Before")
    sns.boxplot(df3[feature], orient='h', ax=axes[1])
    axes[1].title.set_text("After")
    # `pad` is passed by keyword (positional use is rejected by current
    # matplotlib) and applied after drawing so the titles are taken into account.
    fig.tight_layout(pad=0.2)
    plt.show()

In [None]:
feature = 'Humidity'
min_val, max_val = 0.2, 1

# Keep only rows whose value lies strictly between min_val and max_val.
# NOTE(review): the strict '<' also drops rows where Humidity equals max_val (1) - confirm this is intended.
within_bounds = (df3[feature] > min_val) & (df3[feature] < max_val)
df3 = df3[within_bounds]
print('Shape: ',df3.shape)
draw_boxplots(feature, min_val, max_val)

In [None]:
feature = 'Temperature (C)'
min_val, max_val = -10, 40

# Keep only rows whose value lies strictly between min_val and max_val
within_bounds = (df3[feature] > min_val) & (df3[feature] < max_val)
df3 = df3[within_bounds]
print('Shape: ',df3.shape)
draw_boxplots(feature, min_val, max_val)

In [None]:
# feature = 'Apparent Temperature (C)'
# min_val = -10
# max_val = 40
# df3 = df3[(df3[feature]>min_val) & (df3[feature]<max_val)]
# print('Shape: ',df3.shape)
# draw_boxplots(feature, min_val, max_val)

## Splitting Dataset into Training and Testing

In [None]:
# Features: temperature and humidity (columns 0 and 2 of df3); target: apparent temperature (column 1)
X = df3[['Temperature (C)', 'Humidity']]
y = df3[['Apparent Temperature (C)']]
print(f"X shape: {X.shape}\ny shape: {y.shape}")

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Discard the shuffled original row labels so every split is indexed from 0
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

print(f"X_train shape: {X_train.shape}\tX_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}\ty_test shape: {y_test.shape}")

## Transformation

In [None]:
# Skewness of each training feature, checked before any transformation is applied
X_train.skew()

In [None]:
def draw_qq_hist(feature):
    """Plot a normal probability plot and a distribution plot for one column.

    The data is read from the module-level ``X_train``.

    Args:
        feature: Name of the ``X_train`` column to plot.
    """
    values = X_train[feature]
    plt.figure(figsize=(20,80), facecolor='white')

    # First panel: probability plot of the values against a normal distribution
    plt.subplot(10,3,1)
    stats.probplot(values, dist="norm", plot=plt)
    plt.xlabel(feature)

    # Second panel: distribution of the same values
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases - confirm the installed version
    plt.subplot(10,3,2)
    sns.distplot(values)
    plt.xlabel(feature)

    plt.show()

In [None]:
# Draw the probability/distribution plots for every training feature
for feature in list(X_train.columns):
    draw_qq_hist(feature)

### Humidity
- Left Skewed

In [None]:
# create columns variables to hold the columns that need transformation
columns = ['Humidity']
# create the function transformer object with exponential transformation
# exponential_transformer = FunctionTransformer(np.exp)
exponential_transformer = FunctionTransformer(lambda x: np.exp(x*3), validate=True) # 3 is best found value for get minimum skew
# apply the transformation 
data_new = exponential_transformer.transform(X_train[columns])
df_new = pd.DataFrame(data_new, columns=columns)
# replace new values with previous data frame
X_train['Humidity']=df_new['Humidity']

X_train.skew()

In [None]:
# Re-draw the plots for Humidity after the exponential transformation
draw_qq_hist('Humidity')

## Feature Scaling

In [None]:
# Fit one standard scaler on the training features and one on the training target
scaler_x = StandardScaler()
scaler_x.fit(X_train)
scaler_y = StandardScaler()
scaler_y.fit(y_train)
# Alternative kept for reference: scaler = MinMaxScaler().fit(X_train)

In [None]:
# Scale both splits with the scalers that were fitted on the training split
X_train_scale = scaler_x.transform(X_train)
X_test_scale = scaler_x.transform(X_test)
y_train_scale = scaler_y.transform(y_train)
y_test_scale = scaler_y.transform(y_test)

# Rebuild DataFrames from the scaled arrays. The column Index is passed
# directly: wrapping it in a list (columns=[X_train.columns]) makes pandas
# build MultiIndex columns instead of plain string labels.
X_train = pd.DataFrame(X_train_scale, columns=X_train.columns)
X_test = pd.DataFrame(X_test_scale, columns=X_test.columns)
y_train = pd.DataFrame(y_train_scale, columns=['Apparent Temperature (C)'])
y_test = pd.DataFrame(y_test_scale, columns=['Apparent Temperature (C)'])

In [None]:
#plot a univariate distribution of continues observations
plt.figure(figsize=(25,50), facecolor='white')
plotnumber =1
for feature in ['Temperature (C)', 'Humidity']:
    ax = plt.subplot(10,3,plotnumber)
    sns.distplot(X_train[feature])
    plt.xlabel(feature)
    plotnumber+=1
plt.show()

## Model Building and Training

In [None]:
# Fit an ordinary linear regression on the scaled training data
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [None]:
# Predict the (scaled) target for the held-out test features
predictions = model.predict(X_test)

In [None]:
# Wrap the raw predictions in a single-column DataFrame for inspection and plotting
y_hat = pd.DataFrame(data=predictions, columns=["predicted"])
y_hat.head()

##  R2 Score (coefficient of determination) regression score function.
- R-squared measures how much of the variability in the dependent variable is explained by the model.

In [None]:
# coefficient of determination R^2 of the prediction.
R_Square_Score = model.score(X_test, y_test)
R_Square_Score

###### We can **improve** this value further by removing outliers in the target variable (Apparent Temperature (C)).

In [None]:
# mean squared error
mse_error = mean_squared_error(y_test, y_hat)
mse_error

In [None]:
# Compare the first 200 predictions with the corresponding actual test values
plt.figure(figsize=(12, 6))

plt.plot(y_hat[:200], label = "Predicted")
plt.plot(y_test[:200], label = "Actual")

# The target was standardized with scaler_y, so the y-axis is in scaled units
plt.xlabel('Sample index')
plt.ylabel('Apparent Temperature (C), standardized')
plt.title('Predictions vs Actual')
plt.legend()

plt.show()

## END