# Scaling ClimateWins Dataset

### This Script Contains the Following Points:
#### 1. Importing Libraries & Dataset
#### 2. Scaling All Columns in Dataset
#### 3. Saving Scaled Dataset

## 1. Importing Libraries & Dataset

In [76]:
# Importing Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import os
import sklearn
from sklearn.preprocessing import StandardScaler

In [77]:
# Creating folder path to project folder.
# The location can be overridden per machine through the CLIMATEWINS_PATH
# environment variable; when it is not set, the original local path is used,
# so existing runs behave exactly as before.
path = os.environ.get('CLIMATEWINS_PATH', r'/Users/C SaiVishwanath/Desktop/ClimateWins')

In [78]:
# Building the full location of the original weather dataset, then loading it
source_csv = os.path.join(path, '01 Data/Original Data/original_weather_prediction_dataset.csv')
df = pd.read_csv(source_csv)

In [79]:
# Previewing the first five rows of the loaded dataset
df.head(n=5)

Unnamed: 0,DATE,MONTH,BASEL_cloud_cover,BASEL_wind_speed,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_snow_depth,BASEL_sunshine,...,VALENTIA_cloud_cover,VALENTIA_humidity,VALENTIA_pressure,VALENTIA_global_radiation,VALENTIA_precipitation,VALENTIA_snow_depth,VALENTIA_sunshine,VALENTIA_temp_mean,VALENTIA_temp_min,VALENTIA_temp_max
0,19600101,1,7,2.1,0.85,1.018,0.32,0.09,0,0.7,...,5,0.88,1.0003,0.45,0.34,0,4.7,8.5,6.0,10.9
1,19600102,1,6,2.1,0.84,1.018,0.36,1.05,0,1.1,...,7,0.91,1.0007,0.25,0.84,0,0.7,8.9,5.6,12.1
2,19600103,1,8,2.1,0.9,1.018,0.18,0.3,0,0.0,...,7,0.91,1.0096,0.17,0.08,0,0.1,10.5,8.1,12.9
3,19600104,1,3,2.1,0.92,1.018,0.58,0.0,0,4.1,...,7,0.86,1.0184,0.13,0.98,0,0.0,7.4,7.3,10.6
4,19600105,1,6,2.1,0.95,1.018,0.65,0.14,0,5.4,...,3,0.8,1.0328,0.46,0.0,0,5.7,5.7,3.0,8.4


In [80]:
# Printing the data type of every column (all columns are scaled further below)
df.dtypes

DATE                     int64
MONTH                    int64
BASEL_cloud_cover        int64
BASEL_wind_speed       float64
BASEL_humidity         float64
                        ...   
VALENTIA_snow_depth      int64
VALENTIA_sunshine      float64
VALENTIA_temp_mean     float64
VALENTIA_temp_min      float64
VALENTIA_temp_max      float64
Length: 170, dtype: object

In [81]:
# Counting the missing values in each column
df.isna().sum()

DATE                   0
MONTH                  0
BASEL_cloud_cover      0
BASEL_wind_speed       0
BASEL_humidity         0
                      ..
VALENTIA_snow_depth    0
VALENTIA_sunshine      0
VALENTIA_temp_mean     0
VALENTIA_temp_min      0
VALENTIA_temp_max      0
Length: 170, dtype: int64

In [82]:
# No missing values

In [83]:
# Printing shape of dataset as (number of rows, number of columns)
df.shape

(22950, 170)

In [84]:
# Collecting every row that is an exact repeat of an earlier row
df_dups = df.loc[df.duplicated()]

In [85]:
# Printing the duplicated rows; an empty DataFrame means no duplicates were found
print(df_dups)

Empty DataFrame
Columns: [DATE, MONTH, BASEL_cloud_cover, BASEL_wind_speed, BASEL_humidity, BASEL_pressure, BASEL_global_radiation, BASEL_precipitation, BASEL_snow_depth, BASEL_sunshine, BASEL_temp_mean, BASEL_temp_min, BASEL_temp_max, BELGRADE_cloud_cover, BELGRADE_humidity, BELGRADE_pressure, BELGRADE_global_radiation, BELGRADE_precipitation, BELGRADE_sunshine, BELGRADE_temp_mean, BELGRADE_temp_min, BELGRADE_temp_max, BUDAPEST_cloud_cover, BUDAPEST_humidity, BUDAPEST_pressure, BUDAPEST_global_radiation, BUDAPEST_precipitation, BUDAPEST_sunshine, BUDAPEST_temp_mean, BUDAPEST_temp_min, BUDAPEST_temp_max, DEBILT_cloud_cover, DEBILT_wind_speed, DEBILT_humidity, DEBILT_pressure, DEBILT_global_radiation, DEBILT_precipitation, DEBILT_sunshine, DEBILT_temp_mean, DEBILT_temp_min, DEBILT_temp_max, DUSSELDORF_cloud_cover, DUSSELDORF_wind_speed, DUSSELDORF_humidity, DUSSELDORF_pressure, DUSSELDORF_global_radiation, DUSSELDORF_precipitation, DUSSELDORF_snow_depth, DUSSELDORF_sunshine, DUSSELDORF_

In [86]:
# No duplicates found

## 2. Scaling All Columns in Dataset

In [88]:
# Creating scaler with StandardScaler from sklearn.preprocessing
# StandardScaler standardizes each column independently: z = (x - column mean) / column std
# It does not require normally distributed data; it only shifts and rescales each column
# End data should have a mean of around 0 and a standard deviation of 1 in every column

scaler = StandardScaler()

In [89]:
# Scaling all columns in the dataset (each column is standardized on its own).
# By default fit_transform returns a plain NumPy array, so both the column labels
# and the row index are re-attached explicitly; without index=df.index the result
# would silently get a fresh 0..n-1 index and stop lining up with df if rows
# are ever filtered or re-indexed before this cell.
# NOTE(review): DATE and MONTH are scaled together with the weather measurements,
# as the section title states - confirm downstream notebooks do not need them
# in their original form.
df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)

In [90]:
# Previewing the first five rows of the scaled dataset
df_scaled.head(n=5)

Unnamed: 0,DATE,MONTH,BASEL_cloud_cover,BASEL_wind_speed,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_snow_depth,BASEL_sunshine,...,VALENTIA_cloud_cover,VALENTIA_humidity,VALENTIA_pressure,VALENTIA_global_radiation,VALENTIA_precipitation,VALENTIA_snow_depth,VALENTIA_sunshine,VALENTIA_temp_mean,VALENTIA_temp_min,VALENTIA_temp_max
0,-1.707663,-1.599964,0.660514,-0.02793,0.826097,-0.001949,-1.101066,-0.265148,-0.179228,-0.902918,...,-0.443701,0.761754,-1.299744,-0.806427,-0.088407,-0.024706,0.372147,-0.668215,-0.519743,-0.752237
1,-1.707657,-1.599964,0.244897,-0.02793,0.73576,-0.001949,-1.058108,1.65876,-0.179228,-0.810126,...,0.783085,1.18358,-1.262455,-1.042055,0.503361,-0.024706,-0.829285,-0.548046,-0.629054,-0.407141
2,-1.707652,-1.599964,1.07613,-0.02793,1.277781,-0.001949,-1.25142,0.155707,-0.179228,-1.065304,...,0.783085,1.18358,-0.432779,-1.136306,-0.396127,-0.024706,-1.0095,-0.067372,0.054135,-0.177078
3,-1.707646,-1.599964,-1.001953,-0.02793,1.458455,-0.001949,-0.821838,-0.445514,-0.179228,-0.114186,...,0.783085,0.480538,0.387574,-1.183432,0.669056,-0.024706,-1.039536,-0.998679,-0.164486,-0.838511
4,-1.707641,-1.599964,0.244897,-0.02793,1.729466,-0.001949,-0.746661,-0.164944,-0.179228,0.187388,...,-1.670486,-0.363113,1.72997,-0.794645,-0.49081,-0.024706,0.672505,-1.509396,-1.339569,-1.471186


In [91]:
# Averaging the per-column means of the scaled data; a result near 0 shows the columns are centered
overall_mean = df_scaled.mean().mean()
print(overall_mean)

1.8691289705283285e-16


In [92]:
# Values are correctly centered around 0 after scaling

In [93]:
# Averaging the per-column standard deviations of the scaled data to check they are close to 1.
# NOTE: pandas .std() uses the sample formula (ddof=1) by default, while StandardScaler
# divides by the population standard deviation (ddof=0), so a value marginally above 1
# is expected here rather than exactly 1.
df_scaled.std(axis=0).mean()

1.000021787204379

In [94]:
# Output is very close to 1, indicating that the scaling process was successful

## 3. Saving Scaled Dataset

In [96]:
# Saving the scaled dataset as a pickle file in the Prepared Data folder
pickle_target = os.path.join(path, '01 Data/Prepared Data/Dataset_scaled.pkl')
df_scaled.to_pickle(pickle_target)

In [97]:
# Exporting scaled dataset to Prepared Data folder as csv file.
# index=False keeps the 0..n-1 row index out of the file. Otherwise it is written
# as an unnamed first column and comes back as 'Unnamed: 0' on a plain pd.read_csv,
# giving 171 columns instead of the 170 stored in the pkl export.
# NOTE(review): any reader that loads this file with index_col=0 or drops
# 'Unnamed: 0' must remove that step after this change.

df_scaled.to_csv(os.path.join(path, '01 Data/Prepared Data/Dataset_scaled.csv'), index=False)