In [90]:
# Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Loading Data and Exploration

In [92]:
# Load the data
weather_dataset = pd.read_csv('Weather Data.csv')

# Print first 5 rows
weather_dataset.head()

Unnamed: 0,Date/Time,Temp_C,Dew Point Temp_C,Rel Hum_%,Wind Speed_km/h,Visibility_km,Press_kPa,Weather
0,1/1/2012 0:00,-1.8,-3.9,86,4,8.0,101.24,Fog
1,1/1/2012 1:00,-1.8,-3.7,87,4,8.0,101.24,Fog
2,1/1/2012 2:00,-1.8,-3.4,89,7,4.0,101.26,"Freezing Drizzle,Fog"
3,1/1/2012 3:00,-1.5,-3.2,88,6,4.0,101.27,"Freezing Drizzle,Fog"
4,1/1/2012 4:00,-1.5,-3.3,88,7,4.8,101.23,Fog


Exploring the dataset

In [94]:
# Check number of rows and columns
weather_dataset.shape

(8784, 8)

In [95]:
# Check the dataset info and statistics
weather_dataset.info

<bound method DataFrame.info of              Date/Time  Temp_C  Dew Point Temp_C  Rel Hum_%  Wind Speed_km/h  \
0        1/1/2012 0:00    -1.8              -3.9         86                4   
1        1/1/2012 1:00    -1.8              -3.7         87                4   
2        1/1/2012 2:00    -1.8              -3.4         89                7   
3        1/1/2012 3:00    -1.5              -3.2         88                6   
4        1/1/2012 4:00    -1.5              -3.3         88                7   
...                ...     ...               ...        ...              ...   
8779  12/31/2012 19:00     0.1              -2.7         81               30   
8780  12/31/2012 20:00     0.2              -2.4         83               24   
8781  12/31/2012 21:00    -0.5              -1.5         93               28   
8782  12/31/2012 22:00    -0.2              -1.8         89               28   
8783  12/31/2012 23:00     0.0              -2.1         86               30   

      V

In [96]:
weather_dataset.describe

<bound method NDFrame.describe of              Date/Time  Temp_C  Dew Point Temp_C  Rel Hum_%  Wind Speed_km/h  \
0        1/1/2012 0:00    -1.8              -3.9         86                4   
1        1/1/2012 1:00    -1.8              -3.7         87                4   
2        1/1/2012 2:00    -1.8              -3.4         89                7   
3        1/1/2012 3:00    -1.5              -3.2         88                6   
4        1/1/2012 4:00    -1.5              -3.3         88                7   
...                ...     ...               ...        ...              ...   
8779  12/31/2012 19:00     0.1              -2.7         81               30   
8780  12/31/2012 20:00     0.2              -2.4         83               24   
8781  12/31/2012 21:00    -0.5              -1.5         93               28   
8782  12/31/2012 22:00    -0.2              -1.8         89               28   
8783  12/31/2012 23:00     0.0              -2.1         86               30   

     

In [97]:
# Unique values of the 'Weather' column
unique_values = weather_dataset['Weather'].unique()
print(unique_values)

['Fog' 'Freezing Drizzle,Fog' 'Mostly Cloudy' 'Cloudy' 'Rain'
 'Rain Showers' 'Mainly Clear' 'Snow Showers' 'Snow' 'Clear'
 'Freezing Rain,Fog' 'Freezing Rain' 'Freezing Drizzle' 'Rain,Snow'
 'Moderate Snow' 'Freezing Drizzle,Snow' 'Freezing Rain,Snow Grains'
 'Snow,Blowing Snow' 'Freezing Fog' 'Haze' 'Rain,Fog' 'Drizzle,Fog'
 'Drizzle' 'Freezing Drizzle,Haze' 'Freezing Rain,Haze' 'Snow,Haze'
 'Snow,Fog' 'Snow,Ice Pellets' 'Rain,Haze' 'Thunderstorms,Rain'
 'Thunderstorms,Rain Showers' 'Thunderstorms,Heavy Rain Showers'
 'Thunderstorms,Rain Showers,Fog' 'Thunderstorms' 'Thunderstorms,Rain,Fog'
 'Thunderstorms,Moderate Rain Showers,Fog' 'Rain Showers,Fog'
 'Rain Showers,Snow Showers' 'Snow Pellets' 'Rain,Snow,Fog'
 'Moderate Rain,Fog' 'Freezing Rain,Ice Pellets,Fog'
 'Drizzle,Ice Pellets,Fog' 'Drizzle,Snow' 'Rain,Ice Pellets'
 'Drizzle,Snow,Fog' 'Rain,Snow Grains' 'Rain,Snow,Ice Pellets'
 'Snow Showers,Fog' 'Moderate Snow,Blowing Snow']


In [98]:
# Unique value count of the 'Weather' column
count = weather_dataset['Weather'].nunique()
print(count)

50


In [99]:
# Occurrence count of unique values
occurrence_count = weather_dataset['Weather'].value_counts()
print(occurrence_count)

Weather
Mainly Clear                               2106
Mostly Cloudy                              2069
Cloudy                                     1728
Clear                                      1326
Snow                                        390
Rain                                        306
Rain Showers                                188
Fog                                         150
Rain,Fog                                    116
Drizzle,Fog                                  80
Snow Showers                                 60
Drizzle                                      41
Snow,Fog                                     37
Snow,Blowing Snow                            19
Rain,Snow                                    18
Thunderstorms,Rain Showers                   16
Haze                                         16
Drizzle,Snow,Fog                             15
Freezing Rain                                14
Freezing Drizzle,Snow                        11
Freezing Drizzle                

## Data Preprocessing

### Getting rid of missng data instances

In [102]:
missing_values = weather_dataset['Weather'].isnull().sum()
print(missing_values)

0


In [107]:
# 'Date/Time' column to datetime format
weather_dataset['Date/Time'] = pd.to_datetime(weather_dataset['Date/Time'])

# Extract time features that might be useful
weather_dataset['Hour'] = weather_dataset['Date/Time'].dt.hour
weather_dataset['Month'] = weather_dataset['Date/Time'].dt.month

### Encoding the catagorical variable 'Weather' using Label Encoder


In [109]:
from sklearn.preprocessing import LabelEncoder
LE = LabelEncoder()
weather_dataset['Weather_Encoded'] = LE.fit_transform(weather_dataset['Weather'])
print(weather_dataset[['Weather', 'Weather_Encoded']])

                   Weather  Weather_Encoded
0                      Fog                7
1                      Fog                7
2     Freezing Drizzle,Fog                9
3     Freezing Drizzle,Fog                9
4                      Fog                7
...                    ...              ...
8779                  Snow               35
8780                  Snow               35
8781                  Snow               35
8782                  Snow               35
8783                  Snow               35

[8784 rows x 2 columns]


In [111]:
weather_dataset

Unnamed: 0,Date/Time,Temp_C,Dew Point Temp_C,Rel Hum_%,Wind Speed_km/h,Visibility_km,Press_kPa,Weather,Hour,Month,Weather_Encoded
0,2012-01-01 00:00:00,-1.8,-3.9,86,4,8.0,101.24,Fog,0,1,7
1,2012-01-01 01:00:00,-1.8,-3.7,87,4,8.0,101.24,Fog,1,1,7
2,2012-01-01 02:00:00,-1.8,-3.4,89,7,4.0,101.26,"Freezing Drizzle,Fog",2,1,9
3,2012-01-01 03:00:00,-1.5,-3.2,88,6,4.0,101.27,"Freezing Drizzle,Fog",3,1,9
4,2012-01-01 04:00:00,-1.5,-3.3,88,7,4.8,101.23,Fog,4,1,7
...,...,...,...,...,...,...,...,...,...,...,...
8779,2012-12-31 19:00:00,0.1,-2.7,81,30,9.7,100.13,Snow,19,12,35
8780,2012-12-31 20:00:00,0.2,-2.4,83,24,9.7,100.03,Snow,20,12,35
8781,2012-12-31 21:00:00,-0.5,-1.5,93,28,4.8,99.95,Snow,21,12,35
8782,2012-12-31 22:00:00,-0.2,-1.8,89,28,9.7,99.91,Snow,22,12,35


### Scaling the input variables

In [113]:
# For models that need numerical input
from sklearn.preprocessing import StandardScaler
features = ['Temp_C', 'Dew Point Temp_C', 'Rel Hum_%', 'Wind Speed_km/h', 
            'Visibility_km', 'Press_kPa', 'Hour', 'Month']

X = weather_dataset[features]
y = weather_dataset['Weather_Encoded']

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [117]:
df = pd.read_csv('weather_classification_data.csv')
df

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Cloud Cover,Atmospheric Pressure,UV Index,Season,Visibility (km),Location,Weather Type
0,14.0,73,9.5,82.0,partly cloudy,1010.82,2,Winter,3.5,inland,Rainy
1,39.0,96,8.5,71.0,partly cloudy,1011.43,7,Spring,10.0,inland,Cloudy
2,30.0,64,7.0,16.0,clear,1018.72,5,Spring,5.5,mountain,Sunny
3,38.0,83,1.5,82.0,clear,1026.25,7,Spring,1.0,coastal,Sunny
4,27.0,74,17.0,66.0,overcast,990.67,1,Winter,2.5,mountain,Rainy
...,...,...,...,...,...,...,...,...,...,...,...
13195,10.0,74,14.5,71.0,overcast,1003.15,1,Summer,1.0,mountain,Rainy
13196,-1.0,76,3.5,23.0,cloudy,1067.23,1,Winter,6.0,coastal,Snowy
13197,30.0,77,5.5,28.0,overcast,1012.69,3,Autumn,9.0,coastal,Cloudy
13198,3.0,76,10.0,94.0,overcast,984.27,0,Winter,2.0,inland,Snowy


In [129]:
missing = df['Weather Type'].isnull().sum()
missing

missing2 = df['Temperature'].isnull().sum()
missing2

0