In [64]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [65]:
# Reading Weather Dataset
weather = pd.read_csv("seattle-weather.csv")

In [66]:
weather.head()

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,2012-01-01,0.0,12.8,5.0,4.7,drizzle
1,2012-01-02,10.9,10.6,2.8,4.5,rain
2,2012-01-03,0.8,11.7,7.2,2.3,rain
3,2012-01-04,20.3,12.2,5.6,4.7,rain
4,2012-01-05,1.3,8.9,2.8,6.1,rain


# Data Analysis


Things you can try:

Check the data dimensions: Find out how many samples and features are in the dataset.

Check the target distribution: Check how many samples are there in each class to see if the dataset is balanced or imbalanced.

Check for missing values: Find out if there are any missing values in the dataset, and decide how to handle them.

Check for outliers: Look for any data points that are significantly different from the others, and decide how to handle them.

Visualize the data: Use various visualization techniques to better understand the data, such as histograms, boxplots, scatterplots, and heatmaps.

Check the correlation between features: Find out if there are any strong correlations between the features, and decide how to handle them.

Feature engineering: If needed, create new features based on the existing ones, to improve the performance of the model.

Split the data: Divide the data into training, validation, and testing sets.

Normalize/Standardize the data: If required, scale the data to have a mean of zero and a standard deviation of one.

Evaluate baselines: Determine the accuracy of simple models to establish a baseline performance for the dataset.

Explore feature importance: Use methods like feature importance or permutation importance to understand the importance of each feature in the model.

Determine model requirements: Understand the requirements of the model, such as the number of samples needed, the number of features, and the complexity of the model.

In [67]:
#Null Values
weather.isna().sum()

date             0
precipitation    0
temp_max         0
temp_min         0
wind             0
weather          0
dtype: int64

In [68]:
#Total (rows,columns) of data
weather.shape

(1461, 6)

In [69]:
# Datatypes of variables/columns of dataset
weather.dtypes

date              object
precipitation    float64
temp_max         float64
temp_min         float64
wind             float64
weather           object
dtype: object

In [70]:
# stats of dataset
weather.describe()

Unnamed: 0,precipitation,temp_max,temp_min,wind
count,1461.0,1461.0,1461.0,1461.0
mean,3.029432,16.439083,8.234771,3.241136
std,6.680194,7.349758,5.023004,1.437825
min,0.0,-1.6,-7.1,0.4
25%,0.0,10.6,4.4,2.2
50%,0.0,15.6,8.3,3.0
75%,2.8,22.2,12.2,4.0
max,55.9,35.6,18.3,9.5


In [71]:
# info about dataset
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1461 entries, 0 to 1460
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           1461 non-null   object 
 1   precipitation  1461 non-null   float64
 2   temp_max       1461 non-null   float64
 3   temp_min       1461 non-null   float64
 4   wind           1461 non-null   float64
 5   weather        1461 non-null   object 
dtypes: float64(4), object(2)
memory usage: 68.6+ KB


In [72]:
# dimension of dataset
weather.ndim

2

In [73]:
# size of dataset
weather.size

8766

In [74]:
# We have to predict the weather. checking unique values of target
weather.weather.unique()

array(['drizzle', 'rain', 'sun', 'snow', 'fog'], dtype=object)

In [75]:
#number of unique elements
weather.weather.nunique()

5

In [76]:
# checking if dataset (target) is balanced : frequency of all unique elements of target
weather.weather.value_counts()

rain       641
sun        640
fog        101
drizzle     53
snow        26
Name: weather, dtype: int64

# Label Encoding

In [77]:
from sklearn.preprocessing import LabelEncoder

weather['weather_label'] = LabelEncoder().fit_transform(weather['weather'])  #weather['weather'].apply(LabelEncoder().fit_transform)

weather.head()

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather,weather_label
0,2012-01-01,0.0,12.8,5.0,4.7,drizzle,0
1,2012-01-02,10.9,10.6,2.8,4.5,rain,2
2,2012-01-03,0.8,11.7,7.2,2.3,rain,2
3,2012-01-04,20.3,12.2,5.6,4.7,rain,2
4,2012-01-05,1.3,8.9,2.8,6.1,rain,2


In [78]:
weather.corrwith(weather['weather_label']).sort_values(ascending=False)

  weather.corrwith(weather['weather_label']).sort_values(ascending=False)


weather_label    1.000000
temp_max         0.322337
temp_min         0.154981
wind            -0.065858
precipitation   -0.267388
dtype: float64

In [79]:
# Dictionary of weather and label for future use
weather_dict = dict(zip(weather.weather,weather.weather_label))
weather_dict

{'drizzle': 0, 'rain': 2, 'sun': 4, 'snow': 3, 'fog': 1}

In [80]:
# Making date separate columns and making it our index
weather['date'] = pd.to_datetime(weather['date'])
weather['year'] = weather.date.dt.year
weather['month'] = weather.date.dt.month
weather['day'] = weather.date.dt.day

weather.head(10)


Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather,weather_label,year,month,day
0,2012-01-01,0.0,12.8,5.0,4.7,drizzle,0,2012,1,1
1,2012-01-02,10.9,10.6,2.8,4.5,rain,2,2012,1,2
2,2012-01-03,0.8,11.7,7.2,2.3,rain,2,2012,1,3
3,2012-01-04,20.3,12.2,5.6,4.7,rain,2,2012,1,4
4,2012-01-05,1.3,8.9,2.8,6.1,rain,2,2012,1,5
5,2012-01-06,2.5,4.4,2.2,2.2,rain,2,2012,1,6
6,2012-01-07,0.0,7.2,2.8,2.3,rain,2,2012,1,7
7,2012-01-08,0.0,10.0,2.8,2.0,sun,4,2012,1,8
8,2012-01-09,4.3,9.4,5.0,3.4,rain,2,2012,1,9
9,2012-01-10,1.0,6.1,0.6,3.4,rain,2,2012,1,10


In [81]:
#finalizing our dataset
weather_final = weather.drop(['weather'],axis=1)
weather_final.set_index('date',inplace=True)

weather_final.head()

Unnamed: 0_level_0,precipitation,temp_max,temp_min,wind,weather_label,year,month,day
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2012-01-01,0.0,12.8,5.0,4.7,0,2012,1,1
2012-01-02,10.9,10.6,2.8,4.5,2,2012,1,2
2012-01-03,0.8,11.7,7.2,2.3,2,2012,1,3
2012-01-04,20.3,12.2,5.6,4.7,2,2012,1,4
2012-01-05,1.3,8.9,2.8,6.1,2,2012,1,5


In [82]:
# Rearranging columns of dataset
cols = list(weather_final.columns)
cols = cols[-3:]+cols[:-3]
weather_final = weather_final[cols]
weather_final.head()

Unnamed: 0_level_0,year,month,day,precipitation,temp_max,temp_min,wind,weather_label
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2012-01-01,2012,1,1,0.0,12.8,5.0,4.7,0
2012-01-02,2012,1,2,10.9,10.6,2.8,4.5,2
2012-01-03,2012,1,3,0.8,11.7,7.2,2.3,2
2012-01-04,2012,1,4,20.3,12.2,5.6,4.7,2
2012-01-05,2012,1,5,1.3,8.9,2.8,6.1,2


# Model 


In [83]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB

# Since our dataset is imbalanced, we use the below samplers 
# SMOTE: 
'''
SMOTE (Synthetic Minority Over-sampling Technique) is a technique used to handle imbalanced datasets in machine learning.
It is particularly useful when the number of instances of one class is much lower than the number of instances of the other class(es). 
SMOTE works by creating synthetic examples of the minority class by interpolating between the existing minority class samples.
'''

from imblearn.over_sampling import SMOTE

'''
RandomUnderSampler is a technique used to handle imbalanced datasets in machine learning.
It is used to reduce the number of instances of the majority class in the dataset by randomly selecting a subset of the majority class samples,
so that the number of samples in the majority class becomes equal to the number of samples in the minority class.
'''

from imblearn.under_sampling import RandomUnderSampler 
from sklearn.pipeline import Pipeline

In [84]:
# Data Split
X = weather_final.drop(['weather_label'], axis = 1)
y = weather_final['weather_label']

    # strategy building according to number of classes
over_strategy = {0 : 1000, 1 : 1000, 2 : 1000, 3 : 1000, 4 : 2000}
under_strategy = {0 : 1000, 1 : 1000, 2 : 1000, 3 : 1000, 4 : 2000}

    # sampling strategy defined
oversample = SMOTE(sampling_strategy = over_strategy)
undersample = RandomUnderSampler(sampling_strategy = under_strategy)

    #since our data is sampled with interpolated instances, we fit our features and target
X_final,y = oversample.fit_resample(X,y)
X_final,y = undersample.fit_resample(X_final, y)


X_train,X_test,y_train,y_test = train_test_split(X_final,y,random_state = 10, test_size = 0.2)



In [85]:
from sklearn.metrics import accuracy_score, classification_report

In [86]:
# Gaussian Naive-Bayes
model = GaussianNB()

np.random.seed(10)
model.fit(X_train,y_train)

pred_all = model.predict(X_final)
pred_train = model.predict(X_train)
pred_test = model.predict(X_test)


print(f'Accuracy Score on All Data : {round(accuracy_score(y, pred_all),2)*100}')
print(f'Accuracy Score on Train Data : {round(accuracy_score(y_train, pred_train),2)*100}')
print(f'Accuracy Score on Test Data : {round(accuracy_score(y_test, pred_test),2)*100}')

Accuracy Score on All Data : 73.0
Accuracy Score on Train Data : 73.0
Accuracy Score on Test Data : 73.0


"""
Data type: GaussianNB is used for continuous data and assumes a Gaussian distribution of the input features, while MultinomialNB is used for discrete data and assumes a multinomial distribution of the input features.

Input format: GaussianNB expects real-valued input features, while MultinomialNB expects integer counts of input features.

Scaling: GaussianNB does not require feature scaling, while MultinomialNB works better with scaled input features.

Number of classes: GaussianNB is used for binary and multiclass classification problems, while MultinomialNB is typically used for multiclass classification problems.

Handling negative values: GaussianNB can handle negative values, while MultinomialNB cannot handle negative values and requires non-negative input features.

Application: GaussianNB is commonly used in image processing, spam detection, and natural language processing, while MultinomialNB is commonly used in text classification, document classification, and sentiment analysis.

Handling missing values: GaussianNB can handle missing values by ignoring them during training, while MultinomialNB requires imputation of missing values before training.
"""

In [87]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

clf = DecisionTreeClassifier()

X = weather_final.drop(['weather_label'],axis=1)
y = weather_final['weather_label']

print(X.shape,y.shape)
print(X_r.shape,y_r.shape)


smote = SMOTE()
X_r,y_r = smote.fit_resample(X,y)

# X_train,X_test,y_train,y_test = train_test_split(X,y,random_state = 10, test_size = 0.2)
# accuracy = 0.79


X_train,X_test,y_train,y_test = train_test_split(X_r,y_r,random_state = 10, test_size = 0.2)

clf.fit(X_train,y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test,y_pred))
print(accuracy_score(y_test,y_pred))

(1461, 7) (1461,)
(3205, 7) (3205,)
              precision    recall  f1-score   support

           0       0.85      0.89      0.87       122
           1       0.73      0.80      0.76       124
           2       0.95      0.93      0.94       129
           3       0.98      1.00      0.99       125
           4       0.70      0.62      0.66       141

    accuracy                           0.84       641
   macro avg       0.84      0.85      0.84       641
weighted avg       0.84      0.84      0.84       641

0.8424336973478939
