In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
import pickle

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
data = pd.read_csv("weather_data.csv")

In [3]:
data.head()

Unnamed: 0,Temperature (C),Humidity,Wind Speed (km/h),Visibility (km),Summary
0,9.472222,0.89,14.1197,15.8263,Partly Cloudy
1,9.355556,0.86,14.2646,15.8263,Partly Cloudy
2,9.377778,0.89,3.9284,14.9569,Mostly Cloudy
3,8.288889,0.83,14.1036,15.8263,Partly Cloudy
4,8.755556,0.83,11.0446,15.8263,Mostly Cloudy


In [4]:
data['Summary'].unique()

array(['Partly Cloudy', 'Mostly Cloudy', 'Overcast', 'Foggy',
       'Breezy and Mostly Cloudy', 'Clear', 'Breezy and Partly Cloudy',
       'Breezy and Overcast', 'Humid and Mostly Cloudy',
       'Humid and Partly Cloudy', 'Windy and Foggy', 'Windy and Overcast',
       'Breezy and Foggy', 'Windy and Partly Cloudy', 'Breezy',
       'Dry and Partly Cloudy', 'Windy and Mostly Cloudy',
       'Dangerously Windy and Partly Cloudy', 'Dry', 'Windy',
       'Humid and Overcast', 'Light Rain', 'Drizzle', 'Windy and Dry',
       'Dry and Mostly Cloudy', 'Breezy and Dry', 'Rain'], dtype=object)

In [5]:
std_feature = []
for val in data['Summary']:
    l = val.split()
    if 'and' in l:
        l.remove('and')
    s = " ".join(l)
    std_feature.append(s)
    
new_df = pd.DataFrame(std_feature, columns=["std_feature"])
new_df['std_feature'].unique()

array(['Partly Cloudy', 'Mostly Cloudy', 'Overcast', 'Foggy',
       'Breezy Mostly Cloudy', 'Clear', 'Breezy Partly Cloudy',
       'Breezy Overcast', 'Humid Mostly Cloudy', 'Humid Partly Cloudy',
       'Windy Foggy', 'Windy Overcast', 'Breezy Foggy',
       'Windy Partly Cloudy', 'Breezy', 'Dry Partly Cloudy',
       'Windy Mostly Cloudy', 'Dangerously Windy Partly Cloudy', 'Dry',
       'Windy', 'Humid Overcast', 'Light Rain', 'Drizzle', 'Windy Dry',
       'Dry Mostly Cloudy', 'Breezy Dry', 'Rain'], dtype=object)

In [6]:
data.describe()

Unnamed: 0,Temperature (C),Humidity,Wind Speed (km/h),Visibility (km)
count,96453.0,96453.0,96453.0,96453.0
mean,11.932678,0.734899,10.81064,10.347325
std,9.551546,0.195473,6.913571,4.192123
min,-21.822222,0.0,0.0,0.0
25%,4.688889,0.6,5.8282,8.3398
50%,12.0,0.78,9.9659,10.0464
75%,18.838889,0.89,14.1358,14.812
max,39.905556,1.0,63.8526,16.1


In [7]:
data.isnull().sum()

Temperature (C)      0
Humidity             0
Wind Speed (km/h)    0
Visibility (km)      0
Summary              0
dtype: int64

In [8]:
data['Summary'].value_counts()

Summary
Partly Cloudy                          31733
Mostly Cloudy                          28094
Overcast                               16597
Clear                                  10890
Foggy                                   7148
Breezy and Overcast                      528
Breezy and Mostly Cloudy                 516
Breezy and Partly Cloudy                 386
Dry and Partly Cloudy                     86
Windy and Partly Cloudy                   67
Light Rain                                63
Breezy                                    54
Windy and Overcast                        45
Humid and Mostly Cloudy                   40
Drizzle                                   39
Breezy and Foggy                          35
Windy and Mostly Cloudy                   35
Dry                                       34
Humid and Partly Cloudy                   17
Dry and Mostly Cloudy                     14
Rain                                      10
Windy                                      8
Hu

In [9]:
data['std_feature'] = std_feature

In [10]:
data.head()

Unnamed: 0,Temperature (C),Humidity,Wind Speed (km/h),Visibility (km),Summary,std_feature
0,9.472222,0.89,14.1197,15.8263,Partly Cloudy,Partly Cloudy
1,9.355556,0.86,14.2646,15.8263,Partly Cloudy,Partly Cloudy
2,9.377778,0.89,3.9284,14.9569,Mostly Cloudy,Mostly Cloudy
3,8.288889,0.83,14.1036,15.8263,Partly Cloudy,Partly Cloudy
4,8.755556,0.83,11.0446,15.8263,Mostly Cloudy,Mostly Cloudy


In [11]:
new_features = []
for val in data['std_feature']:
    if 'Dry' in val:
        new_features.append('Dry')
    elif 'Foggy' in val:
        new_features.append('Foggy')
    elif 'Rain' in val:
        new_features.append('Rain')
    elif 'Windy' in val:
        new_features.append('Windy')
    elif 'Breezy' in val:
        new_features.append('Breezy')
    elif 'Partly+Cloudy' in val:
        new_features.append('Partly+Cloudy')
    elif 'Mostly+Cloudy' in val:
        new_features.append('Mostly+Cloudy')
    elif 'Drizzle' in val:
        new_features.append('Rain')
    elif 'Overcast' in val:
        new_features.append('Overcast')
    else:
        new_features.append(val)
data['new_features'] = new_features 

In [12]:
data['new_features'].value_counts()

new_features
Partly Cloudy          31733
Mostly Cloudy          28094
Overcast               16604
Clear                  10890
Foggy                   7187
Breezy                  1484
Windy                    156
Dry                      136
Rain                     112
Humid Mostly Cloudy       40
Humid Partly Cloudy       17
Name: count, dtype: int64

In [13]:
data.head()
# data.shape

Unnamed: 0,Temperature (C),Humidity,Wind Speed (km/h),Visibility (km),Summary,std_feature,new_features
0,9.472222,0.89,14.1197,15.8263,Partly Cloudy,Partly Cloudy,Partly Cloudy
1,9.355556,0.86,14.2646,15.8263,Partly Cloudy,Partly Cloudy,Partly Cloudy
2,9.377778,0.89,3.9284,14.9569,Mostly Cloudy,Mostly Cloudy,Mostly Cloudy
3,8.288889,0.83,14.1036,15.8263,Partly Cloudy,Partly Cloudy,Partly Cloudy
4,8.755556,0.83,11.0446,15.8263,Mostly Cloudy,Mostly Cloudy,Mostly Cloudy


In [14]:
label_encoder = LabelEncoder()
target = label_encoder.fit_transform(data['new_features'])
data['target'] = target

In [15]:
data['target'].unique()

array([ 8,  6,  7,  3,  0,  1,  4,  5, 10,  2,  9])

In [16]:
data.tail(20)

Unnamed: 0,Temperature (C),Humidity,Wind Speed (km/h),Visibility (km),Summary,std_feature,new_features,target
96433,15.011111,0.93,3.2039,15.8263,Clear,Clear,Clear,1
96434,15.016667,0.9,2.7048,14.9569,Clear,Clear,Clear,1
96435,13.872222,0.93,4.7495,15.8263,Clear,Clear,Clear,1
96436,16.072222,0.88,2.7853,15.7297,Clear,Clear,Clear,1
96437,19.561111,0.75,3.7191,14.9569,Partly Cloudy,Partly Cloudy,Partly Cloudy,8
96438,22.138889,0.65,7.7763,16.1,Partly Cloudy,Partly Cloudy,Partly Cloudy,8
96439,22.872222,0.59,6.4239,16.1,Partly Cloudy,Partly Cloudy,Partly Cloudy,8
96440,27.072222,0.42,12.0106,15.5526,Partly Cloudy,Partly Cloudy,Partly Cloudy,8
96441,28.866667,0.37,13.9265,16.1,Partly Cloudy,Partly Cloudy,Partly Cloudy,8
96442,30.994444,0.33,15.617,16.1,Partly Cloudy,Partly Cloudy,Partly Cloudy,8


In [17]:
df = pd.DataFrame(data)
X = df.drop(['target', 'Summary', 'std_feature', 'new_features'],axis=1)
y = df['target']

In [18]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Support Vector Machine classifier
clf = GaussianNB()

# Train the model
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Display classification report
print('\nClassification Report:')
print(classification_report(y_test, y_pred))

Accuracy: 0.47

Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.97      0.84       267
           1       0.51      0.02      0.04      2183
           2       0.09      0.25      0.13        24
           3       0.77      0.98      0.86      1451
           4       0.00      0.00      0.00        13
           5       0.00      0.00      0.00         4
           6       0.38      0.32      0.35      5618
           7       0.36      0.55      0.44      3285
           8       0.52      0.59      0.55      6392
           9       0.00      0.00      0.00        24
          10       0.97      1.00      0.98        30

    accuracy                           0.47     19291
   macro avg       0.40      0.43      0.38     19291
weighted avg       0.47      0.47      0.44     19291



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [19]:
encoded_labels = clf.predict([[0.0,12.8,5.0,4.7]])
decoded_labels = label_encoder.inverse_transform(encoded_labels)
print("Decoded Labels:", decoded_labels[0])

Decoded Labels: Partly Cloudy




In [20]:
# encoded_labels = clf.predict([[8.755556,	0.83,	11.0446,	15.8263]])
encoded_labels = clf.predict([[16.07,0.88,2.78,15.72]])
decoded_labels = label_encoder.inverse_transform(encoded_labels)
print("Decoded Labels:", decoded_labels[0])

Decoded Labels: Partly Cloudy




In [21]:
# pickle.dump(clf, open("model.pkl", "wb"))

In [22]:
df.tail(10)

Unnamed: 0,Temperature (C),Humidity,Wind Speed (km/h),Visibility (km),Summary,std_feature,new_features,target
96443,30.894444,0.28,14.7798,15.5526,Partly Cloudy,Partly Cloudy,Partly Cloudy,8
96444,31.083333,0.28,15.5043,16.1,Partly Cloudy,Partly Cloudy,Partly Cloudy,8
96445,31.083333,0.28,13.8943,16.1,Partly Cloudy,Partly Cloudy,Partly Cloudy,8
96446,30.766667,0.28,14.2163,15.5526,Partly Cloudy,Partly Cloudy,Partly Cloudy,8
96447,28.838889,0.32,12.2038,16.1,Partly Cloudy,Partly Cloudy,Partly Cloudy,8
96448,26.016667,0.43,10.9963,16.1,Partly Cloudy,Partly Cloudy,Partly Cloudy,8
96449,24.583333,0.48,10.0947,15.5526,Partly Cloudy,Partly Cloudy,Partly Cloudy,8
96450,22.038889,0.56,8.9838,16.1,Partly Cloudy,Partly Cloudy,Partly Cloudy,8
96451,21.522222,0.6,10.5294,16.1,Partly Cloudy,Partly Cloudy,Partly Cloudy,8
96452,20.438889,0.61,5.8765,15.5204,Partly Cloudy,Partly Cloudy,Partly Cloudy,8


In [23]:
clf.predict([[0.0,7.2,0.6,2.6]])[0]



8

In [24]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

In [25]:
# clf = MLPClassifier(hidden_layer_sizes=(100, ), activation='relu', solver='adam', random_state=42)
# # Train the classifier
# clf.fit(X_train, y_train)

# # Predict on the test set
# predictions = clf.predict(X_test)

# # Calculate accuracy
# accuracy = accuracy_score(y_test, predictions)
# print("Accuracy:", accuracy)

In [26]:
df.head()

Unnamed: 0,Temperature (C),Humidity,Wind Speed (km/h),Visibility (km),Summary,std_feature,new_features,target
0,9.472222,0.89,14.1197,15.8263,Partly Cloudy,Partly Cloudy,Partly Cloudy,8
1,9.355556,0.86,14.2646,15.8263,Partly Cloudy,Partly Cloudy,Partly Cloudy,8
2,9.377778,0.89,3.9284,14.9569,Mostly Cloudy,Mostly Cloudy,Mostly Cloudy,6
3,8.288889,0.83,14.1036,15.8263,Partly Cloudy,Partly Cloudy,Partly Cloudy,8
4,8.755556,0.83,11.0446,15.8263,Mostly Cloudy,Mostly Cloudy,Mostly Cloudy,6


In [27]:
new_df = pd.DataFrame(data.drop(["Summary",	"std_feature", "target"], axis=1))

In [28]:
new_df.head()

Unnamed: 0,Temperature (C),Humidity,Wind Speed (km/h),Visibility (km),new_features
0,9.472222,0.89,14.1197,15.8263,Partly Cloudy
1,9.355556,0.86,14.2646,15.8263,Partly Cloudy
2,9.377778,0.89,3.9284,14.9569,Mostly Cloudy
3,8.288889,0.83,14.1036,15.8263,Partly Cloudy
4,8.755556,0.83,11.0446,15.8263,Mostly Cloudy


In [29]:
new_data = new_df.to_csv("To_import.csv", index=False)