# Weather Dataset with KNN (Classification)

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

### Understanding Data

In [2]:
df = pd.read_csv("weather.csv")
df.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
0,8.0,24.3,0.0,3.4,6.3,NW,30.0,SW,NW,6.0,...,29,1019.7,1015.0,7,7,14.4,23.6,No,3.6,Yes
1,14.0,26.9,3.6,4.4,9.7,ENE,39.0,E,W,4.0,...,36,1012.4,1008.4,5,3,17.5,25.7,Yes,3.6,Yes
2,13.7,23.4,3.6,5.8,3.3,NW,85.0,N,NNE,6.0,...,69,1009.5,1007.2,8,7,15.4,20.2,Yes,39.8,Yes
3,13.3,15.5,39.8,7.2,9.1,NW,54.0,WNW,W,30.0,...,56,1005.5,1007.0,2,7,13.5,14.1,Yes,2.8,Yes
4,7.6,16.1,2.8,5.6,10.6,SSE,50.0,SSE,ESE,20.0,...,49,1018.3,1018.5,7,7,11.1,15.4,Yes,0.0,No


In [3]:
df.shape

(366, 22)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 366 entries, 0 to 365
Data columns (total 22 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MinTemp        366 non-null    float64
 1   MaxTemp        366 non-null    float64
 2   Rainfall       366 non-null    float64
 3   Evaporation    366 non-null    float64
 4   Sunshine       363 non-null    float64
 5   WindGustDir    363 non-null    object 
 6   WindGustSpeed  364 non-null    float64
 7   WindDir9am     335 non-null    object 
 8   WindDir3pm     365 non-null    object 
 9   WindSpeed9am   359 non-null    float64
 10  WindSpeed3pm   366 non-null    int64  
 11  Humidity9am    366 non-null    int64  
 12  Humidity3pm    366 non-null    int64  
 13  Pressure9am    366 non-null    float64
 14  Pressure3pm    366 non-null    float64
 15  Cloud9am       366 non-null    int64  
 16  Cloud3pm       366 non-null    int64  
 17  Temp9am        366 non-null    float64
 18  Temp3pm   

**From above information we can clearly observe that we have 5 columns with non-numerical data**


**They are**

- WindGustDir
- WindDir3pm
- WindDir9am
- RainToday
- RainTomorrow

### Data Transformation

In [5]:
#Print all unique values of specified column

print(list(dict.fromkeys(df["RainToday"]))) 
print(list(dict.fromkeys(df["RainTomorrow"]))) 
print(list(dict.fromkeys(df["WindGustDir"]))) 
print(list(dict.fromkeys(df["WindDir3pm"]))) 
print(list(dict.fromkeys(df["WindDir9am"]))) 

['No', 'Yes']
['Yes', 'No']
['NW', 'ENE', 'SSE', 'SE', 'E', 'S', 'N', 'WNW', 'ESE', 'NE', 'NNE', 'NNW', 'SW', 'W', 'WSW', 'SSW', nan]
['NW', 'W', 'NNE', 'ESE', 'E', 'ENE', 'WSW', 'NE', 'NNW', 'SE', 'S', 'SW', 'WNW', 'N', 'SSW', 'SSE', nan]
['SW', 'E', 'N', 'WNW', 'SSE', 'SE', 'S', 'WSW', 'NNE', 'NNW', 'ENE', 'SSW', 'NW', 'ESE', 'NE', 'W', nan]


In [6]:
RainTomorrow = {"Yes": 1, "No": 0}

for key, value in RainTomorrow.items():
    df["RainTomorrow"] = df["RainTomorrow"].replace(key, value)
    
RainToday = {"Yes": 1, "No": 0}

for key, value in RainToday.items():
    df["RainToday"] = df["RainToday"].replace(key, value)

  df["RainTomorrow"] = df["RainTomorrow"].replace(key, value)
  df["RainToday"] = df["RainToday"].replace(key, value)


In [7]:
WindGustDir = {
    'N': 1, 'E': 2, 'S': 3, 'W': 4,
    'NE': 5, 'NW': 6, 'SE': 7, 'SW': 8,
    'NNE': 9, 'ENE': 10, 'NNW': 11, 'WNW': 12, 
    'ESE': 13, 'SSE': 14, 'SSW': 15, 'WSW': 16,
}
WindDir3pm = {
    'N': 1, 'E': 2, 'S': 3, 'W': 4,
    'NE': 5, 'NW': 6, 'SE': 7, 'SW': 8,
    'NNE': 9, 'ENE': 10, 'NNW': 11, 'WNW': 12, 
    'ESE': 13, 'SSE': 14, 'SSW': 15, 'WSW': 16,
}
WindDir9am = {
    'N': 1, 'E': 2, 'S': 3, 'W': 4,
    'NE': 5, 'NW': 6, 'SE': 7, 'SW': 8,
    'NNE': 9, 'ENE': 10, 'NNW': 11, 'WNW': 12, 
    'ESE': 13, 'SSE': 14, 'SSW': 15, 'WSW': 16,
}


for key, value in WindGustDir.items():
    df["WindGustDir"] = df["WindGustDir"].replace(key, value)
    
for key, value in WindDir3pm.items():
    df["WindDir3pm"] = df["WindDir3pm"].replace(key, value)
    
for key, value in WindDir9am.items():
    df["WindDir9am"] = df["WindDir9am"].replace(key, value)

  df["WindGustDir"] = df["WindGustDir"].replace(key, value)
  df["WindDir3pm"] = df["WindDir3pm"].replace(key, value)
  df["WindDir9am"] = df["WindDir9am"].replace(key, value)


 **From df.info() we also found some null values spreaded over the data frame**

In [8]:
#Droping all rows with null values
df.dropna(inplace=True)
df.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
0,8.0,24.3,0.0,3.4,6.3,6.0,30.0,8.0,6.0,6.0,...,29,1019.7,1015.0,7,7,14.4,23.6,0,3.6,1
1,14.0,26.9,3.6,4.4,9.7,10.0,39.0,2.0,4.0,4.0,...,36,1012.4,1008.4,5,3,17.5,25.7,1,3.6,1
2,13.7,23.4,3.6,5.8,3.3,6.0,85.0,1.0,9.0,6.0,...,69,1009.5,1007.2,8,7,15.4,20.2,1,39.8,1
3,13.3,15.5,39.8,7.2,9.1,6.0,54.0,12.0,4.0,30.0,...,56,1005.5,1007.0,2,7,13.5,14.1,1,2.8,1
4,7.6,16.1,2.8,5.6,10.6,14.0,50.0,14.0,13.0,20.0,...,49,1018.3,1018.5,7,7,11.1,15.4,1,0.0,0


### Data Modelling

In [9]:
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

In [11]:
knn = KNeighborsClassifier(n_neighbors=3)

In [12]:
knn.fit(X_train, y_train)

In [13]:
y_pred = knn.predict(X_test)

In [14]:
print("Accuracy: {:.2f}".format(accuracy_score(y_test, y_pred) * 100))

Accuracy: 90.91
