In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from scipy.stats import zscore

### Dataset 1

In [2]:
#Loading the Dataset
data = pd.read_csv('traffic_weather.csv')
#Checking for missing values
data.isnull().sum()

Flow           0
(mph)          0
Temperature    0
DewPoint       0
Humidity       0
Wind           0
WindSpeed      0
WindGust       0
Pressure       0
Precip.        0
Condition      0
dtype: int64

In [3]:
#Checking for duplicates
data.duplicated().sum()

4

In [4]:
#Removing duplicates
data.drop_duplicates(inplace=True)

In [5]:
#Encoding categorical variables
label_encoder = LabelEncoder()
data['Wind'] = label_encoder.fit_transform(data['Wind'])

In [6]:
#Categorising the features into X and y
X = data[['Wind', 'DewPoint','Flow','(mph)', 'Temperature', 'Humidity','WindSpeed','Pressure','WindGust','Precip.']]
y = data['Condition']

#Splitting the dataset into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

#Scaling the values before fitting into the model
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

#Creating a Knn model with k as 5 found by trial
knn = KNeighborsClassifier(n_neighbors=5)

#Training the model with the scaled data
knn.fit(X_train_scaled, y_train)

#Testing the model
test_predictions = knn.predict(X_test_scaled)

#Evaluating the model using the accuracy_score evaluation metric
test_accuracy = accuracy_score(y_test, test_predictions)
print(f'Test Accuracy: {test_accuracy:.2f}')

Test Accuracy: 0.99


### Dataset 2

In [7]:
#Loading the Dataset
data = pd.read_csv('seattle-weather.csv')
#Checking for missing values
data.isnull().sum()

precipitation    0
temp_max         0
temp_min         0
wind             0
weather          0
dtype: int64

In [8]:
#Checking for duplicates
data.duplicated().sum()

8

In [9]:
#Removing duplicates
data.drop_duplicates(inplace=True)

In [10]:
#Removing Outliers with the help of z score
z_scores = zscore(data[['precipitation', 'temp_max', 'temp_min', 'wind']])
new_data = data[(z_scores < 3).all(axis=1)]

#Catgorizing the features into X and y
X = new_data[['precipitation', 'temp_max', 'temp_min', 'wind']]
y = new_data['weather']

#Splitting the dataset into Training and Testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

#Scaling the values before fitting into the model
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

#Creating a KNN model with k as 34 found by trial
knn = KNeighborsClassifier(n_neighbors=34)

#Training the model
knn.fit(X_train_scaled, y_train)

#Testing the model
test_predictions = knn.predict(X_test_scaled)

#Evaluating the model on the basis of accuracy_Score Evaluation metric
test_accuracy = accuracy_score(y_test, test_predictions)
print(f'Test Accuracy: {test_accuracy:.2f}')

Test Accuracy: 0.78


### Dataset 3

In [11]:
#Loading the Dataset
data = pd.read_csv('weather3.csv')
#Checking for missing values
data.isnull().sum()

Summary                 0
Precip Type             0
Temperature             0
Apparent Temperature    0
Humidity                0
Wind Speed (km/h)       0
Pressure (millibars)    0
dtype: int64

In [12]:
#Checking for duplicates
data.duplicated().sum()

0

In [None]:
#Encoding categorical variables
label_encoder = LabelEncoder()
new_data['Precip Type'] = label_encoder.fit_transform(new_data['Precip Type'])

In [13]:
#Removing the Outliers by z-score method
z_scores = zscore(data[['Temperature', 'Apparent Temperature', 'Humidity','Wind Speed (km/h)','Pressure (millibars)']])
new_data = data[(z_scores < 3).all(axis=1)]

#Categorizing the features into X and y
X = new_data[['Precip Type', 'Temperature', 'Apparent Temperature', 'Humidity','Wind Speed (km/h)','Pressure (millibars)']]
y = new_data['Summary']

#Splitting the dataset into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Scaling the values before fitting the model
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

#Creating a KNN model with k as 25 based on trial
knn = KNeighborsClassifier(n_neighbors=25)

#Training the model
knn.fit(X_train_scaled, y_train)

#Testing the model
test_predictions = knn.predict(X_test_scaled)

#Evaluating the model based on the Accuracy_score Evaluation metric
test_accuracy = accuracy_score(y_test, test_predictions)
print(f'Test Accuracy: {test_accuracy:.2f}')

Test Accuracy: 0.53


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_data['Precip Type'] = label_encoder.fit_transform(new_data['Precip Type'])
