In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score

## Task 1: Preprocessing

>#### load 'weather_forecast_data.csv' dataset

In [None]:
# Load the raw weather dataset; the file name is a relative path, so it is resolved
# against the current working directory.
df = pd.read_csv('weather_forecast_data.csv')

In [None]:
# Preprocess a copy so that the raw `df` loaded from the CSV stays untouched

df_pre = df.copy()

>#### check missing values

In [None]:
# Row count of the raw dataset
total_records = len(df)
print("total records:", total_records, "\n")

# Per-column count of missing entries
null_mask = df_pre.isna()
print("missing records in each column:", "\n")
print(null_mask.sum())

print(80 * "-")

# Every row that has a null in at least one column
print("Records with null values: ", "\n")
print(df_pre.loc[null_mask.any(axis=1)])

# according to the output there are missing values



>#### Handle missing values by dropping them

In [None]:

# Keep only the rows that have a value in every column
df_dropped_nulls = df_pre.dropna(axis=0, how="any")
print("total records without nulls:", len(df_dropped_nulls), "\n")

df_dropped_nulls.head()

>#### Handle missing values by replacing them with the column average

In [None]:

# A mean only exists for numeric columns, so isolate them first
df_numerical_features_only = df_pre.select_dtypes(include="number")

# Fill every missing numeric entry with the mean of its own column
# NOTE(review): the means are computed on the full dataset, i.e. before the
# train/test split further down, so test rows contribute to the imputed values.
column_means = df_numerical_features_only.mean()
df_numerical_filled_avg = df_numerical_features_only.fillna(value=column_means)

# Re-attach the target column "Rain" to build the mean-imputed dataframe
rain_column = df_pre["Rain"]
df_filled_avg = pd.concat([df_numerical_filled_avg, rain_column], axis="columns")

print("DataFrame after replacing null values with the average:")
print(df_filled_avg)

>#### determine targets & features

In [None]:
targets_columns = ["Rain"]

# Mean-imputed dataset: separate the label column from the predictors
df_targets_filled_avg = df_filled_avg.loc[:, targets_columns]
df_features_filled_avg = df_filled_avg.drop(targets_columns, axis=1)

# Null-dropped dataset: same separation
df_targets_dropped_nulls = df_dropped_nulls.loc[:, targets_columns]
df_features_dropped_nulls = df_dropped_nulls.drop(targets_columns, axis=1)

# Preview the first rows of each piece
print("Avg data:")
display(df_features_filled_avg.head())
display(df_targets_filled_avg.head())

print("Dropped nulls data:")
display(df_features_dropped_nulls.head())
display(df_targets_dropped_nulls.head())


>#### Split the data into training and test sets

In [None]:

# Hold out 20% of the rows for testing and train on the remaining 80%.
# The fixed random_state makes the split identical on every run.
(
    df_features_train_avg,
    df_features_test_avg,
    df_targets_train_avg,
    df_targets_test_avg,
) = train_test_split(
    df_features_filled_avg, df_targets_filled_avg, test_size=0.2, random_state=42
)
(
    df_features_train_dropped,
    df_features_test_dropped,
    df_targets_train_dropped,
    df_targets_test_dropped,
) = train_test_split(
    df_features_dropped_nulls, df_targets_dropped_nulls, test_size=0.2, random_state=42
)

# Partition sizes for the mean-imputed data
for partition in (
    df_features_train_avg,
    df_features_test_avg,
    df_targets_train_avg,
    df_targets_test_avg,
):
    print(len(partition))

# Partition sizes for the null-dropped data
for partition in (
    df_features_train_dropped,
    df_features_test_dropped,
    df_targets_train_dropped,
    df_targets_test_dropped,
):
    print(len(partition))


In [None]:
# final targets will be worked on

# The targets were selected with a list of column names, so they are single-column
# DataFrames. LabelEncoder works on 1-D label arrays, so flatten each target with
# np.ravel before encoding. np.ravel also accepts an already-flat ndarray.
label_encoder = LabelEncoder()

# Fit the class mapping on the training labels only, then reuse it for the test labels
df_targets_train_avg = label_encoder.fit_transform(np.ravel(df_targets_train_avg))
df_targets_test_avg = label_encoder.transform(np.ravel(df_targets_test_avg))

# Same encoder object, re-fitted on the null-dropped training labels
df_targets_train_dropped = label_encoder.fit_transform(np.ravel(df_targets_train_dropped))
df_targets_test_dropped = label_encoder.transform(np.ravel(df_targets_test_dropped))

>#### check scaling of data

In [None]:

# Summary statistics of both dataset variants, one row per numeric column
print("filled avg data:")
display(df_filled_avg.describe().transpose())

print("dropped nulls data:")
display(df_dropped_nulls.describe().transpose())

# according to the min / max values in the output, the numeric features are not on the same scale

>#### features are scaled

In [None]:
scaler = RobustScaler()

# The scaling parameters must be learned from the TRAINING features only and then
# applied unchanged to the test features. Calling fit_transform on the test set
# would re-fit the scaler on test data, so train and test would be scaled with
# different parameters and test-set statistics would leak into preprocessing.

# the scaler returns an ndarray

df_features_train_avg = scaler.fit_transform(df_features_train_avg)
df_features_test_avg = scaler.transform(df_features_test_avg)


# re-fit the same scaler object for the null-dropped variant (again on train only)
df_features_train_dropped = scaler.fit_transform(df_features_train_dropped)
df_features_test_dropped = scaler.transform(df_features_test_dropped)


# convert the ndarray to DataFrame

# final features will be worked on

df_features_train_avg = pd.DataFrame(df_features_train_avg, columns=df_features_filled_avg.columns)
df_features_test_avg = pd.DataFrame(df_features_test_avg, columns=df_features_filled_avg.columns)


df_features_train_dropped = pd.DataFrame(df_features_train_dropped, columns=df_features_dropped_nulls.columns)
df_features_test_dropped = pd.DataFrame(df_features_test_dropped, columns=df_features_dropped_nulls.columns)


print("Avg Features:")
display(df_features_train_avg.describe().T)
display(df_features_test_avg.describe().T)

print("Dropped Nulls Features:")
display(df_features_train_dropped.describe().T)
display(df_features_test_dropped.describe().T)

## Task 2: Implement Decision Tree, k-Nearest Neighbors (kNN) and Naïve Bayes

>#### Evaluate accuracy, precision, and recall 

In [None]:

def evaluateModels(target, predictions):
    """Print and return accuracy, precision and recall as percentages.

    Parameters
    ----------
    target : true labels.
    predictions : predicted labels, compared against ``target``.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall)``, each multiplied by 100.
    """
    # Each metric is called as metric(target, predictions) and scaled to percent
    metrics = (accuracy_score, precision_score, recall_score)
    accuracy, precision, recall = [metric(target, predictions) * 100 for metric in metrics]

    print(f"Accuracy: {accuracy:.2f}%", f"Precision: {precision:.2f}%", f"Recall: {recall:.2f}%")
    return accuracy, precision, recall


>#### KNN with scikit-learn

In [None]:

# scikit-learn KNN classifier: 5 neighbours, brute-force neighbour search
knnModel = KNeighborsClassifier(algorithm='brute', n_neighbors=5)

# Variant 1: null values replaced with the column mean
knnModel.fit(X=df_features_train_avg, y=df_targets_train_avg)
knnPredictions = knnModel.predict(X=df_features_test_avg)
knn_accuracy_avg, knn_precision_avg, knn_recall_avg = evaluateModels(
    target=df_targets_test_avg, predictions=knnPredictions
)

# Variant 2: rows with null values dropped (the same model object is re-fitted)
knnModel.fit(X=df_features_train_dropped, y=df_targets_train_dropped)
knnPredictions = knnModel.predict(X=df_features_test_dropped)
knn_accuracy_dropped, knn_precision_dropped, knn_recall_dropped = evaluateModels(
    target=df_targets_test_dropped, predictions=knnPredictions
)


>#### KNN from scratch

In [None]:

# get the distance between two points
def eculidean_distance(p, q):
    """Return the Euclidean distance between two points.

    Parameters
    ----------
    p, q : array-like of numbers
        Coordinates of the two points; both must have the same shape.

    Returns
    -------
    numpy.float64
        Square root of the sum of squared coordinate differences.

    Raises
    ------
    ValueError
        If ``p`` and ``q`` do not have the same shape. A per-element loop over
        ``len(q)`` would silently ignore trailing coordinates of a longer ``p``.
    """
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)
    if p.shape != q.shape:
        raise ValueError(
            f"points must have the same shape, got {p.shape} and {q.shape}"
        )

    # Vectorised: one array subtraction instead of a Python loop per coordinate
    return np.sqrt(np.sum((p - q) ** 2))


> #### find the neighbors of a point

In [None]:

# find the neighbors of a point (x_test)
# loop over the x_train to find the neighbors
def find_neighbours(x_train, x_test, y_train):
    """Rank all training points by their distance to a single test point.

    Parameters
    ----------
    x_train : indexable of points
        Training points; ``x_train[i]`` must return the i-th point.
    x_test : array-like
        The single point whose neighbours are wanted.
    y_train : array-like
        One label per training point, in the same order as ``x_train``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Distance'`` and ``'Target'``, sorted by ascending distance,
        with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``x_train`` and ``y_train`` do not have the same number of entries.
    """
    n = len(x_train)

    # Flatten the labels to a plain 1-D array so that pairing with the distances is
    # purely positional; a Series/DataFrame index can then never misalign the two.
    labels = np.asarray(y_train).ravel()
    if len(labels) != n:
        raise ValueError(
            f"x_train has {n} rows but y_train has {len(labels)} labels"
        )

    distances = np.array(
        [eculidean_distance(x_train[i], x_test) for i in range(n)], dtype=float
    )

    # build the (distance, label) table in one step
    neighbours = pd.DataFrame({'Distance': distances, 'Target': labels})

    # sort the neighbors according to the distances
    neighbours = neighbours.sort_values(by='Distance', ascending=True).reset_index(drop=True)

    return neighbours


##### Get the predicted label for a single test point

In [None]:

# take the distance-sorted neighbours and k
# return the label that occurs most often among the first k rows
def get_y_predict(neighbours, k):
    """Majority vote over the first ``k`` rows of ``neighbours``.

    ``neighbours`` must provide a ``'Target'`` column; the rows are taken in the
    order given. Returns the label with the highest count.
    """
    # labels of the first k rows
    nearest_labels = neighbours.iloc[:k]['Target']

    # occurrences of each label among those rows
    votes = nearest_labels.value_counts()

    # label with the highest count
    return votes.idxmax()

##### get y predict for the test data

In [None]:

# return y predictions for the whole test set
def predict(x_train, x_test, y_train, k):
    """Predict a label for every test point with k-nearest-neighbours.

    Parameters
    ----------
    x_train : DataFrame or array-like
        Training points, one per row.
    x_test : DataFrame or array-like
        Test points, one per row.
    y_train : array-like
        One label per training point.
    k : int
        Number of nearest neighbours that vote; must be at least 1.

    Returns
    -------
    numpy.ndarray
        Float array with one predicted label per test point.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")

    # Convert both inputs once, outside the loop; np.asarray accepts DataFrames
    # as well as plain arrays.
    train_points = np.asarray(x_train)
    test_points = np.asarray(x_test)

    y_predictions = np.zeros(len(test_points))

    # loop over the test points
    for i, test_point in enumerate(test_points):

        # rank the training points by distance to this test point
        neighbours = find_neighbours(train_points, test_point, y_train)

        # majority vote among the k closest
        y_predictions[i] = get_y_predict(neighbours, k)

    return y_predictions


> #### different k values for KNN algorithm

In [None]:
# Evaluate the from-scratch KNN on the mean-imputed data for several k values.
# One loop replaces five copy-pasted stanzas; the printed output is unchanged.
knn_scratch_results = {}  # k -> (accuracy, precision, recall)
for k in (3, 5, 7, 9, 11):
    knn_scratch_predictions = predict(df_features_train_avg, df_features_test_avg, df_targets_train_avg, k)
    print(f"KNN FROM SCRATCH WITH K = {k}")
    knn_scratch_results[k] = evaluateModels(df_targets_test_avg, knn_scratch_predictions)

# Bind the per-k names as well, so any cell that reads them keeps working
knn3_scratch_accuracy, knn3_scratch_precision, knn3_scratch_recall = knn_scratch_results[3]
knn5_scratch_accuracy, knn5_scratch_precision, knn5_scratch_recall = knn_scratch_results[5]
knn7_scratch_accuracy, knn7_scratch_precision, knn7_scratch_recall = knn_scratch_results[7]
knn9_scratch_accuracy, knn9_scratch_precision, knn9_scratch_recall = knn_scratch_results[9]
knn11_scratch_accuracy, knn11_scratch_precision, knn11_scratch_recall = knn_scratch_results[11]
