## Implementation of KNN on Covid dataset

In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import time
import pandas as pd

Loading dataset from CSV

In [2]:
def load_dataset_from_csv(file_path):
    data = pd.read_csv(file_path)
    X = data.iloc[:, :-1].values  # all columns except the last one as features
    y = data.iloc[:, -1].values   # the last column as labels
    return X, y

Train and time KNN (without Ray)

In [3]:
def train_and_time_knn(X_train, y_train, X_test, y_test, n_neighbors=5):
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    
    start_time = time.time()
    knn.fit(X_train, y_train)  # Train KNN model
    y_pred = knn.predict(X_test)  # Predict on test data
    end_time = time.time()

    accuracy = accuracy_score(y_test, y_pred)
    time_taken = end_time - start_time

    return accuracy, time_taken

Load dataset from a CSV file

In [4]:
X, y = load_dataset_from_csv('usa_county_wise.csv')  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step-by-Step Preprocessing
1. Load and Inspect Data

In [5]:
import pandas as pd

data = pd.read_csv('usa_county_wise.csv')
print(data.head())  # View the first few rows
print(data.info())  # Check column types and missing values
print(data.describe())  # Get summary statistics for numeric columns


        UID iso2 iso3  code3     FIPS    Admin2            Province_State   
0        16   AS  ASM     16     60.0       NaN            American Samoa  \
1       316   GU  GUM    316     66.0       NaN                      Guam   
2       580   MP  MNP    580     69.0       NaN  Northern Mariana Islands   
3  63072001   PR  PRI    630  72001.0  Adjuntas               Puerto Rico   
4  63072003   PR  PRI    630  72003.0    Aguada               Puerto Rico   

  Country_Region        Lat       Long_                  Combined_Key   
0             US -14.271000 -170.132000            American Samoa, US  \
1             US  13.444300  144.793700                      Guam, US   
2             US  15.097900  145.673900  Northern Mariana Islands, US   
3             US  18.180117  -66.754367     Adjuntas, Puerto Rico, US   
4             US  18.360255  -67.175131       Aguada, Puerto Rico, US   

      Date  Confirmed  Deaths  
0  1/22/20          0       0  
1  1/22/20          0       0  
2 

2. Handle Missing Values

In [6]:
# Check for missing values
print(data.isnull().sum())

# Drop rows with excessive missing values (e.g., Admin2, Province_State)
data.dropna(subset=['Admin2', 'Province_State'], inplace=True)

# Fill missing numerical values
data.fillna(0, inplace=True)


UID                  0
iso2                 0
iso3                 0
code3                0
FIPS              1880
Admin2            1128
Province_State       0
Country_Region       0
Lat                  0
Long_                0
Combined_Key         0
Date                 0
Confirmed            0
Deaths               0
dtype: int64


3. Drop Irrelevant Columns

In [7]:
data = data.drop(columns=['UID', 'iso2', 'iso3', 'code3', 'FIPS', 'Combined_Key', 'Country_Region'])

4. Handle Date Column

In [8]:
# Convert to datetime
data['Date'] = pd.to_datetime(data['Date'])

# Extract useful date features (optional)
data['Year'] = data['Date'].dt.year
data['Month'] = data['Date'].dt.month
data['Day'] = data['Date'].dt.day

# Drop the original 'Date' column if not needed
data = data.drop(columns=['Date'])


  data['Date'] = pd.to_datetime(data['Date'])


5. Encode Categorical Columns

In [9]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
data['Admin2'] = encoder.fit_transform(data['Admin2'])
data['Province_State'] = encoder.fit_transform(data['Province_State'])


6. Separate Features and Target

In [11]:
X = data.drop(columns=['Confirmed', 'Deaths'])  # Drop target columns
y = data['Confirmed']  # Use 'Deaths' if modeling deaths

7. Split the Dataset

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

8. Scale Numeric Features

In [13]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

Final Preprocessing Code

In [14]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Load dataset
data = pd.read_csv('usa_county_wise.csv')

# Handle missing values
data.dropna(subset=['Admin2', 'Province_State'], inplace=True)
data.fillna(0, inplace=True)

# Drop irrelevant columns
data = data.drop(columns=['UID', 'iso2', 'iso3', 'code3', 'FIPS', 'Combined_Key', 'Country_Region'])

# Handle 'Date' column
data['Date'] = pd.to_datetime(data['Date'])
data['Year'] = data['Date'].dt.year
data['Month'] = data['Date'].dt.month
data['Day'] = data['Date'].dt.day
data = data.drop(columns=['Date'])

# Encode categorical columns
encoder = LabelEncoder()
data['Admin2'] = encoder.fit_transform(data['Admin2'])
data['Province_State'] = encoder.fit_transform(data['Province_State'])

# Separate features and target
X = data.drop(columns=['Confirmed', 'Deaths'])
y = data['Confirmed']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


  data['Date'] = pd.to_datetime(data['Date'])


In [15]:
# Train and evaluate KNN
accuracy, time_taken = train_and_time_knn(X_train, y_train, X_test, y_test)
print(f"KNN (without Ray) -> Accuracy: {accuracy * 100:.2f}%, Time taken: {time_taken:.4f} seconds")

KNN (without Ray) -> Accuracy: 51.47%, Time taken: 9.3368 seconds
