In [None]:
import pandas as pd


# Load the COVID case file and discard the columns this notebook does not use.
unused_columns = [
    'source', 'province', 'longitude', 'latitude', 'days_onset_to_admission',
    'symptoms', 'geo_resolution', 'city', 'country',
    'date_admission_hospital', 'date_death_or_discharge',
]
df = pd.read_csv('COVID_FILE.csv').drop(columns=unused_columns)

# Preview the first nine rows of the trimmed frame.
df.head(9)

In [None]:
# Parse both date columns so they can be subtracted from each other.
df['date_onset_symptoms'] = pd.to_datetime(df['date_onset_symptoms'])
df['date_confirmation'] = pd.to_datetime(df['date_confirmation'])

# Whole days between symptom onset and confirmation; missing where either date is missing.
# Kept as a local Series so no temporary column has to be added to df and dropped again.
onset_to_confirmation_days = (df['date_confirmation'] - df['date_onset_symptoms']).dt.days

# Impute missing gaps with the most frequent gap, then store the feature as integers.
mode = onset_to_confirmation_days.mode()[0]
df['date_difference_in_days'] = onset_to_confirmation_days.fillna(mode).astype(int)

# The preview must be the cell's last expression, otherwise the notebook does not display it.
df[['date_onset_symptoms', 'date_confirmation', 'date_difference_in_days']].head()

In [None]:
# Display df as this cell's output to inspect the frame at this point in the notebook.
df

In [None]:
# Print how many rows carry each value of the target column.
urgency_counts = df['urgency_of_admission'].value_counts()
print(urgency_counts)

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Replacement values: the mean age truncated to an integer, and the most common sex.
mean = int(df['age'].mean())
mode = df['sex'].mode()[0]

# Fill both columns in one pass; all other columns are left untouched.
df = df.fillna({'age': mean, 'sex': mode})

# No missing ages remain after the fill, so the column can be stored as plain integers.
df['age'] = df['age'].astype(int)

In [None]:
# Re-check the per-column count of missing values after the imputation cell.
df.isnull().sum()

## **MODEL EXECUTION STARTS**

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the symptom, sex and target columns.
columns = ['cough', 'fever', 'chills', 'sore_throat', 'headache', 'fatigue', 'sex', 'urgency_of_admission']

# Fit a separate encoder per column and keep each one in `label_encoders`.
# Re-fitting a single shared encoder overwrites its classes_ on every iteration, which
# leaves only the last column's code -> label mapping recoverable (e.g. for inverse_transform).
label_encoders = {}
for c in columns:
    label_encoder = LabelEncoder()
    df[c] = label_encoder.fit_transform(df[c])
    label_encoders[c] = label_encoder
# After the loop `label_encoder` is the encoder fitted on the last column, as before.

df.head(8)


In [None]:
# Column-wise sums of the symptom, sex and target columns.
summed_columns = [
    'cough', 'fever', 'chills', 'sore_throat',
    'headache', 'fatigue', 'sex', 'urgency_of_admission',
]
df[summed_columns].sum()


In [None]:
from matplotlib import pyplot as plt
# Bar chart of how many rows fall into each decade-wide age group.
age_bins = list(range(0, 101, 10))
age_labels = [f'{low}-{high}' for low, high in zip(age_bins[:-1], age_bins[1:])]

# include_lowest=True makes the first bin closed on the left, so an age of exactly 0 is
# counted in '0-10' instead of being silently dropped. Ages above 100 still fall outside
# the bins. pd.cut with labels already returns an ordered categorical, and the result is
# kept in a local Series so df itself is never modified by this plotting cell.
age_groups = pd.cut(df['age'], bins=age_bins, labels=age_labels, include_lowest=True)

ax = age_groups.value_counts().sort_index().plot(kind='bar', title='Age Distribution')
ax.spines[['top', 'right']].set_visible(False)
ax.set_xlabel('Age Group')
ax.set_ylabel('Count')
plt.show()

In [None]:
import seaborn as sns
symptoms = ['cough', 'fever', 'chills', 'sore_throat', 'headache', 'fatigue']

# Draw one figure per symptom: the age distribution for each value of that symptom column.
for symptom in symptoms:
    fig, ax = plt.subplots()
    sns.boxplot(x=df[symptom], y=df['age'], ax=ax)
    ax.set_title(f'Box plot of Age vs {symptom}')
    plt.show()

In [None]:
# Show the current columns, then remove the identifier and the two raw date columns
# in a single drop call.
print(df.columns)
df = df.drop(columns=['ID', 'date_confirmation', 'date_onset_symptoms'])

In [None]:
# Pairwise correlations between the columns of df, drawn as an annotated heatmap.
correlation_matrix = df.corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Matrix Heatmap')
plt.show()

In [None]:
# Correlation of every column with the target column only.
correlation_matrix = df.corr()
target_correlations = correlation_matrix.loc[:, ['urgency_of_admission']]

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(target_correlations, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation with Urgency of Admission')
plt.show()

In [None]:
#Applying Logistic Regression Model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Logistic regression on standardized features, predicting urgency_of_admission.
X = df.drop(columns='urgency_of_admission')
y = df['urgency_of_admission']

# Any datetime feature is converted to its numeric representation so it can be scaled.
for col in X.columns:
    if X[col].dtype == 'datetime64[ns]':
        X[col] = pd.to_numeric(X[col])

# Split BEFORE scaling. Fitting the scaler on the full data would let the test rows
# influence the mean/std used to transform the training rows (data leakage) and make
# the reported test accuracy optimistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)  # statistics learned from the training rows only
X_test = scaler.transform(X_test)        # test rows reuse the training statistics

model = LogisticRegression(max_iter=500)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


In [None]:
# Class probabilities for every test row, one column per class.
y_pred_proba = model.predict_proba(X_test)

# The confidence of a prediction is the probability of the class that was predicted,
# i.e. the largest probability in the row. Taking column 1 unconditionally would report
# P(class 1) even for rows predicted as another class, and is meaningless with more than
# two classes. This matches how the random forest and KNN cells report confidence.
print("Confidence Scores:")
for prediction, confidence in zip(y_pred, y_pred_proba.max(axis=1)):
    print(f"Prediction: {prediction}, Confidence: {confidence:.2f}")

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Weighted-average precision, recall and F1 of the current y_pred against y_test.
precision, recall, f1 = (
    metric(y_test, y_pred, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

for label, value in (("Precision:", precision), ("Recall:", recall), ("F1 Score:", f1)):
    print(label, value)

In [None]:
#Applying Random Forest Model
from sklearn.ensemble import RandomForestClassifier

# Random forest on the features of df, predicting urgency_of_admission.
y = df['urgency_of_admission']
X = df.drop(columns='urgency_of_admission')

# Datetime columns, if any are present, are replaced by their numeric representation.
datetime_columns = [col for col in X.columns if X[col].dtype == 'datetime64[ns]']
for col in datetime_columns:
    X[col] = pd.to_numeric(X[col])

# Hold out 33% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

model = RandomForestClassifier(n_estimators=185, random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:
# Feature importances of the random forest.
# Reuse the fitted `model` from the random forest cell instead of training a second
# forest with default, unseeded settings. That way the importances describe the same
# model whose accuracy is reported, and they are reproducible between runs.
rf = model

feature_importances = rf.feature_importances_

# One row per feature, most important first.
importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

print(importance_df)


In [None]:

# Weighted-average precision, recall and F1 of the current y_pred against y_test.
scorers = (precision_score, recall_score, f1_score)
precision, recall, f1 = (score(y_test, y_pred, average='weighted') for score in scorers)

for label, value in (("Precision:", precision), ("Recall:", recall), ("F1 Score:", f1)):
    print(label, value)

In [None]:
# Per-row class probabilities from `model`; the largest one is reported as the confidence.
y_pred_proba = model.predict_proba(X_test)

print("Confidence Scores:")
for prediction, probabilities in zip(y_pred, y_pred_proba):
    print(f"Prediction: {prediction}, Confidence: {max(probabilities):.2f}")

In [None]:
#Applying KNN Model
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split



# Imported here so this cell does not rely on imports made by other cells.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Convert datetime columns BEFORE splitting; done after the split, the conversion would
# not reach the already-created train/test frames.
for col in X.columns:
    if X[col].dtype == 'datetime64[ns]':
        X[col] = pd.to_numeric(X[col])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.38, random_state=42)

# KNN is distance-based, so features on larger numeric scales would dominate the
# neighbour search. Standardize inside a pipeline: the scaler is fitted on the training
# rows only, and X_train / X_test stay unscaled DataFrames (column names intact) for the
# cells that use them afterwards.
knn_model = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=25))
knn_model.fit(X_train, y_train)

y_pred = knn_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("KNN Accuracy:", accuracy)


In [None]:
#Finding feature importance in KNN model
from sklearn.inspection import permutation_importance

# Permutation importance shuffles each feature at random, so fix the seed to make the
# reported values reproducible between runs.
perm_importance = permutation_importance(knn_model, X_test, y_test, random_state=42)
feature_importances = perm_importance.importances_mean

# Pair each importance with the column of the frame that was actually permuted.
for feature, importance in zip(X_test.columns, feature_importances):
    print(f"Feature {feature}: {importance}")


In [None]:
# Weighted-average precision, recall and F1 of the current y_pred against y_test.
precision, recall, f1 = (
    metric(y_test, y_pred, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

for label, value in (("KNN Precision:", precision), ("KNN Recall:", recall), ("KNN F1 Score:", f1)):
    print(label, value)

In [None]:

# Per-row class probabilities from knn_model; the largest one is reported as the confidence.
y_pred_proba = knn_model.predict_proba(X_test)

print("KNN Confidence Scores:")
for prediction, probabilities in zip(y_pred, y_pred_proba):
    print(f"Prediction: {prediction}, Confidence: {max(probabilities):.2f}")