In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
import seaborn as sns

In [None]:
import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
cd /kaggle/input/disease-symptom-description-dataset

**Read the dataset**

In [None]:
# Load the main table; the relative path is resolved against the current working directory.
df = pd.read_csv('dataset.csv')
# Preview the first rows.
df.head()

**Dataset characteristics**

In [None]:
# Per-column summary statistics of the raw table.
df.describe()

**Check for null and NaN values**

In [None]:
# Number of missing cells in each column.
df.isna().sum()

In [None]:
# `isnull()` is an alias of `isna()`, so this repeats the per-column missing count.
df.isnull().sum()

**Strip leading and trailing whitespace from the values in every column**

In [None]:
# Column labels are kept in `cols` because later cells reuse them.
cols = df.columns

# Flatten every cell into one Series so `.str.strip()` can trim leading/trailing
# whitespace in a single pass, then fold the result back into the table's shape.
flat_cells = pd.Series(df[cols].values.flatten())
trimmed_grid = flat_cells.str.strip().values.reshape(df.shape)

df = pd.DataFrame(trimmed_grid, columns=cols)
df.head()

**Fill the NaN values with zero**

In [None]:
# Replace every missing cell with the integer 0 (rebinding `df` to the filled copy).
df = df.fillna(0)
df.head()

**Symptom severity rank**

In [None]:
# Load the severity table; later cells read its 'Symptom' and 'weight' columns.
df1 = pd.read_csv('Symptom-severity.csv')
df1.head()

**Get overall list of symptoms**

In [None]:
# Distinct symptom names listed in the severity table.
df1['Symptom'].unique()

**Encode symptoms in the data with the symptom rank**

In [None]:
# Work on a private copy of the cell values: `df.values` can hand back a view of the
# frame (read-only under pandas Copy-on-Write), so writing into it either raises or
# silently rewrites `df` behind the reader's back.
vals = df.to_numpy(copy=True)
symptoms = df1['Symptom'].unique()

# Build the symptom -> weight lookup once instead of filtering `df1` per symptom.
# Keeping the first row per symptom matches the previous `.values[0]` behaviour
# when a symptom is listed more than once.
weight_by_symptom = df1.drop_duplicates(subset='Symptom').set_index('Symptom')['weight']

for symptom in symptoms:
    # Overwrite every cell holding this symptom name with its severity weight.
    vals[vals == symptom] = weight_by_symptom[symptom]

# Cells whose text matches no entry in the severity table are left unchanged here.
d = pd.DataFrame(vals, columns=cols)
d.head()

**Assign symptoms with no rank to zero**

In [None]:
# These spellings were not replaced by a weight in the encoding step, so they are
# set to 0 explicitly. `d` keeps the first two substitutions; `df` gets all three.
d = d.replace(['dischromic _patches', 'spotting_ urination'], 0)
df = d.replace('foul_smell_of urine', 0)
df.head()

**Check whether any column consists entirely of zeros, so that such columns can be dropped**

In [None]:
# For each column, True only when every value in that column equals 0.
(df[cols] == 0).all()

In [None]:
# Number of rows per disease label (class balance check).
df['Disease'].value_counts()

**Get the names of diseases from data**

In [None]:
# Distinct disease labels, in order of first appearance (not sorted).
df['Disease'].unique()

**Select the symptom columns as features and the Disease column as the label**

In [None]:
# Feature matrix: every column except the first.
# NOTE(review): assumes 'Disease' is column 0 so no label leaks into the features — confirm.
data = df.iloc[:,1:].values
# Target vector: the disease name for each row.
labels = df['Disease'].values

**Split the data, labels with shuffle**

In [None]:
# Hold out 15% of the rows for evaluation. The shuffle seed is fixed so that
# Restart & Run All reproduces the same split, and therefore the same metrics.
x_train, x_test, y_train, y_test = train_test_split(
    data, labels, shuffle=True, train_size=0.85, random_state=42
)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

**Initialize and train a Support vector classifier**

In [None]:
# Support vector classifier with scikit-learn's default hyperparameters.
model = SVC()
# Fit on the training split only.
model.fit(x_train, y_train)

**Generate predictions on the test set**

In [None]:
# Predict a disease label for every row of the held-out split.
preds = model.predict(x_test)

**Plot the confusion matrix for 25 diseases and calculate f1, accuracy**

In [None]:
# confusion_matrix orders its rows/columns by sorted label value, so the axis names
# must be the sorted class list; `unique()` returns order of first appearance and
# would attach the wrong disease name to each row/column.
# Passing `labels=` also keeps the matrix square when a class is absent from the test split.
class_names = np.unique(df['Disease'].values)
conf_mat = confusion_matrix(y_test, preds, labels=class_names)
df_cm = pd.DataFrame(conf_mat, index=class_names, columns=class_names)
print('F1-score% =', f1_score(y_test, preds, average='macro')*100, '|', 'Accuracy% =', accuracy_score(y_test, preds)*100)

# Rows are the true labels and columns the predicted labels (scikit-learn convention).
fig, ax = plt.subplots()
sns.heatmap(df_cm, ax=ax)
ax.set(xlabel='Predicted disease', ylabel='Actual disease', title='Confusion matrix (test set)');