<a href="https://colab.research.google.com/github/sadrabr/machin-learning-projects/blob/main/fetal_health.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from ast import increment_lineno
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns # type: ignore
%matplotlib inline

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
df = pd.read_csv('medical_conditions_dataset.csv')
df.head()

Unnamed: 0,id,full_name,age,gender,smoking_status,bmi,blood_pressure,glucose_levels,condition
0,1,User0001,,male,Non-Smoker,,,,Pneumonia
1,2,User0002,30.0,male,Non-Smoker,,105.315064,,Diabetic
2,3,User0003,18.0,male,Non-Smoker,35.612486,,,Pneumonia
3,4,User0004,,male,Non-Smoker,,99.119829,,Pneumonia
4,5,User0005,76.0,male,Non-Smoker,,,,Diabetic


In [None]:
df.nunique()

id                10000
full_name         10000
age                  72
gender                2
smoking_status        2
bmi                4652
blood_pressure     3766
glucose_levels     4756
condition             3
dtype: int64

In [None]:
df.isnull().sum()

id                   0
full_name            0
age               4555
gender               0
smoking_status       0
bmi               5348
blood_pressure    6234
glucose_levels    5244
condition            0
dtype: int64

In [None]:
df.drop_duplicates().sum()

id                                                         50005000
full_name         User0001User0002User0003User0004User0005User00...
age                                                        291534.0
gender            malemalemalemalemalemalemalemalemalemalefemale...
smoking_status    Non-SmokerNon-SmokerNon-SmokerNon-SmokerNon-Sm...
bmi                                                   127573.749242
blood_pressure                                         509198.70846
glucose_levels                                        643104.457728
condition         PneumoniaDiabeticPneumoniaPneumoniaDiabeticDia...
dtype: object

In [None]:
df.drop_duplicates()

Unnamed: 0,id,full_name,age,gender,smoking_status,bmi,blood_pressure,glucose_levels,condition
0,1,User0001,,male,Non-Smoker,,,,Pneumonia
1,2,User0002,30.0,male,Non-Smoker,,105.315064,,Diabetic
2,3,User0003,18.0,male,Non-Smoker,35.612486,,,Pneumonia
3,4,User0004,,male,Non-Smoker,,99.119829,,Pneumonia
4,5,User0005,76.0,male,Non-Smoker,,,,Diabetic
...,...,...,...,...,...,...,...,...,...
9995,9996,User9996,,male,Non-Smoker,25.029002,152.540355,137.551451,Pneumonia
9996,9997,User9997,,male,Non-Smoker,27.017487,,,Diabetic
9997,9998,User9998,23.0,male,Smoker,,148.833321,173.931480,Pneumonia
9998,9999,User9999,,female,Non-Smoker,,,,Pneumonia


In [None]:
df.dtypes

id                  int64
full_name          object
age               float64
gender             object
smoking_status     object
bmi               float64
blood_pressure    float64
glucose_levels    float64
condition          object
dtype: object

In [None]:
df = df.drop(['id','full_name'],axis=1)

In [None]:
df_numeric_features = df.select_dtypes(include=['number'])
print(f"Numeric Features :\n{df_numeric_features}\n")

df_object_features= df.select_dtypes(include=['object'])
print(f"Object Features :\n{df_object_features}\n")

Numeric Features :
       age        bmi  blood_pressure  glucose_levels
0      NaN        NaN             NaN             NaN
1     30.0        NaN      105.315064             NaN
2     18.0  35.612486             NaN             NaN
3      NaN        NaN       99.119829             NaN
4     76.0        NaN             NaN             NaN
...    ...        ...             ...             ...
9995   NaN  25.029002      152.540355      137.551451
9996   NaN  27.017487             NaN             NaN
9997  23.0        NaN      148.833321      173.931480
9998   NaN        NaN             NaN             NaN
9999  27.0  25.454891             NaN      196.083267

[10000 rows x 4 columns]

Object Features :
      gender smoking_status  condition
0       male     Non-Smoker  Pneumonia
1       male     Non-Smoker   Diabetic
2       male     Non-Smoker  Pneumonia
3       male     Non-Smoker  Pneumonia
4       male     Non-Smoker   Diabetic
...      ...            ...        ...
9995    male   

In [None]:
from sklearn.impute import SimpleImputer

imputer2 = SimpleImputer(strategy='mean')
df_numeric_features = pd.DataFrame(imputer2.fit_transform(df_numeric_features), columns=df_numeric_features.columns)

imputer1 = SimpleImputer(strategy='most_frequent')
df_object_features = pd.DataFrame(imputer1.fit_transform(df_object_features), columns=df_object_features.columns)
df_object_features.isnull().sum()
df_numeric_features.isnull().sum()

age               0
bmi               0
blood_pressure    0
glucose_levels    0
dtype: int64

In [None]:
def find_outliers(column):
    Q1 = column.quantile(0.25)
    Q3 = column.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return column[(column < lower_bound) | ( upper_bound < column)]

for i, column in enumerate(df_numeric_features.columns):
    outliers = find_outliers(df_numeric_features[column])
    if(outliers.size>0):
      print(f"outliers - {column} :\n{outliers}")

outliers - age :
1       30.0
2       18.0
4       76.0
5       40.0
13      72.0
        ... 
9990    83.0
9992    85.0
9994    34.0
9997    23.0
9999    27.0
Name: age, Length: 3577, dtype: float64
outliers - bmi :
2       35.612486
5       33.840723
8       39.649679
11      34.441275
12      20.698672
          ...    
9990    26.125438
9991    31.785857
9995    25.029002
9996    27.017487
9999    25.454891
Name: bmi, Length: 4652, dtype: float64
outliers - blood_pressure :
1       105.315064
3        99.119829
7       115.826322
13      165.969984
14      117.464790
           ...    
9988    132.998849
9992     98.812356
9993    127.165293
9995    152.540355
9997    148.833321
Name: blood_pressure, Length: 3766, dtype: float64
outliers - glucose_levels :
6       153.151126
7       199.339699
10      187.633751
11      158.375034
17      135.330911
           ...    
9990    185.317924
9994    181.152892
9995    137.551451
9997    173.931480
9999    196.083267
Name: glucose_levels

In [None]:
# import scipy.stats as stats

# df_numeric_features['zscore_bmi'] = stats.zscore(df_numeric_features['bmi'])
# df_numeric_features['zscore_age'] = stats.zscore(df_numeric_features['age'])
# df_numeric_features.head()

In [None]:
# df_numeric_features.reset_index(drop=True, inplace=True)
# df_object_features.reset_index(drop=True, inplace=True)

print("null num", df_object_features.isnull().sum())
df = pd.concat([df_object_features,df_numeric_features], axis=1)

null num gender            0
smoking_status    0
condition         0
dtype: int64


In [None]:
# df=df[(-3< df['zscore_age']) & (df['zscore_age']<3) & (-3< df['zscore_bmi']) & (df['zscore_bmi']<3)]
# df.head()

In [None]:
df['condition'].value_counts().to_frame()

Unnamed: 0_level_0,count
condition,Unnamed: 1_level_1
Diabetic,6013
Pneumonia,2527
Cancer,1460


In [None]:
# df = df.drop(['zscore_age','zscore_bmi'],axis=1)

In [None]:
import category_encoders as ce
oe = ce.OrdinalEncoder()
df = oe.fit_transform(df)
df

Unnamed: 0,gender,smoking_status,condition,age,bmi,blood_pressure,glucose_levels
0,1,1,1,53.541598,27.423420,135.209429,135.219608
1,1,1,2,30.000000,27.423420,105.315064,135.219608
2,1,1,1,18.000000,35.612486,135.209429,135.219608
3,1,1,1,53.541598,27.423420,99.119829,135.219608
4,1,1,2,76.000000,27.423420,135.209429,135.219608
...,...,...,...,...,...,...,...
9995,1,1,1,53.541598,25.029002,152.540355,137.551451
9996,1,1,2,53.541598,27.017487,135.209429,135.219608
9997,1,2,1,23.000000,27.423420,148.833321,173.931480
9998,2,1,1,53.541598,27.423420,135.209429,135.219608


In [None]:
from sklearn.model_selection import train_test_split

x = df.drop('condition',axis=1)
y = df['condition']

x_train,x_test,y_train,y_test =train_test_split(x,y,test_size=0.2,random_state=42)

In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score,recall_score,precision_score,f1_score,classification_report
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.ensemble import RandomForestClassifier

In [None]:
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.fit_transform(x_test)

In [None]:
rf = RandomForestClassifier(random_state=42)
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)

print("confusion_matrix:\n", confusion_matrix(y_test, y_pred))
print("accuracy_score:", accuracy_score(y_test, y_pred))
print("recall_score:", recall_score(y_test, y_pred,average='micro'))
print("precision_score:", precision_score(y_test, y_pred,average='micro'))
print(f"\nClassification Report:\n{classification_report(y_test, y_pred )}")

confusion_matrix:
 [[140 319  26]
 [378 778  61]
 [ 96 190  12]]
accuracy_score: 0.465
recall_score: 0.465
precision_score: 0.465

Classification Report:
              precision    recall  f1-score   support

           1       0.23      0.29      0.25       485
           2       0.60      0.64      0.62      1217
           3       0.12      0.04      0.06       298

    accuracy                           0.47      2000
   macro avg       0.32      0.32      0.31      2000
weighted avg       0.44      0.47      0.45      2000



In [None]:
from sklearn.svm import SVC
svc=SVC(C=100)
svc.fit(x_train,y_train)
y_pred=svc.predict(x_test)
print("confusion_matrix:\n", confusion_matrix(y_test, y_pred))
print("accuracy_score:", accuracy_score(y_test, y_pred))
print("recall_score:", recall_score(y_test, y_pred,average='micro'))
print("precision_score:", precision_score(y_test, y_pred,average='micro'))
print(f"\nClassification Report:\n{classification_report(y_test, y_pred )}")

confusion_matrix:
 [[  11  472    2]
 [   9 1202    6]
 [   3  293    2]]
accuracy_score: 0.6075
recall_score: 0.6075
precision_score: 0.6075

Classification Report:
              precision    recall  f1-score   support

           1       0.48      0.02      0.04       485
           2       0.61      0.99      0.76      1217
           3       0.20      0.01      0.01       298

    accuracy                           0.61      2000
   macro avg       0.43      0.34      0.27      2000
weighted avg       0.52      0.61      0.47      2000



In [None]:
linear_svc1000=SVC(C=100,kernel='linear')
linear_svc1000.fit(x_train, y_train)
y_pred=linear_svc1000.predict(x_test)
print("confusion_matrix:\n", confusion_matrix(y_test, y_pred))
print("accuracy_score:", accuracy_score(y_test, y_pred))
print("recall_score:", recall_score(y_test, y_pred,average='micro'))
print("precision_score:", precision_score(y_test, y_pred,average='micro'))
print(f"\nClassification Report:\n{classification_report(y_test, y_pred )}")
