In [1]:
!wget "https://raw.githubusercontent.com/pradeep-016/ML_Course/refs/heads/main/2)%20Classification%20Models/Support%20Vector%20Machines%20(SVM)/cerv_cancer.json"

--2024-11-29 07:58:39--  https://raw.githubusercontent.com/pradeep-016/ML_Course/refs/heads/main/2)%20Classification%20Models/Support%20Vector%20Machines%20(SVM)/cerv_cancer.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 979901 (957K) [text/plain]
Saving to: ‘cerv_cancer.json’


2024-11-29 07:58:40 (16.3 MB/s) - ‘cerv_cancer.json’ saved [979901/979901]



In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbalanced_Pipeline
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score

In [3]:
data = pd.read_json('/content/cerv_cancer.json')

In [4]:
data.head()

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs,Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
0,18,4,15,1,0,0,{'year)': '0'},0,0,0,...,?,?,0,0,0,0,0,0,0,0
1,15,1,14,1,0,0,{'year)': '0'},0,0,0,...,?,?,0,0,0,0,0,0,0,0
2,34,1,?,1,0,0,{'year)': '0'},0,0,0,...,?,?,0,0,0,0,0,0,0,0
3,52,5,16,4,1,37,{'year)': '37'},1,3,0,...,?,?,1,0,1,0,0,0,0,0
4,46,3,21,4,0,0,{'year)': '0'},1,15,0,...,?,?,0,0,0,0,0,0,0,0


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 968 entries, 0 to 967
Data columns (total 36 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   Age                                 968 non-null    int64 
 1   Number of sexual partners           968 non-null    object
 2   First sexual intercourse            968 non-null    object
 3   Num of pregnancies                  968 non-null    object
 4   Smokes                              968 non-null    object
 5   Smokes (years)                      968 non-null    object
 6   Smokes (packs                       968 non-null    object
 7   Hormonal Contraceptives             968 non-null    object
 8   Hormonal Contraceptives (years)     968 non-null    object
 9   IUD                                 968 non-null    object
 10  IUD (years)                         968 non-null    object
 11  STDs                                968 non-null    object

In [6]:
cols_to_numeric = [
    'Number of sexual partners', 'First sexual intercourse', 'Num of pregnancies',
    'Smokes (years)', 'Hormonal Contraceptives (years)', 'IUD (years)',
    'STDs (number)', 'STDs: Time since first diagnosis', 'STDs: Time since last diagnosis'
]

In [7]:
data[cols_to_numeric] = data[cols_to_numeric].apply(pd.to_numeric, errors='coerce')

In [8]:
print(data.isnull().sum())

Age                                     0
Number of sexual partners              29
First sexual intercourse               10
Num of pregnancies                     70
Smokes                                  0
Smokes (years)                         16
Smokes (packs                           0
Hormonal Contraceptives                 0
Hormonal Contraceptives (years)       122
IUD                                     0
IUD (years)                           133
STDs                                    0
STDs (number)                         122
STDs:condylomatosis                     0
STDs:cervical condylomatosis            0
STDs:vaginal condylomatosis             0
STDs:vulvo-perineal condylomatosis      0
STDs:syphilis                           0
STDs:pelvic inflammatory disease        0
STDs:genital herpes                     0
STDs:molluscum contagiosum              0
STDs:AIDS                               0
STDs:HIV                                0
STDs:Hepatitis B                  

In [9]:
def extract_value(val):
    if isinstance(val, dict):
        return val.get('year)', np.nan)
        return val

In [10]:
data['Smokes (packs'] = data['Smokes (packs'].apply(extract_value)

In [11]:
X = data.drop('Dx:Cancer', axis=1)
y = data['Dx:Cancer']

In [12]:
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

In [13]:
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

In [14]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [15]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

In [16]:
smote = SMOTE(sampling_strategy='auto', random_state=42)

In [17]:
model = imbalanced_Pipeline([
    ('preprocessor', preprocessor),
    ('smote', smote),
    ('classifier', SVC(class_weight='balanced'))
])

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [19]:
model.fit(X_train, y_train)

In [20]:
y_pred = model.predict(X_test)

In [21]:
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

ac = accuracy_score(y_test, y_pred)
print("Accuracy:", ac)

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       189
           1       0.71      1.00      0.83         5

    accuracy                           0.99       194
   macro avg       0.86      0.99      0.91       194
weighted avg       0.99      0.99      0.99       194

Confusion Matrix:
[[187   2]
 [  0   5]]
Accuracy: 0.9896907216494846
