In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Load datasets
train_data = pd.read_csv('Training.csv')
test_data = pd.read_csv('Testing.csv')

# Drop any unnamed columns that may have been added during the CSV save process
train_data = train_data.loc[:, ~train_data.columns.str.startswith('Unnamed')]
test_data = test_data.loc[:, ~test_data.columns.str.startswith('Unnamed')]

# Label column. The SAME column must be the target in both splits: the
# original code trained on 'skin_rash' but scored against 'itching', which
# also caused the real 'skin_rash' values to be dropped from the test
# features and 'itching' to be replaced by a column of zeros.
TARGET_COLUMN = 'skin_rash'

# Separate features and labels (raises KeyError if a split lacks the target)
X_train = train_data.drop(columns=TARGET_COLUMN)
y_train = train_data[TARGET_COLUMN]
X_test = test_data.drop(columns=TARGET_COLUMN)
y_test = test_data[TARGET_COLUMN]

# Align the test features with the training schema in one step: columns
# missing from the test set are created and filled with 0, columns unknown to
# the training set are discarded, and the column order matches X_train.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Split the feature columns by dtype: numeric dtypes go to the numerical
# branch, object dtype goes to the categorical branch.
numerical_columns = X_train.select_dtypes(include=[np.number]).columns
categorical_columns = X_train.select_dtypes(include=['object']).columns

# Numerical branch: fill missing values with the column mean, then standardize.
numerical_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(numerical_steps)

# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode; categories unseen during fit are ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(categorical_steps)

# Route each group of columns through its own branch.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_columns),
    ('cat', categorical_transformer, categorical_columns),
])

# Full model: preprocessing followed by a linear-kernel SVM classifier.
svm_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', SVC(kernel='linear')),
])

# Fit the preprocessing + SVM pipeline on the training split.
svm_pipeline.fit(X_train, y_train)

# Predictions for both splits (training first, then testing).
y_train_pred = svm_pipeline.predict(X_train)
y_test_pred = svm_pipeline.predict(X_test)

# Each entry: (heading to print, metric function, true labels, predictions).
# The entries are listed in the exact order they are reported.
evaluation_reports = (
    ("Training Confusion Matrix:", confusion_matrix, y_train, y_train_pred),
    ("\nTraining Classification Report:", classification_report, y_train, y_train_pred),
    ("\nTraining Accuracy Score:", accuracy_score, y_train, y_train_pred),
    ("\nTesting Confusion Matrix:", confusion_matrix, y_test, y_test_pred),
    ("\nTesting Classification Report:", classification_report, y_test, y_test_pred),
    ("\nTesting Accuracy Score:", accuracy_score, y_test, y_test_pred),
)

# Print evaluation metrics
for heading, metric, y_true, y_pred in evaluation_reports:
    print(heading)
    print(metric(y_true, y_pred))


Training Confusion Matrix:
[[4080   54]
 [   0  786]]

Training Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      4134
           1       0.94      1.00      0.97       786

    accuracy                           0.99      4920
   macro avg       0.97      0.99      0.98      4920
weighted avg       0.99      0.99      0.99      4920


Training Accuracy Score:
0.9890243902439024

Testing Confusion Matrix:
[[31  4]
 [ 3  4]]

Testing Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.89      0.90        35
           1       0.50      0.57      0.53         7

    accuracy                           0.83        42
   macro avg       0.71      0.73      0.72        42
weighted avg       0.84      0.83      0.84        42


Testing Accuracy Score:
0.8333333333333334
