<!-- In machine learning, a support vector machine (SVM) classifies data by finding the decision boundary (hyperplane) that separates the classes with the largest possible margin. The training points closest to that boundary are the support vectors, and they alone determine where it lies. Combined with kernel functions, SVMs can also classify data that is not linearly separable. -->

In [1]:
import numpy as np # linear algebra
import pandas as pd

In [2]:
# import libraries for plotting
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
import os
from pathlib import Path

# Location of the adult.csv dataset. Set the ADULT_CSV environment variable to
# point at your own copy; when it is unset, the original author's local path is
# used, so existing behaviour is unchanged.
DATA_PATH = Path(os.environ.get("ADULT_CSV", r'C:\Users\sisir.sahu\Downloads\Machine Learning\SVC\adult.csv'))

data = pd.read_csv(DATA_PATH)

In [4]:
# Count NaN values per column; "?" placeholders in the raw file are not counted as missing here
data.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64

In [5]:
# (rows, columns) of the loaded dataset
data.shape

(32561, 15)

In [6]:
# Preview the first five rows; note the "?" placeholders in some categorical columns
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [8]:
# Turn every "?" placeholder into NaN so pandas treats it as a missing value
data = data.replace("?", np.nan)

In [9]:
# Preview again to confirm the "?" entries are now missing values
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,,77053,HS-grad,9,Widowed,,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,,186061,Some-college,10,Widowed,,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [10]:
# Count missing values per column again, now that "?" has been replaced with NaN
data.isna().sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     583
income               0
dtype: int64

In [11]:
# Replace missing categorical values with the column's most frequent value (mode).
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on data[feature]: that form is chained assignment, which
# raises a FutureWarning in pandas 2.x and no longer updates `data` once
# copy-on-write is enabled.
for feature in ['workclass', 'occupation', 'native.country']:
    data[feature] = data[feature].fillna(data[feature].mode()[0])

In [12]:
# Split data into features (x) and target (y).
# The last column is the target; every column before it is a feature.
# .copy() gives x its own storage, so overwriting its columns during encoding
# does not trigger SettingWithCopyWarning and never depends on `data`.
x = data.iloc[:, :-1].copy()

# Keep the target as a column vector of shape (n_samples, 1).
y = data.iloc[:, -1].values.reshape(-1, 1)

In [13]:
# Collect the names of the feature columns stored with object dtype
cat_obj = [name for name, dtype in x.dtypes.items() if dtype == 'O']

In [14]:
# Show the object-dtype (categorical) feature column names
cat_obj

['workclass',
 'education',
 'marital.status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native.country']

In [15]:
# Convert the categorical feature columns to numeric codes
from sklearn.preprocessing import OrdinalEncoder
oe = OrdinalEncoder()

# Fit the encoder once on all categorical columns. Each column is still encoded
# independently, so the codes are the same as fitting column by column, but `oe`
# now remembers the categories of every column (re-fitting it inside a loop
# would leave it holding only the last column's categories).
if cat_obj:
    encoded = oe.fit_transform(x[cat_obj])
    # Write the codes back one column at a time so each column becomes numeric.
    for position, feature in enumerate(cat_obj):
        x[feature] = encoded[:, position]

In [16]:
#split x and y into training and test dataset
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the split reproducible
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)

In [17]:
# Standardise the features: the scaler is fitted on the training split only,
# then the same fitted transformation is applied to both splits.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(x_train)

x_train, x_test = scaler.transform(x_train), scaler.transform(x_test)

In [18]:
# Encode the target labels as integers (this is label encoding, not scaling)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# LabelEncoder expects a 1-D array. The target was reshaped to a column vector
# when it was split off, so flatten it here to avoid scikit-learn's
# DataConversionWarning. np.ravel leaves an already 1-D array unchanged.
y_train = le.fit_transform(np.ravel(y_train))
y_test  = le.transform(np.ravel(y_test))

  return f(*args, **kwargs)


In [19]:
# Train a support vector classifier with its default settings on the training split
from sklearn.svm import SVC
classifier = SVC().fit(x_train, y_train)

# Predict labels for the held-out test features
y_pred = classifier.predict(x_test)

In [20]:
# Confusion matrix of the true test labels (y_test) against the predictions (y_pred);
# rows correspond to the true class and columns to the predicted class
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[4701  275]
 [ 708  829]]


In [21]:
# Accuracy score: fraction of test samples predicted correctly
# (this is overall accuracy, not recall; per-class recall is in the classification report)
from sklearn.metrics import accuracy_score
ac = accuracy_score(y_test, y_pred)
ac

0.8490710885920467

In [22]:
# Classification report: per-class precision, recall, f1-score and support
from sklearn.metrics import classification_report

# classification_report returns a pre-formatted multi-line string. Print it so
# the table is rendered, instead of showing the escaped one-line repr of the string.
print(classification_report(y_test, y_pred))

'              precision    recall  f1-score   support\n\n           0       0.87      0.94      0.91      4976\n           1       0.75      0.54      0.63      1537\n\n    accuracy                           0.85      6513\n   macro avg       0.81      0.74      0.77      6513\nweighted avg       0.84      0.85      0.84      6513\n'

In [23]:
# Check for over- or under-fitting by comparing accuracy on the training split
# with accuracy on the held-out test split. A large gap would point to
# overfitting; similar scores mean the model generalises about as well as it fits.
# Note: these are accuracy scores, not the statistical bias and variance of the model.
train_accuracy = classifier.score(x_train, y_train)
test_accuracy = classifier.score(x_test, y_test)

# Legacy names kept so any later cell that reads `bias` / `variance` still works.
bias, variance = train_accuracy, test_accuracy
print(train_accuracy, test_accuracy)

0.8564573095823096 0.8490710885920467
