In [1]:
from sklearn import datasets
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.under_sampling import OneSidedSelection
from imblearn.over_sampling import SMOTE

pd.set_option('display.max_columns', None)
plt.rcParams['figure.figsize'] = (16, 8.27)  #set graphs size to A4 dimensions
sns.set_style('darkgrid')
sns.set(font_scale=1.4)

In [2]:
dataset = pd.read_csv('../works/data/adult.csv')
encoder = LabelEncoder()
dataset['income'] = encoder.fit_transform(dataset['income'])
dataset['native-country'] = np.where(dataset['native-country'] == 'United-States', 1, 0)
dataset['marital-status'] = dataset['marital-status'].replace(
    {' Married-civ-spouse': 'Married', ' Never-married': 'Single',
     ' Separated': 'Divorced', ' Married-spouse-absent': 'Divorced',
     ' Divorced': 'Divorced',
     ' Married-AF-spouse': 'Divorced', ' Widowed': 'Widowed'})
dataset['workclass'] = np.where(dataset['workclass'] == 'Private', 1, 0)
dataset['gender'] = np.where(dataset['gender'] == 'Male', 1, 0)
dataset['race'] = np.where(dataset['race'] == 'White', 1, 0)
education_mapping = {'Preschool': 0, '1st-4th': 1, '5th-6th': 2, '7th-8th': 3, '9th': 4, '10th': 5,
                     '11th': 6, '12th': 7, 'HS-grad': 8, 'Some-college': 0, 'Assoc-acdm': 10,
                     'Assoc-voc': 11, 'Bachelors': 12, 'Prof-school': 13, 'Masters': 14, 'Doctorate': 15
                     }
dataset['education'] = dataset['education'].map(education_mapping)
relationship_ordered = dataset.groupby(['relationship'])['income'].count().sort_values().index
relationship_ordered = {k: i for i, k in enumerate(relationship_ordered, 0)}
dataset['relationship'] = dataset['relationship'].map(relationship_ordered)
occupation_ordered = dataset.groupby(['occupation'])['income'].count().sort_values().index
occupation_ordered = {k: i for i, k in enumerate(occupation_ordered, 0)}
dataset['occupation'] = dataset['occupation'].map(occupation_ordered)
marital_ordered = dataset.groupby(['marital-status'])['income'].count().sort_values().index
marital_ordered = {k: i for i, k in enumerate(marital_ordered, 0)}
dataset['marital-status'] = dataset['marital-status'].map(marital_ordered)
dataset.drop('fnlwgt', axis=1, inplace=True)  # it is not a useful feature for predicting the wage class
scaler = StandardScaler()
scaled_features_balanced_dataset = scaler.fit_transform(dataset.drop('income', axis=1))
scaled_features_balanced_dataset = pd.DataFrame(scaled_features_balanced_dataset,
                                                columns=dataset.drop('income', axis=1).columns)
# undersampling the train set
under = OneSidedSelection()
x_train_res, y_train_res = under.fit_resample(scaled_features_balanced_dataset, dataset['income'])

# oversampling the train set
sm = SMOTE()
x_train_res, y_train_res = sm.fit_resample(x_train_res, y_train_res)

x_train_res = pd.DataFrame(x_train_res, columns=dataset.drop('income', axis=1).columns)

# creating the final train
final_balanced_dataset = pd.concat([x_train_res, y_train_res], axis=1)
y = final_balanced_dataset.income
final_balanced_dataset.drop(labels=['income'], axis=1, inplace=True)
x = final_balanced_dataset
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [61]:
# iris = datasets.load_wine()
# X = pd.DataFrame(iris.data, columns=iris.feature_names)
# y = pd.DataFrame(iris.target)
# diabetes = pd.read_csv('../works/data/diabetes.csv')
# y = diabetes.Outcome
# diabetes.drop(labels=['Outcome'], axis=1, inplace=True)
# X = diabetes
# X.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [3]:
y.value_counts()

0    35657
1    35657
Name: income, dtype: int64

In [4]:
x.head(5)

Unnamed: 0,age,workclass,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country
0,0.974183,0.663711,1.052682,1.136512,-1.918314,0.658995,-1.303025,0.411743,-1.42001,-0.144804,-0.217127,-0.034087,-2.957854
1,-0.995129,0.663711,-0.241697,-1.197259,-0.101107,-0.563611,-0.567717,-2.428701,0.70422,-0.144804,-0.217127,-0.034087,0.338083
2,-0.046942,0.663711,0.189763,-0.419335,0.807497,-1.786217,0.902898,0.411743,0.70422,-0.144804,-0.217127,0.77293,0.338083
3,-1.505691,-1.50668,-1.536076,-0.030373,-0.101107,-0.869262,-0.567717,0.411743,-1.42001,-0.144804,-0.217127,-0.841104,0.338083
4,-0.338691,0.663711,-0.457427,-1.586221,-0.101107,-0.25796,0.16759,0.411743,0.70422,-0.144804,-0.217127,-0.841104,0.338083


In [64]:
scaler = StandardScaler()
X_tmp = x.copy()
X = pd.DataFrame(scaler.fit_transform(X_tmp), columns=x.columns)
X.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.639947,0.848324,0.149641,0.90727,-0.692891,0.204013,0.468492,1.425995
1,-0.844885,-1.123396,-0.160546,0.530902,-0.692891,-0.684422,-0.365061,-0.190672
2,1.23388,1.943724,-0.263941,-1.288212,-0.692891,-1.103255,0.604397,-0.105584
3,-0.844885,-0.998208,-0.160546,0.154533,0.123302,-0.494043,-0.920763,-1.041549
4,-1.141852,0.504055,-1.504687,0.90727,0.765836,1.409746,5.484909,-0.020496


In [65]:
sm = SMOTE()
X_res, y_res = sm.fit_resample(X, y)
y_res.value_counts()

1    500
0    500
Name: Outcome, dtype: int64

In [66]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [5]:
kernels = ['linear', 'rbf', 'poly']
params = {
    'kernel': 'linear',
    'tol': 0.01,
    'C': 5,
}

In [6]:
model = SVC(**params)

In [8]:
model.fit(x_train, y_train.values.ravel())

KeyboardInterrupt: 

In [70]:
y_pred = model.predict(X_test)

In [71]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.88      0.82       107
           1       0.58      0.38      0.46        47

    accuracy                           0.73       154
   macro avg       0.67      0.63      0.64       154
weighted avg       0.71      0.73      0.71       154

