First, we load the dataset and perform exploratory data analysis (EDA) on it. Here, I chose the column_3C_weka.csv dataset for analysis.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
# Read the dataset from the input directory into a DataFrame.
csv_path = '../input/column_3C_weka.csv'
df = pd.read_csv(csv_path)

In [4]:
# Preview the first five rows (five is head()'s default).
df.head()

In [5]:
# Print the column names, dtypes and non-null counts.
df.info()

In [6]:
# Summary statistics (count, mean, std, quartiles, ...) for the columns.
df.describe()

In [7]:
# Column names as a plain array.
df.columns.values

In [8]:
# Distinct labels that occur in the target column.
df['class'].unique()

In [9]:
# Encode the string labels as integers for plotting and modelling.
class_codes = {'Hernia': 1,
               'Spondylolisthesis': 2,
               'Normal': 3}

# Only encode while the column still holds the string labels. Re-running this
# cell on the already-encoded integers would find no matching keys and would
# silently replace every label with NaN.
if df['class'].isin(list(class_codes)).any():
    df['class'] = df['class'].map(class_codes)

In [10]:
# Pairwise plots of the columns, coloured by the encoded class label.
sns.pairplot(df, hue='class')

In [13]:
# Draw one boxplot per feature, grouped by class, on a 2x3 grid of subplots.
var_names = df.columns.values[:-1]  # every column except the last one
plt.figure(figsize=(20, 10))

for position, feature in enumerate(var_names, start=1):
    plt.subplot(2, 3, position)
    sns.boxplot(x='class', y=feature, data=df)

We are going to use Support Vector Machines, K-Nearest Neighbors, Decision Trees, and Random Forests to build models for this dataset.

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# cross_val_score, cross_val_predict and StratifiedKFold live in model_selection;
# the old sklearn.cross_validation module has been removed from scikit-learn.
from sklearn.model_selection import (
    KFold,
    StratifiedKFold,
    cross_val_predict,
    cross_val_score,
    train_test_split,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [15]:
# Separate the target column (y) from the feature matrix (x).
target_column = 'class'
y = df[target_column]
x = df.drop([target_column], axis=1)

SVM model

In [16]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(x, y, test_size=0.30, random_state=101)
x_train, x_test, y_train, y_test = split_parts

In [17]:
# Fit an SVC with default hyper-parameters and predict the held-out rows.
model_svc = SVC().fit(x_train, y_train)  # fit() returns the estimator itself
svc_predictions = model_svc.predict(x_test)

In [18]:
# Confusion matrix of true vs. predicted labels for the default SVC.
print(confusion_matrix(y_test, svc_predictions))

In [19]:
# Per-class precision, recall and F1 for the default SVC.
print(classification_report(y_test, svc_predictions))

In [20]:
# Overall hold-out accuracy of the default SVC.
print(accuracy_score(y_test, svc_predictions))

The result shows that everything has been classified into class 2. We have to adjust the parameters.

In [21]:
# Search space for the RBF-kernel SVM: five candidate values each for C and gamma.
param_grid = dict(
    C=[0.1, 1, 10, 100, 1000],
    gamma=[1, 0.1, 0.01, 0.001, 0.0001],
    kernel=['rbf'],
)

In [22]:
from sklearn.model_selection import GridSearchCV

In [23]:
# Grid search over param_grid; refit=True retrains the best setting on all the
# data passed to fit(), and verbose=3 logs each candidate as it is evaluated.
grid = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    refit=True,
    verbose=3,
)

In [24]:
# Run the search using the training split only.
grid.fit(x_train, y_train)

In [25]:
# Hyper-parameter combination with the best cross-validated score.
grid.best_params_

In [26]:
# The refitted estimator that uses the best hyper-parameters.
grid.best_estimator_

In [27]:
# Predict the held-out rows with the refitted best estimator.
grid_predictions = grid.predict(x_test)

In [28]:
# Confusion matrix of true vs. predicted labels for the tuned SVC.
print(confusion_matrix(y_test, grid_predictions))

In [29]:
# Per-class precision, recall and F1 for the tuned SVC.
print(classification_report(y_test, grid_predictions))

In [30]:
# Overall hold-out accuracy of the tuned SVC.
print(accuracy_score(y_test, grid_predictions))

Now we are going to use 10-fold cross-validation to test the model. Because the observations fall into three classes of unequal size, it is better to use stratified folds, so that each fold keeps roughly the same class proportions as the full dataset.

In [31]:
# 10-fold stratified splitter, reused by every cross_val_score call below.
# The model_selection API takes only the number of splits here; the labels are
# supplied later, when cross_val_score calls the splitter with (x, y).
stratif = StratifiedKFold(n_splits=10)

In [32]:
# Cross-validate an SVC with fixed C and gamma on the full dataset using the stratified folds.
tuned_svc = SVC(C=10, gamma=0.0001)
scores_svc = cross_val_score(tuned_svc, x, y, cv=stratif)

In [33]:
# Per-fold scores for the SVC.
scores_svc

In [34]:
# Mean and standard deviation of the SVC fold scores, one per line.
print(scores_svc.mean(), scores_svc.std(), sep='\n')

K-Nearest Neighbors

For KNN, the data needs to be standardized.

In [35]:
from sklearn.preprocessing import StandardScaler

In [36]:
# Scaler that standardizes each feature (zero mean, unit variance).
scaler = StandardScaler()

In [37]:
# Standardize every feature column (everything except 'class').
# NOTE(review): the scaler is fit on every row of the dataset, so any hold-out
# or cross-validation evaluation that reuses these values has already seen
# statistics computed from its test rows.
scaled_features = scaler.fit_transform(df.drop('class', axis=1))

In [38]:
# Wrap the scaled array in a DataFrame with the original feature names and preview it.
feature_names = df.columns[:-1]  # every column except the last one
df_feat = pd.DataFrame(scaled_features, columns=feature_names)
df_feat.head(3)

Now the features in the data have been standardized. We then fit the model. Here we set the number of neighbors to 1 and check the model accuracy; if the accuracy is low, we will come back and adjust this parameter.

In [39]:
# Split the raw features first (same 70/30 split and seed as before) ...
x_knn_train_raw, x_knn_test_raw, y_knn_train, y_knn_test = train_test_split(
    x, y, test_size=0.30, random_state=101)

# ... then learn the scaling statistics from the training rows only and apply
# them to both parts. Fitting the scaler on the whole dataset before splitting
# would leak the test rows' mean and variance into the training features.
knn_scaler = StandardScaler().fit(x_knn_train_raw)
x_knn_train = knn_scaler.transform(x_knn_train_raw)
x_knn_test = knn_scaler.transform(x_knn_test_raw)

In [41]:
# Baseline KNN: classify each held-out row by its single nearest training neighbour.
model_knn = KNeighborsClassifier(n_neighbors=1).fit(x_knn_train, y_knn_train)
knn_predictions = model_knn.predict(x_knn_test)

In [42]:
# Confusion matrix of true vs. predicted labels for the 1-neighbour model.
print(confusion_matrix(y_knn_test,knn_predictions))

In [43]:
# Per-class precision, recall and F1 for the 1-neighbour model.
print(classification_report(y_knn_test,knn_predictions))

The result seems to be acceptable, but we are going to investigate the effect of the number of neighbors on the model accuracy.

In [44]:
# Hold-out misclassification rate for every neighbour count k = 1..74:
# fit on the training split, predict the test split, and average the mismatches.
error_rate = [
    np.mean(
        KNeighborsClassifier(n_neighbors=k)
        .fit(x_knn_train, y_knn_train)
        .predict(x_knn_test)
        != y_knn_test
    )
    for k in range(1, 75)
]

In [46]:
# Lowest error rate observed over all tested neighbour counts.
min(error_rate)

In [47]:
# Report every neighbour count that reaches the lowest error rate.
# error_rate[0] belongs to k=1, so enumeration starts at 1.
lowest_error = min(error_rate)  # computed once instead of on every iteration
for k, rate in enumerate(error_rate, start=1):
    if rate == lowest_error:
        print('The number of neighbors that gives the lowest error rate is:', k)

In [48]:
# Plot the hold-out error rate against the number of neighbours (k = 1..74).
k_values = range(1, 75)
line_style = dict(color='blue', linestyle='dashed', marker='o',
                  markerfacecolor='red', markersize=10)

plt.figure(figsize=(10, 6))
plt.plot(k_values, error_rate, **line_style)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')

Now we are going to refit the model with n_neighbors = 30.

In [49]:
# Refit KNN with 30 neighbours and predict the held-out rows again.
model_knn_refit = KNeighborsClassifier(n_neighbors=30).fit(x_knn_train, y_knn_train)
knn_predictions_refit = model_knn_refit.predict(x_knn_test)

In [50]:
# Confusion matrix of true vs. predicted labels for the 30-neighbour model.
print(confusion_matrix(y_knn_test,knn_predictions_refit))

In [51]:
# Per-class precision, recall and F1 for the 30-neighbour model.
print(classification_report(y_knn_test,knn_predictions_refit))

In [52]:
# Overall hold-out accuracy of the 30-neighbour model.
print(accuracy_score(y_knn_test, knn_predictions_refit))

We can see that the model accuracy has improved after setting n_neighbors to 30.

In [53]:
# Cross-validate scaling + KNN as one pipeline on the raw features, so the
# scaler is re-fit on each fold's training part only. Scoring features that
# were standardized with whole-dataset statistics would leak information from
# each validation fold into its training data.
knn_cv_model = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=30))
scores_knn = cross_val_score(knn_cv_model, x, y, cv=stratif)

In [54]:
# Mean and standard deviation of the KNN fold scores, one per line.
print(scores_knn.mean(), scores_knn.std(), sep='\n')

Decision Trees

In [55]:
# 70/30 hold-out split for the decision tree. The seed matches the SVM and KNN
# splits, so results are repeatable across runs and every model is evaluated
# on the same held-out rows.
x_dt_train, x_dt_test, y_dt_train, y_dt_test = train_test_split(
    x, y, test_size=0.30, random_state=101)

In [56]:
# Decision tree with default settings. The seed fixes the random tie-breaking
# between equally good splits, so re-running yields the same tree.
model_dt = DecisionTreeClassifier(random_state=101)

In [57]:
# Train the decision tree on the training split.
model_dt.fit(x_dt_train, y_dt_train)

In [58]:
# Predict the held-out rows with the fitted tree.
dt_predictions = model_dt.predict(x_dt_test)

In [59]:
# Confusion matrix of true vs. predicted labels for the decision tree.
print(confusion_matrix(y_dt_test, dt_predictions))

In [60]:
# Per-class precision, recall and F1 for the decision tree.
print(classification_report(y_dt_test, dt_predictions))

In [61]:
# Overall hold-out accuracy of the decision tree.
print(accuracy_score(y_dt_test, dt_predictions))

In [62]:
# Cross-validate a decision tree on the stratified folds. The tree is seeded so
# the fold scores, and the summary table built from them, are the same on every run.
scores_dt = cross_val_score(DecisionTreeClassifier(random_state=101), x, y, cv=stratif)

In [63]:
# Mean and standard deviation of the decision-tree fold scores, one per line.
print(scores_dt.mean(), scores_dt.std(), sep='\n')

Random forest

In [64]:
# 70/30 hold-out split for the random forest. The seed matches the SVM and KNN
# splits, so results are repeatable across runs and every model is evaluated
# on the same held-out rows.
x_rf_train, x_rf_test, y_rf_train, y_rf_test = train_test_split(
    x, y, test_size=0.30, random_state=101)

In [65]:
# Random forest of 100 trees. The seed fixes the bootstrap samples and feature
# subsets, so re-running the notebook reproduces the same forest.
model_rf = RandomForestClassifier(n_estimators=100, random_state=101)
model_rf.fit(x_rf_train, y_rf_train)

In [66]:
# Predict the held-out rows with the fitted forest.
rf_predictions = model_rf.predict(x_rf_test)

In [67]:
# Confusion matrix of true vs. predicted labels for the random forest.
print(confusion_matrix(y_rf_test, rf_predictions))

In [68]:
# Per-class precision, recall and F1 for the random forest.
print(classification_report(y_rf_test, rf_predictions))

In [69]:
# Overall hold-out accuracy of the random forest.
print(accuracy_score(y_rf_test, rf_predictions))

In [70]:
# Cross-validate a 100-tree random forest on the stratified folds. The forest
# is seeded so the fold scores, and the summary table built from them, are the
# same on every run.
scores_rf = cross_val_score(
    RandomForestClassifier(n_estimators=100, random_state=101), x, y, cv=stratif)

In [71]:
# Mean and standard deviation of the random-forest fold scores, one per line.
print(scores_rf.mean(), scores_rf.std(), sep='\n')

Now we are going to summarize our analysis results by making a table of the 10-fold cross validation test.

In [72]:
# Collect the model labels with the mean and spread of their fold scores,
# keeping the same order (SVM, KNN, DT, RF) in all three lists.
list_name = ['SVM', 'KNN', 'DT', 'RF']
fold_scores = [scores_svc, scores_knn, scores_dt, scores_rf]
list_mean = [scores.mean() for scores in fold_scores]
list_std = [scores.std() for scores in fold_scores]

In [73]:
# Assemble the summary table: one row per model, one column per statistic.
summary_columns = dict(zip(
    ['Model', 'Mean Scores', 'Standard Deviation'],
    [list_name, list_mean, list_std],
))
cross_validation_results = pd.DataFrame(summary_columns)

In [74]:
# Display the table with the columns in a fixed, readable order.
column_names = ['Model', 'Mean Scores', 'Standard Deviation']
cross_validation_results[column_names]

We can see that the SVM seems to be the best model, since it has the highest mean cross-validation score, and the decision tree is the worst. The boxplots shown earlier (cell In [13]) hint at why: no single feature shows an obvious difference among all three classes.