<a href="https://colab.research.google.com/github/sathvikrav/LockheedChallengeBoxInPrimeShapeBruteForceSolution/blob/master/KNNandXGBArrhythmia.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [31]:
import numpy as np
import pandas as pd
from sklearn import neighbors
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import scale
from sklearn import metrics
from xgboost import XGBClassifier

In [3]:
# Load the arrhythmia dataset from the working directory.
# No header= argument is passed, so pandas uses the file's first line as the
# column labels.
# NOTE(review): the preview of this frame suggests that first line is itself a
# patient record; header=None would keep it as data, but later cells select
# columns by these labels, so confirm before changing.
heart_data = pd.read_csv(filepath_or_buffer="arrhythmia.data")

In [4]:
# Show the first five rows as a quick sanity check of the load.
print(heart_data.head(n=5))

   75  0  190  80   91  193  371  ...  0.0.39  0.0.40  0.9.3 2.9.1  23.3  49.4   8
0  56  1  165  64   81  174  401  ...     0.0     0.0    0.2   2.1  20.4  38.8   6
1  54  0  172  95  138  163  386  ...     0.0     0.0    0.3   3.4  12.3  49.0  10
2  55  0  175  94  100  202  380  ...     0.0     0.0    0.4   2.6  34.6  61.6   1
3  75  0  190  80   88  181  360  ...     0.0     0.0   -0.1   3.9  25.4  62.8   7
4  13  0  169  51  100  167  321  ...     0.0     0.0    0.9   2.2  13.5  31.1  14

[5 rows x 280 columns]


In [5]:
# Wrap the loaded table in a DataFrame named `data`, the name the following
# cells work with, and print it (pandas abbreviates the printout of large frames).
data = pd.DataFrame(heart_data)
print(data)

     75  0  190  80   91  193  ...  0.0.40  0.9.3  2.9.1  23.3  49.4   8
0    56  1  165  64   81  174  ...     0.0    0.2    2.1  20.4  38.8   6
1    54  0  172  95  138  163  ...     0.0    0.3    3.4  12.3  49.0  10
2    55  0  175  94  100  202  ...     0.0    0.4    2.6  34.6  61.6   1
3    75  0  190  80   88  181  ...     0.0   -0.1    3.9  25.4  62.8   7
4    13  0  169  51  100  167  ...     0.0    0.9    2.2  13.5  31.1  14
..   .. ..  ...  ..  ...  ...  ...     ...    ...    ...   ...   ...  ..
446  53  1  160  70   80  199  ...     0.0    0.7    0.6  -4.4  -0.5   1
447  37  0  190  85  100  137  ...     0.0    0.4    2.4  38.0  62.4  10
448  36  0  166  68  108  176  ...     0.0    1.5    1.0 -44.2 -33.2   2
449  32  1  155  55   93  106  ...     0.0    0.5    2.4  25.0  46.6   1
450  78  1  160  70   79  127  ...     0.0    0.5    1.6  21.3  32.8   1

[451 rows x 280 columns]


In [6]:
# Build the feature matrix X and the label frame y.
#
# The column labels of `data` are the values of the first record in the file,
# so the five features are selected by those labels and renamed to what they
# actually measure.
feature_names = {'75': 'Age', '0': 'Sex', '190': 'Height', '80': 'Weight', '63': 'Heart rate'}

# The record that became the column labels is missing from the rows; add it
# back as the last row. pd.concat is used because DataFrame.append was
# removed in pandas 2.0, and .rename returns a new frame instead of mutating
# a slice of `data`.
first_record_features = pd.DataFrame(
    {'Age': [75], 'Sex': [0], 'Height': [190], 'Weight': [80], 'Heart rate': [63]}
)
X = pd.concat(
    [data[list(feature_names)].rename(columns=feature_names), first_record_features],
    ignore_index=True,
)

# Similarly, since our dataset has 16 different classifications for Arrhythmia,
# the last column (index 279) is renamed to 'Arrhythmia Class' and the class
# of the first record (8) is added back as the last row, matching X.
label_column = data.columns[279]
first_record_class = pd.DataFrame({'Arrhythmia Class': [8]})
y = pd.concat(
    [data[[label_column]].rename(columns={label_column: 'Arrhythmia Class'}), first_record_class],
    ignore_index=True,
)

In [7]:
# Data preprocessing: handle values that were not entered.
#
# Missing entries appear in X as the string '?'. Replace them with NaN,
# convert every feature to a float, then forward-fill each gap with the last
# valid value above it in the same column. (This is a forward fill, not an
# interpolation: a missing value simply copies the previous row's value.)
new_X = X.replace('?', np.nan).astype(float).ffill()
print(new_X.head())

    Age  Sex  Height  Weight  Heart rate
0  56.0  1.0   165.0    64.0        53.0
1  54.0  0.0   172.0    95.0        75.0
2  55.0  0.0   175.0    94.0        71.0
3  75.0  0.0   190.0    80.0        71.0
4  13.0  0.0   169.0    51.0        84.0


In [8]:
# Count the '?' placeholders in the label column. The counter has its own
# name instead of `sum`, which would shadow Python's built-in sum() for every
# cell that runs afterwards.
missing_label_count = sum(1 for value in y.to_numpy().ravel() if value == '?')
print(missing_label_count) # Now we know that there are no values to fill for the Arrhythmia Class column



0


In [46]:
# Fit two classifiers on the same 80/20 train/test split and print the test
# accuracy of each: k-nearest neighbours (k=25) first, then XGBoost.
X_train, X_test, y_train, y_test = train_test_split(new_X, y, test_size=0.2, random_state=0)

# The labels live in single-column frames; flatten them to 1-D arrays for fitting.
knn = KNeighborsClassifier(n_neighbors=25)
knn.fit(X_train, y_train.to_numpy().ravel())
y_pred = knn.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

# NOTE(review): the test split doubles as the early-stopping evaluation set,
# so the XGBoost accuracy printed below is measured on data that also
# influenced when training stopped; a separate validation split would avoid that.
xgb = XGBClassifier(n_estimators=500, random_state=0, n_jobs=4)
xgb.fit(
    X_train,
    y_train.to_numpy().ravel(),
    eval_set=[(X_test, y_test.to_numpy().ravel())],
    early_stopping_rounds=5,
    verbose=False,
)
y_pred = xgb.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))


Accuracy: 0.6043956043956044
Accuracy: 0.6703296703296703
