In [1]:
#import dependencies
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd

In [2]:
#read csv and create initial DF
df = pd.read_csv('Resources/clean_data/combineddata.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,us_state,county,days_with_aqi2019,good_days2019,moderate_days2019,unhealthy_sensitive_days2019,unhealthy_days2019,very_unhealthy_days2019,hazardous_days2019,...,average_hazardous_days,average_co_days,average_no2_days,average_ozone_days,average_so2_days,average_pm2_days,average_pm10_days,average_poverty_percentage,percent_pop_per_100k,cancer_classification
0,0,AL,Baldwin,271,237,34,0,0,0,0,...,0.0,0.0,0.0,215.25,0.0,57.25,0.0,10.425,20.286986,high
1,1,AL,Clay,107,97,10,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,112.75,0.0,17.8,25.963561,high
2,2,AL,Colbert,263,252,11,0,0,0,0,...,0.0,0.0,0.0,218.5,0.0,57.75,0.0,15.125,19.761467,medium
3,3,AL,DeKalb,361,324,37,0,0,0,0,...,0.0,0.0,0.0,321.0,0.0,33.5,0.0,19.875,15.357483,medium
4,4,AL,Elmore,228,208,20,0,0,0,0,...,0.0,0.0,0.0,198.25,0.0,0.0,0.0,12.0275,16.158522,medium


In [3]:
#keep average air quality, poverty, and classification columns
new_df = df[['average_good_days', 'average_moderate_days', 'average_unhealthy_sensitive_days', 'average_unhealthy_days',
            'average_very_unhealthy_days', 'average_hazardous_days', 'average_co_days', 'average_no2_days', 'average_ozone_days',
            'average_so2_days', 'average_pm2_days', 'average_pm10_days', 'average_poverty_percentage', 'cancer_classification']]


new_df

Unnamed: 0,average_good_days,average_moderate_days,average_unhealthy_sensitive_days,average_unhealthy_days,average_very_unhealthy_days,average_hazardous_days,average_co_days,average_no2_days,average_ozone_days,average_so2_days,average_pm2_days,average_pm10_days,average_poverty_percentage,cancer_classification
0,242.50,29.75,0.25,0.00,0.00,0.00,0.0,0.0,215.25,0.0,57.25,0.00,10.4250,high
1,103.25,9.50,0.00,0.00,0.00,0.00,0.0,0.0,0.00,0.0,112.75,0.00,17.8000,high
2,256.50,19.50,0.25,0.00,0.00,0.00,0.0,0.0,218.50,0.0,57.75,0.00,15.1250,medium
3,318.25,36.00,0.25,0.00,0.00,0.00,0.0,0.0,321.00,0.0,33.50,0.00,19.8750,medium
4,184.75,13.50,0.00,0.00,0.00,0.00,0.0,0.0,198.25,0.0,0.00,0.00,12.0275,medium
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,256.00,104.25,4.25,0.75,0.00,0.00,0.0,0.0,350.25,0.0,14.50,0.50,6.8250,no data
996,257.75,102.75,3.75,0.75,0.25,0.00,0.0,2.0,312.50,4.0,5.25,41.50,9.1275,low
997,310.50,54.25,0.50,0.00,0.00,0.00,0.0,0.0,347.75,0.0,16.75,0.75,6.3525,low
998,313.25,51.00,0.75,0.00,0.00,0.25,0.0,1.5,355.25,0.0,0.00,8.50,9.8275,low


In [8]:
#drop rows with null or NaN values
new_df.drop(index=new_df[new_df['cancer_classification'] == 'no data'].index, inplace=True)
new_df = new_df.dropna()


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [None]:
#set X and y
X = clean_df.drop('cancer_classification', axis=1)
y = clean_df['cancer_classification']

print(X.shape, y.shape)

In [None]:
#train, test, split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
#import Standard Scaler
from sklearn.preprocessing import StandardScaler

X_scaler = StandardScaler().fit(X_train)


#tranform scaled data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
#Encode Labels 
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
label_encoder.fit(y_train)

encoded_y_train = label_encoder.transform(y_train)

encoded_y_test = label_encoder.transform(y_test)

In [None]:
#One-hot encoding
from keras.utils import to_categorical

y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)
y_train_categorical

In [None]:
## Create a KNN model and fit it to the scaled training data
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Loop through different k values to see which has the highest accuracy
train_scores = []
test_scores = []
for k in range(1, 20, 2):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train_categorical)
    train_score = knn.score(X_train_scaled, y_train_categorical)
    test_score = knn.score(X_test_scaled, y_test_categorical)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

In [None]:
#plot KNN train and test data
plt.plot(range(1, 20, 2), train_scores, marker='o')
plt.plot(range(1, 20, 2), test_scores, marker="x")
plt.xlabel("k neighbors")
plt.ylabel("Testing accuracy Score")
plt.show()

In [None]:
#Select best K value to fit and score data - visually K=7 appears to be at elbow
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train_scaled, y_train_categorical)

#print train and test scores
print('k=7 Train Acc: %.3f' % knn.score(X_train_scaled, y_train_categorical))
print('k=7 Test Acc: %.3f' % knn.score(X_test_scaled, y_test_categorical))