# K-Nearest Neighbors (K-NN)

### 參考課程實作並在datasets_483_982_spam.csv的資料集中獲得90% 以上的 accuracy (testset)

## Importing the libraries

In [170]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import glob
import codecs
import re

## Importing the dataset

In [171]:
dataset = pd.read_csv(r'datasets_483_982_spam.csv', encoding = 'latin-1')
dataset = dataset.loc[:, ~dataset.columns.str.match('Unnamed')]
dataset = dataset.rename(columns={'v1':'label','v2':'content'})
dataset['label'] = dataset['label'].map({'ham':0,'spam':1})
dataset

Unnamed: 0,label,content
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


### 取出訓練內文與標註

In [172]:
#方式一
X = dataset['content'].to_numpy()
Y = dataset['label'].to_numpy()

# #方式二
# all_data = dataset.to_numpy()
# X = all_data[:,1]
# Y = all_data[:,0]


In [173]:
print('Training Data Examples : \n{}'.format(X[:5]))

Training Data Examples : 
['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'
 'Ok lar... Joking wif u oni...'
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"
 'U dun say so early hor... U c already then say...'
 "Nah I don't think he goes to usf, he lives around here though"]


In [174]:
print('Labeling Data Examples : \n{}'.format(Y[:5]))

Labeling Data Examples : 
[0 0 1 0 0]


### 文字預處理

In [175]:
from sklearn.metrics import confusion_matrix
from nltk.corpus import stopwords

import nltk

#nltk.download('stopwords')

# Lemmatize with POS Tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer 

"""可以參考課程練習方式清理文字，或是使用自己的方式"""
def clean_content(X):
    X_clean = [re.sub('[^a-zA-z\s\']','',x).lower() for x in X]
    X_word_tokenize = [nltk.word_tokenize(word) for word in X_clean]
    stopword = set(stopwords.words('english'))

    contents = []
    for words in X_word_tokenize:
#         content = [''.join(word) for word in words if not word in stopword]
        for word in words:
            if word in stopword:
                words.remove(word)
        content = ' '.join(words)
#                 content = ''.join(word)
        contents.append(content)
    return contents

In [176]:
X = clean_content(X)
X[:3]

['go jurong point crazy available in bugis n great world la e buffet cine got amore wat',
 'ok lar joking wif u oni',
 "free entry a wkly comp win fa cup final tkts st may text fa to receive entry questionstd txt ratetc 's apply 's"]

### Bag of words

In [177]:
from sklearn.feature_extraction.text import CountVectorizer
#max_features是要建造幾個column，會按造字出現的高低去篩選 
cv=CountVectorizer(max_features = 2000)
X=cv.fit_transform(X).toarray()

In [178]:
X.shape

(5572, 2000)

## Splitting the dataset into the Training set and Test set

In [179]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)
print(y_test)

[0 0 0 ... 0 0 0]


## Training the K-NN model on the Training set

In [180]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=5, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)

KNeighborsClassifier()

## Predicting a new result

In [181]:
print('Trainset Accuracy: {}'.format(classifier.score(X_train, y_train)))

Trainset Accuracy: 0.9367287413058111


In [182]:
print('Testset Accuracy: {}'.format(classifier.score(X_test, y_test)))

Testset Accuracy: 0.9147982062780269


## Predicting the Test set results

In [183]:
y_pred = classifier.predict(X_test)
print(y_pred)

[0 0 0 ... 0 0 0]


## Making the Confusion Matrix

In [184]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[949   0]
 [ 95  71]]


0.9147982062780269