In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
# NOTE: `dirname` and `filename` stay bound to the last file visited once the loop
# ends; later cells should not rely on that leftover state.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. 讀取 spam.csv

* 定義 `readData_rawSMS` function
    * header
        * `0`: 第一列(橫)為欄位名稱
            > 即 v1, v2
        * `1`: 第二列(橫)為欄位名稱
            > 即 ham, Go until jurong point, crazy.. Available only ...
        * `None`: 資料沒有欄位名稱時使用（spam.csv 的第一列已是欄位名稱 `v1, v2`，因此本範例使用 `0`）
    * usecols
        * `[0,1]`: 僅使用第一行(直)和第二行(直)的資料，其他行(直)略過不讀取且不使用。
    * data_rawSMS.columns = ['label', 'content']
        > 重新命名欄位名稱：由 `v1, v2` 改為 `label, content`

In [None]:
def readData_rawSMS(filepath):
    """Load the raw SMS corpus from a CSV file.

    The first row of the file is consumed as the header, only the first two
    columns are read, and the file is decoded as latin-1.

    Parameters
    ----------
    filepath : path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame with exactly two columns, renamed to ``label`` and
    ``content``.
    """
    read_options = {'header': 0, 'usecols': [0, 1], 'encoding': 'latin-1'}
    sms_frame = pd.read_csv(filepath, **read_options)
    # Replace the file's own header names with the names used by the rest of the notebook.
    sms_frame.columns = ['label', 'content']
    return sms_frame

# Locate spam.csv explicitly instead of reusing `dirname`/`filename`: those names
# are leftovers of the file-listing loop and merely point at whichever file
# happened to be walked last.
spam_csv_paths = sorted(
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
    if name.lower() == 'spam.csv'
)
if not spam_csv_paths:
    raise FileNotFoundError('spam.csv was not found under /kaggle/input')

data_rawSMS = readData_rawSMS(spam_csv_paths[0])
data_rawSMS.head()

In [None]:
# Summary statistics of the loaded SMS table (both columns).
data_rawSMS.describe()

In [None]:
# Number of messages per label, i.e. the ham/spam balance of the data set.
# (This cell used to repeat the previous `describe()` call verbatim.)
data_rawSMS['label'].value_counts()

# 2. 資料集拆成 Train 和 Test
## **(偷懶版) 直接使用 random 切割**
```
* 首先，讓每筆資料隨機產生介於 0 ~ 1 的數字。
* 接著，數字 >= 0.5 當作 Training Data；其他 (數字 < 0.5 ) 則為 Testing Data。
    > 此方法將導致 Training Data 和 Testing Data 的比例約為 1:1 (隨機抽樣，不會剛好各半)
```
* 如何知道 dataframe 的大小？使用 `shape`
    * `data_rawSMS.shape` 顯示 (rows,columns) -> (5572,2)
* `np.random.rand(n)`
    * Random values in a given shape.
        > Ex: [0.60025928 0.18572491 0.90311005 ... 0.57453736 0.37580751 0.57922529]
        > https://docs.scipy.org/doc/numpy-1.14.0/reference/generated/numpy.random.rand.html
* `np.where(tmp_train == True)`
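    * 只傳入 `condition` 時，`np.where` 回傳「條件為 True 的位置」所組成的 tuple，因此程式中以 `[0]` 取出 index 陣列。 (回傳符合條件的 index；"Return elements chosen from x or y depending on condition" 描述的是同時傳入 `x`, `y` 的三參數用法)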
        > https://numpy.org/doc/stable/reference/generated/numpy.where.html
* `data_rawSMS.iloc[index]`
    * Purely integer-location based indexing for selection by position. (根據 index 取 rows)
        > https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.iloc.html

## **(正規版) Cross-validation**
* 考慮「Spam」和「Ham」的分佈

In [None]:
### 偷懶版 ###

def Separate_TrainAndTest(data_rawSMS, train_ratio=0.5, seed=None):
    """Randomly split a DataFrame into a training part and a testing part.

    Every row independently draws a uniform number in [0, 1); rows whose draw
    is ``>= 1 - train_ratio`` go to the training set, the others to the
    testing set. The split sizes are therefore only approximately
    ``train_ratio : 1 - train_ratio``.

    Parameters
    ----------
    data_rawSMS : pandas.DataFrame to split (rows are selected by position).
    train_ratio : expected fraction of rows assigned to the training set.
        The default 0.5 reproduces the original ``draw >= 0.5`` rule.
    seed : optional seed for a private random generator. With ``None``
        (default) NumPy's global generator is used, exactly as before, so
        ``np.random.seed(...)`` in the caller still controls the split.

    Returns
    -------
    (train_frame, test_frame) : two DataFrames that together contain every
    row of the input exactly once, in the original row order.

    Raises
    ------
    ValueError : if ``train_ratio`` is outside [0, 1].
    """
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError('train_ratio must be between 0 and 1, got {!r}'.format(train_ratio))

    n = data_rawSMS.shape[0]
    if seed is None:
        draws = np.random.rand(n)
    else:
        # A private generator keeps the global NumPy random state untouched.
        draws = np.random.RandomState(seed).rand(n)

    is_train = draws >= (1.0 - train_ratio)
    train_positions = np.where(is_train)[0]
    test_positions = np.where(~is_train)[0]
    return data_rawSMS.iloc[train_positions], data_rawSMS.iloc[test_positions]

# Seed NumPy's global random generator so the random split, and everything
# computed from it afterwards, is identical on every "Restart & Run All".
np.random.seed(42)

data_rawTrain, data_rawTest = Separate_TrainAndTest(data_rawSMS)
print(data_rawTrain)
# Number of ham messages that ended up in the training split.
print(data_rawTrain[data_rawTrain['label']=='ham'].shape[0])
print(data_rawTest)

# 3. 手工取特徵 (特徵將作為後續分類使用)
* 從 Training Data 計算哪些「詞」重要。
* generate_key_list function
    * size_table
        * 要選多少個重要的「詞」出來，等於決定特徵向量的維度。
        * Default: 200 words
    * word_len_ignored
        * 英文單字的字母數少於此值時，該單字不列入計算。
        * `I`: 1 個字母
        * `no`: 2 個字母
        * Default: 3 (少於 3 個字母的單字會被忽略)

In [None]:
import re
from collections import defaultdict
# 垃圾訊息(spam)
# 有效訊息(ham)

def generate_key_list(data_rawTrain, size_table=200, word_len_ignored=3, verbose=True):
    """Pick the words that best indicate spam and map each to a feature index.

    Every message is tokenised with the regex ``[A-Za-z]+``; tokens shorter
    than ``word_len_ignored`` letters are skipped and the rest are lower-cased.
    For each remaining word the function computes

    * ``ham``  = occurrences in non-spam messages / number of 'ham' messages
    * ``spam`` = occurrences in 'spam' messages   / number of 'spam' messages
    * ``IDF``  = log10(vocabulary size / total occurrences of the word)
    * ``diff`` = spam * IDF - ham * IDF

    and keeps the ``size_table`` words with the largest ``diff``.

    NOTE(review): the "IDF" term above is this notebook's own definition
    (vocabulary size over raw occurrence count), not the textbook
    log(#documents / #documents containing the word). It is kept unchanged so
    the selected keywords stay the same; confirm whether that is intended.

    Parameters
    ----------
    data_rawTrain : DataFrame with string columns ``label`` and ``content``.
        Rows whose label is not 'spam' are counted as ham occurrences.
    size_table : maximum number of keywords to return.
    word_len_ignored : words with fewer letters than this are ignored.
    verbose : when True (default) print the intermediate tables and formulas,
        as the original implementation did.

    Returns
    -------
    dict mapping keyword -> rank (0 = strongest spam indicator). Empty when
    the training data contains no usable word.
    """
    token_pattern = re.compile('[A-Za-z]+')

    # word -> [occurrences in ham messages, occurrences in spam messages].
    # A single insertion-ordered dict keeps the counters aligned by
    # construction instead of zipping several dicts that must share an order.
    occurrences = {}
    for label, content in zip(data_rawTrain['label'], data_rawTrain['content']):
        column = 1 if label == 'spam' else 0
        for token in token_pattern.findall(content):
            if len(token) < word_len_ignored:
                continue
            occurrences.setdefault(token.lower(), [0, 0])[column] += 1

    # Nothing to rank (empty frame or only too-short words): previously this
    # crashed while naming the columns of an empty table.
    if not occurrences:
        return {}

    word_df = pd.DataFrame(
        [(word, ham, spam, ham + spam) for word, (ham, spam) in occurrences.items()],
        columns=['keyword', 'ham', 'spam', 'IDF'],
    )
    if verbose:
        print(word_df)

    n_ham_docs = int((data_rawTrain['label'] == 'ham').sum())
    n_spam_docs = int((data_rawTrain['label'] == 'spam').sum())

    if verbose:
        print('### Training Data ###')
        print('TF(word) =', word_df['ham'].iloc[0], '/', n_ham_docs, '= one_word_frequency / Label_ham_DocumentCounts')
        print('TF(word) =', word_df['spam'].iloc[0], '/', n_spam_docs, '= one_word_frequency / Label_spam_DocumentCounts')

    # A class without any document contributes a term frequency of 0 rather
    # than 0/0 = NaN, which used to turn every score into NaN.
    word_df['ham'] = word_df['ham'].astype('float') / n_ham_docs if n_ham_docs else 0.0
    word_df['spam'] = word_df['spam'].astype('float') / n_spam_docs if n_spam_docs else 0.0

    TrainingData_word_count = word_df.shape[0]
    if verbose:
        print('\nTrainingData_word_count:', TrainingData_word_count)
        print('IDF(word) = log(', TrainingData_word_count, '/', word_df['IDF'].iloc[0], ') = log( TrainingData_total_WordCount / one_word_count(ham+spam) )')
    word_df['IDF'] = np.log10(TrainingData_word_count / word_df['IDF'].astype('float'))

    if verbose:
        print('\n### score = TF * IDF ###')
    word_df['ham_score'] = word_df['ham'] * word_df['IDF']
    word_df['spam_score'] = word_df['spam'] * word_df['IDF']
    # The larger spam - ham is, the more characteristic the word is of spam.
    word_df['diff'] = word_df['spam_score'] - word_df['ham_score']
    if verbose:
        print(word_df)

    # A stable sort makes the choice among equally scored words deterministic.
    selected_spam_key = word_df.sort_values('diff', ascending=False, kind='mergesort')

    keyword_dict = {
        word: position
        for position, word in enumerate(selected_spam_key['keyword'].head(size_table))
    }
    if verbose:
        print(keyword_dict, len(keyword_dict))
    return keyword_dict

# Build the spam keyword table from the training split only.
# size_table: number of keywords to keep; this is also the length of every feature vector
# word_len_ignored: words with fewer letters than this value are skipped
keyword_dict = generate_key_list(data_rawTrain, size_table=300, word_len_ignored=3)

# 4. Train 和 Test 轉為特徵向量

In [None]:
def convert_Content(content, keyword_dict):
    """Encode one message as a binary keyword-presence vector.

    The message is tokenised with the regex ``[A-Za-z]+`` and lower-cased;
    for every token found in ``keyword_dict`` the element at the token's
    index is set to 1. Tokens that are not keywords are ignored.

    Parameters
    ----------
    content : message text (str).
    keyword_dict : dict mapping lower-case keyword -> index in
        ``range(len(keyword_dict))``.

    Returns
    -------
    1-D integer numpy array of length ``len(keyword_dict)`` holding 0/1.

    Raises
    ------
    IndexError : if ``keyword_dict`` holds an index outside the vector. The
        previous bare ``except`` silently dropped such features.
    """
    features = np.zeros(len(keyword_dict), dtype=np.int_)
    for token in re.findall('[A-Za-z]+', content):
        # Explicit lookup instead of try/except: only "word is not a keyword"
        # is expected here, every other error should surface.
        position = keyword_dict.get(token.lower())
        if position is not None:
            features[position] = 1
    return features

def raw2feature(data_rawTrain, data_rawTest, keyword_dict):
    """Turn the raw train/test frames into feature matrices and label vectors.

    Each message becomes one row produced by ``convert_Content``; each label
    becomes 1 for 'spam' and 0 otherwise. The sizes and the training labels
    are printed along the way.

    Returns
    -------
    ([X_train, Y_train], [X_test, Y_test])
    """
    feature_dim = len(keyword_dict)

    def _encode(frame):
        # One row per message, one column per keyword.
        matrix = np.zeros((frame.shape[0], feature_dim))
        for row, text in enumerate(frame['content']):
            matrix[row, :] = convert_Content(text, keyword_dict)
        return matrix

    print(data_rawTrain.shape[0], data_rawTest.shape[0], feature_dim)

    Y_train = np.int_(data_rawTrain.label == 'spam')
    print(Y_train)
    Y_test = np.int_(data_rawTest.label == 'spam')

    X_train = _encode(data_rawTrain)
    X_test = _encode(data_rawTest)

    train_pair = [X_train, Y_train]
    test_pair = [X_test, Y_test]
    return train_pair, test_pair
     
# Encode both splits; each result is a [feature_matrix, label_vector] pair.
Train, Test = raw2feature(data_rawTrain, data_rawTest, keyword_dict)

In [None]:
# 5. Train a classifier on the feature vectors
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# A fixed random_state makes the forest, and therefore the scores printed
# below, reproducible from run to run.
Random_forest = RandomForestClassifier(n_estimators=50, random_state=42)
X_train = Train[0]
y_train = Train[1]
Random_forest.fit(X_train, y_train)


X_test = Test[0]
y_test = Test[1]
randomForest_predict = Random_forest.predict(X_test)
randomForest_score = metrics.accuracy_score(y_test, randomForest_predict)
print("(Testing) Random Forest Score :", randomForest_score)

# Reuse the test-set predictions computed above instead of predicting twice.
Y_hat = randomForest_predict
n = np.size(y_test)
print('Testing Accuarcy: {:.6f}％ ({})'.format(sum(np.int_(Y_hat==y_test))*100./n, Random_forest.__module__))


# Accuracy on the data the model was fitted on, for comparison with the test score.
n = np.size(y_train)
Y_hat_RF = Random_forest.predict(X_train)
print('Training Accuarcy RF: {:.2f}％'.format(sum(np.int_(Y_hat_RF==y_train))*100./n))

In [None]:
def predictSMS(SMS,model,keyword_dict):
    """Classify a single message and print the verdict.

    Parameters
    ----------
    SMS : message text (str).
    model : fitted classifier exposing ``predict`` on a 2-D feature array.
    keyword_dict : keyword -> feature-index table used to encode the message.

    Prints ``SPAM: <message>`` when the predicted class is 1 and
    ``ham: <message>`` otherwise. Returns nothing.
    """
    X = convert_Content(SMS, keyword_dict)
    Y_hat = model.predict(X.reshape(1,-1))
    # predict() returns an array with one element; take that element explicitly,
    # because calling int() on a non-0-d array is deprecated since NumPy 1.25.
    predicted_class = int(np.asarray(Y_hat).ravel()[0])
    if predicted_class == 1:
        print ('SPAM: {}'.format(SMS))
    else:
        print ('ham: {}'.format(SMS))

# Try the trained model on two hand-written messages.
for inputstr in (
    'go to visit www.yahoo.com.tw, Buy one get one free, Hurry!',
    'Call back for anytime.',
):
    predictSMS(inputstr, Random_forest, keyword_dict)