In [3]:
## install the libraries needed
!pip install -U numpy pandas scikit-learn seaborn

Collecting seaborn
  Using cached seaborn-0.12.2-py3-none-any.whl (293 kB)
Collecting matplotlib!=3.6.1,>=3.1 (from seaborn)
  Using cached matplotlib-3.7.1-cp310-cp310-win_amd64.whl (7.6 MB)
Collecting contourpy>=1.0.1 (from matplotlib!=3.6.1,>=3.1->seaborn)
  Using cached contourpy-1.0.7-cp310-cp310-win_amd64.whl (162 kB)
Collecting cycler>=0.10 (from matplotlib!=3.6.1,>=3.1->seaborn)
  Using cached cycler-0.11.0-py3-none-any.whl (6.4 kB)
Collecting fonttools>=4.22.0 (from matplotlib!=3.6.1,>=3.1->seaborn)
  Using cached fonttools-4.39.4-py3-none-any.whl (1.0 MB)
Collecting kiwisolver>=1.0.1 (from matplotlib!=3.6.1,>=3.1->seaborn)
  Using cached kiwisolver-1.4.4-cp310-cp310-win_amd64.whl (55 kB)
Collecting pillow>=6.2.0 (from matplotlib!=3.6.1,>=3.1->seaborn)
  Using cached Pillow-9.5.0-cp310-cp310-win_amd64.whl (2.5 MB)
Collecting pyparsing>=2.3.1 (from matplotlib!=3.6.1,>=3.1->seaborn)
  Using cached pyparsing-3.0.9-py3-none-any.whl (98 kB)
Installing collected packages: pyparsing,

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split

In [5]:
def process_data(sms_data_str):
    """
    Parse the raw SMS corpus text into a message array and a label array.

    Every newline-separated line of `sms_data_str` is one record:

    * a line starting with 'ham' is labelled 'legitimate' and its message is
      everything after the first four characters;
    * a line starting with 'spa' is labelled 'spam' and its message is
      everything after the first five characters;
    * any other line is kept with label 'N/A' and a `None` message.

    Parameters
    ----------
    sms_data_str : str
        The whole corpus as one string.

    Returns
    -------
    tuple of np.ndarray
        `(data_records, data_label)`: the messages and their labels, in input
        order. Both arrays are empty when the input holds no records.
    """
    lines = sms_data_str.split('\n')
    # Drop only the empty piece produced by a trailing newline, so the final
    # record is kept when the text does not end with a newline.
    if lines and lines[-1] == '':
        lines.pop()

    rows = []
    for line in lines:
        if line.startswith('ham'):
            rows.append(['legitimate', line[4:]])
        elif line.startswith('spa'):
            rows.append(['spam', line[5:]])
        else:
            rows.append(['N/A', None])

    # A 2-D slice below needs at least one row; handle "no records" explicitly.
    if not rows:
        empty = np.array([], dtype=str)
        return empty, empty.copy()

    data_arr = np.array(rows)
    data_label = data_arr[:, 0]
    data_records = data_arr[:, 1]

    return data_records, data_label

def tfidf_vectorizer(records):
    """
    Fit a TF-IDF model on `records` and return the dense document-term matrix.

    Text is lower-cased, tokens are runs of ASCII letters, and rows are left
    un-normalised (`norm=None`).

    Returns a tuple `(matrix, feature_names)`: the dense array obtained with
    `.toarray()` and the vocabulary from `get_feature_names_out()`.
    """
    tfidf_options = {
        'lowercase': True,
        'token_pattern': r'\b[A-Za-z]+\b',
        'norm': None,
    }
    tfidf_model = TfidfVectorizer(**tfidf_options)

    document_term_matrix = tfidf_model.fit_transform(records)
    dense_matrix = document_term_matrix.toarray()
    vocabulary = tfidf_model.get_feature_names_out()

    return dense_matrix, vocabulary

def feature_extraction(X, n_components=5, random_state=42):
    """
    Project `X` onto its first `n_components` principal components.

    Parameters
    ----------
    X : array-like
        Feature matrix passed to `PCA.fit_transform`.
    n_components : int, default 5
        Number of principal components to keep.
    random_state : int or None, default 42
        Seed forwarded to `PCA`. scikit-learn may pick a randomized SVD solver
        for large inputs; without a seed the result could then differ from
        run to run. Pass `None` for an unseeded fit.

    Returns
    -------
    The transformed data returned by `PCA.fit_transform`.
    """
    reduction_pca = PCA(
        n_components=n_components,
        whiten=False,
        random_state=random_state,
    )
    return reduction_pca.fit_transform(X)

def feature_selection(df_records, labels, n_components=5, random_state=42):
    """
    Keep the `n_components` features with the highest mutual information.

    Parameters
    ----------
    df_records : feature matrix
        Passed to `SelectKBest.fit_transform`.
    labels : array-like
        Class label of every row in `df_records`.
    n_components : int, default 5
        Number of features to keep (the `k` of `SelectKBest`).
    random_state : int or None, default 42
        Seed for `mutual_info_classif`, which adds small random noise to
        continuous features; seeding keeps the scores, and therefore the
        selected columns, stable across runs. Pass `None` for unseeded scoring.

    Returns
    -------
    tuple
        `(selected_features, selected_feature_names)` as returned by
        `fit_transform` and `get_feature_names_out`.
    """
    from functools import partial

    # SelectKBest calls score_func(X, y), so the seed is bound up front.
    seeded_score_func = partial(mutual_info_classif, random_state=random_state)
    feature_selection_model = SelectKBest(seeded_score_func, k=n_components)
    ## make a selection over the best features
    selected_record_features = feature_selection_model.fit_transform(df_records, labels)

    return selected_record_features, feature_selection_model.get_feature_names_out()

In [6]:
## read the whole corpus; the encoding is explicit so decoding does not depend
## on the platform default (assumes the file is UTF-8 -- TODO confirm)
with open('SMSSpamCollection', encoding='utf-8') as file:
    sms_data_str = file.read()

In [7]:
## parse the raw text, then turn every message into a TF-IDF row
records, labels = process_data(sms_data_str)
records_vectorized, feature_names = tfidf_vectorizer(records)

## binary label encoding: 'legitimate' -> 0, every other label -> 1
labels = np.array([int(y != 'legitimate') for y in labels])

## reducing dimension
records_dim_reduced = feature_extraction(records_vectorized)

In [34]:
## summary statistics of the PCA components
## (a cell only displays its final expression, so this is the single output)
pd.DataFrame(records_dim_reduced).describe()

Unnamed: 0,0,1,2,3,4
count,5574.0,5574.0,5574.0,5574.0,5574.0
mean,-1.909058e-14,8.877323e-15,-1.901919e-15,-1.169515e-14,4.984252e-15
std,3.017912,2.312965,2.243848,2.136352,1.954386
min,-3.061859,-14.67491,-37.58494,-12.5982,-36.79861
25%,-1.756941,-0.6765125,-0.5275579,-0.4525954,-0.6640831
50%,-0.7251485,-0.1996032,0.08828523,0.4829967,0.112989
75%,0.7864874,0.1717623,0.9082959,1.128859,0.658049
max,63.28925,96.1106,42.39448,19.00297,47.44004


In [38]:
## attach the vocabulary as column names so the selected terms can be reported
records_vectorized = pd.DataFrame(data=records_vectorized, columns=feature_names)

## keep the best-scoring term columns together with their names
records_selection, feature_name_selection = feature_selection(
    df_records=records_vectorized,
    labels=labels,
)

In [39]:
## summary statistics of the selected features, labelled with their term names
pd.DataFrame(
    data=records_selection,
    columns=feature_name_selection,
).describe()

Unnamed: 0,call,claim,free,to,txt
count,5574.0,5574.0,5574.0,5574.0,5574.0
mean,0.352406,0.10004,0.213381,0.887113,0.143342
std,1.102998,0.72625,1.12693,1.586578,0.84952
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,2.194748,0.0
max,9.937499,9.869399,12.563905,17.557983,13.542179


In [11]:
## first five encoded labels
pd.DataFrame(data=labels).head(n=5)


Unnamed: 0,0
0,0
1,0
2,1
3,0
4,0


In [23]:
## stratified hold-out split; the fixed seed keeps the split identical across kernel restarts
X_train, X_test, y_train, y_test = train_test_split(
    records_selection, labels, test_size=0.33, stratify=labels, random_state=42)


In [24]:
## TODO: build a fuzzy rule-based model for (records, label)
## peek at the training feature matrix produced by the split
X_train

array([[0.        , 0.        , 0.        , 2.19474792, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 1.99219404, 0.        , 0.        ],
       ...,
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 3.98438807, 0.        , 0.        ]])

In [25]:
## number of training labels produced by the split
y_train.shape

(3734,)

In [87]:
## Hand-built balanced subset: for each class (0 first, then 1), the first
## N_TRAIN_PER_CLASS samples in dataset order go to training and the next
## N_TEST_PER_CLASS samples go to testing, so the two sets never overlap.
N_TRAIN_PER_CLASS = 500
N_TEST_PER_CLASS = 200

train_idx_parts = []
test_idx_parts = []
for class_label in (0, 1):
    ## positions of this class, in dataset order (no hard-coded dataset length)
    class_idx = np.flatnonzero(labels == class_label)
    train_idx_parts.append(class_idx[:N_TRAIN_PER_CLASS])
    test_idx_parts.append(
        class_idx[N_TRAIN_PER_CLASS:N_TRAIN_PER_CLASS + N_TEST_PER_CLASS])

train_idx = np.concatenate(train_idx_parts)
test_idx = np.concatenate(test_idx_parts)

## the results are NumPy arrays already; a later np.array(...) call on them is harmless
X_train_true = records_selection[train_idx]
y_train_true = labels[train_idx]
X_test_true = records_selection[test_idx]
y_test_true = labels[test_idx]


In [82]:
## convert the four hand-built subsets to NumPy arrays
X_train_true, y_train_true = np.array(X_train_true), np.array(y_train_true)
X_test_true, y_test_true = np.array(X_test_true), np.array(y_test_true)

In [26]:
## persist the hand-built subsets (the *_true variables)
## NOTE(review): the following cell writes X_train/y_train/X_test/y_test to
## these same four file names, so whichever cell runs last decides what is on disk
for npy_path, subset in (
    ('X_train.npy', X_train_true),
    ('y_train.npy', y_train_true),
    ('X_test.npy', X_test_true),
    ('y_test.npy', y_test_true),
):
    np.save(npy_path, subset)

NameError: name 'X_train_true' is not defined

In [27]:
## persist the stratified train/test split
## NOTE(review): the preceding cell targets these same four file names with
## the *_true subsets, so whichever cell runs last decides what is on disk
for npy_path, split_part in (
    ('X_train.npy', X_train),
    ('y_train.npy', y_train),
    ('X_test.npy', X_test),
    ('y_test.npy', y_test),
):
    np.save(npy_path, split_part)

In [28]:
# reload the saved test labels and print the largest label value
x2 = np.load('y_test.npy')
print(np.max(x2))

1


In [29]:
## shape of the hand-built test subset; the cells that define X_test_true must
## have been run in this kernel first (the recorded NameError shows they had not)
X_test_true.shape

NameError: name 'X_test_true' is not defined