In [1]:
## install the libraries needed
!pip install -U numpy pandas scikit-learn

Collecting numpy
  Using cached numpy-1.24.3-cp310-cp310-win_amd64.whl (14.8 MB)
Collecting pandas
  Using cached pandas-2.0.2-cp310-cp310-win_amd64.whl (10.7 MB)
Collecting scikit-learn
  Using cached scikit_learn-1.2.2-cp310-cp310-win_amd64.whl (8.3 MB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2023.3-py2.py3-none-any.whl (502 kB)
Collecting tzdata>=2022.1 (from pandas)
  Using cached tzdata-2023.3-py2.py3-none-any.whl (341 kB)
Collecting scipy>=1.3.2 (from scikit-learn)
  Using cached scipy-1.10.1-cp310-cp310-win_amd64.whl (42.5 MB)
Collecting joblib>=1.1.1 (from scikit-learn)
  Using cached joblib-1.2.0-py3-none-any.whl (297 kB)
Collecting threadpoolctl>=2.0.0 (from scikit-learn)
  Using cached threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Installing collected packages: pytz, tzdata, threadpoolctl, numpy, joblib, scipy, pandas, scikit-learn
Successfully installed joblib-1.2.0 numpy-1.24.3 pandas-2.0.2 pytz-2023.3 scikit-learn-1.2.2 scipy-1.10.1 threadpoolctl-3.1.0 tz

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split

In [33]:
def process_data(sms_data_str):
    """
    Parse the raw SMS corpus text into parallel arrays of messages and labels.

    Every non-blank line is expected to look like ``<tag><sep><message>``,
    where ``<tag>`` is ``ham`` or ``spam`` and ``<sep>`` is one separator
    character.

    Parameters
    ----------
    sms_data_str : str
        Whole content of the corpus, one record per line.

    Returns
    -------
    data_records : numpy.ndarray
        1-D string array with the message texts.
    data_label : numpy.ndarray
        1-D string array with one label per message: 'legitimate' for
        ``ham`` lines, 'spam' for ``spam`` lines and 'N/A' for any other
        line (whose full text is kept as the message).
    """
    data_label = []
    data_records = []

    # Blank lines are skipped instead of blindly dropping the last element,
    # so the final record survives when the text has no trailing newline.
    for line in sms_data_str.split('\n'):
        if not line.strip():
            continue

        if line.startswith('ham'):
            data_label.append('legitimate')
            data_records.append(line[4:])   # skip 'ham' + separator
        elif line.startswith('spam'):
            data_label.append('spam')
            data_records.append(line[5:])   # skip 'spam' + separator
        else:
            # Unknown tag: keep the raw text so the result stays an array of
            # strings (a None entry would break text vectorization later).
            data_label.append('N/A')
            data_records.append(line)

    # dtype=str keeps empty results as string arrays rather than float64.
    return np.array(data_records, dtype=str), np.array(data_label, dtype=str)

def tfidf_vectorizer(records):
    """
    Fit a TF-IDF model on `records` and return the weights as a dense matrix
    together with the term belonging to each column.
    """
    # tokens are lowercased runs of ASCII letters; norm=None disables row normalization
    tfidf_model = TfidfVectorizer(lowercase=True, token_pattern=r'\b[A-Za-z]+\b', norm=None)

    sparse_weights = tfidf_model.fit_transform(records)
    dense_weights = sparse_weights.toarray()
    column_terms = tfidf_model.get_feature_names_out()

    return dense_weights, column_terms

def feature_extraction(X, n_components=5, random_state=42):
    """
    Project `X` onto its first `n_components` principal components.

    Parameters
    ----------
    X : array-like
        Input matrix passed to ``PCA.fit_transform``.
    n_components : int, default 5
        Number of principal components to keep.
    random_state : int or None, default 42
        Seed forwarded to PCA. The default ``svd_solver='auto'`` may pick the
        randomized solver for large inputs, so a fixed seed keeps the
        projection reproducible between runs. Pass None for an unseeded run.

    Returns
    -------
    numpy.ndarray
        The transformed data returned by ``PCA.fit_transform``.
    """
    reduction_pca = PCA(
        n_components=n_components,
        whiten=False,
        random_state=random_state
    )
    return reduction_pca.fit_transform(X)

def feature_selection(df_records, labels, n_components=5, random_state=42):
    """
    Keep the `n_components` columns of `df_records` with the highest mutual
    information with `labels`.

    Parameters
    ----------
    df_records : pandas.DataFrame
        Feature table; passed to ``SelectKBest.fit_transform``.
    labels : array-like
        Target class of every row.
    n_components : int, default 5
        Number of columns to keep (the ``k`` of ``SelectKBest``).
    random_state : int or None, default 42
        Seed forwarded to ``mutual_info_classif``. Its estimate involves
        random noise, so a fixed seed keeps the scores, and therefore the
        selected columns, stable between runs. Pass None for an unseeded run.

    Returns
    -------
    selected_record_features
        Output of ``SelectKBest.fit_transform`` (the selected columns).
    feature_names
        Output of ``SelectKBest.get_feature_names_out()`` for those columns.
    """
    def _seeded_mutual_info(X, y):
        # SelectKBest calls the score function with (X, y) only, so the seed is bound here.
        return mutual_info_classif(X, y, random_state=random_state)

    feature_selection_model = SelectKBest(_seeded_mutual_info, k=n_components)
    ## make a selection over the best features
    selected_record_features = feature_selection_model.fit_transform(df_records, labels)

    return selected_record_features, feature_selection_model.get_feature_names_out()

In [34]:
## read the whole corpus into memory
## the encoding is explicit so decoding does not depend on the platform default
## (assumes the corpus file is UTF-8 encoded -- TODO confirm)
sms_data_str = None
with open('SMSSpamCollection', encoding='utf-8') as file:
    sms_data_str = file.read()

In [35]:
## parse the corpus and turn the messages into a dense TF-IDF matrix
records, labels = process_data(sms_data_str)
records_vectorized, feature_names = tfidf_vectorizer(records)

## binary label encoding: 0 for 'legitimate', 1 for every other label
labels = np.array([int(tag != 'legitimate') for tag in labels])

## reducing dimension
records_dim_reduced = feature_extraction(records_vectorized)

In [36]:
## peek at the first five rows of the reduced matrix
records_dim_reduced[:5]

array([[-1.85633695,  0.28521425, -1.18507131,  0.82337247,  0.70577113],
       [-2.78398492,  0.52113768, -1.74199668,  0.5025626 , -0.73501698],
       [ 0.48305217, -0.03564981,  2.01203303, -6.5307127 ,  1.00581204],
       [-1.83556689,  1.13917311, -3.93060813, -0.18386525, -1.98469975],
       [ 0.2770177 , -0.77835005,  0.11559472,  1.33628829, -0.75728842]])

In [37]:
## wrap the TF-IDF matrix in a DataFrame so every column is named after its term
records_vectorized = pd.DataFrame(records_vectorized, columns=feature_names)

## keep only the best-scoring term columns (count left at feature_selection's default)
## NOTE(review): the selection is fitted on all rows, before the train/test split
## further down -- confirm that this leakage is acceptable
records_selection, feature_name_selection = feature_selection(records_vectorized,labels=labels)

In [38]:
## for better visualization: summary statistics of the selected feature columns
pd.DataFrame(records_selection, columns=feature_name_selection).describe()

Unnamed: 0,call,claim,free,i,txt
count,5574.0,5574.0,5574.0,5574.0,5574.0
mean,0.352406,0.10004,0.213381,1.072582,0.143342
std,1.102998,0.72625,1.12693,1.797171,0.84952
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,1.992194,0.0
max,9.937499,9.869399,12.563905,23.906328,13.542179


In [45]:
## show the first encoded labels
pd.DataFrame(labels).head()


Unnamed: 0,0
0,0
1,0
2,1
3,0
4,0


In [46]:
## shuffled split holding out 33% of the rows for testing; the seed is fixed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    records_selection, labels, test_size=0.33, random_state=42, shuffle=True)


In [47]:
## TODO: build a fuzzy rule-based model for (records, label)
## display the training feature matrix
X_train

array([[0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 1.99219404, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 4.51405981],
       ...,
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ]])

In [48]:
## number of training labels
y_train.shape

(3734,)

In [87]:
## Build class-balanced train/test subsets, taking rows in dataset order:
## the first N_TRAIN_PER_CLASS rows of each class go to the train set, then the
## first N_TEST_PER_CLASS rows of each class that were not used for training
## go to the test set. Class 0 rows always come before class 1 rows.
N_TRAIN_PER_CLASS = 500
N_TEST_PER_CLASS = 200


def _take_first(label_values, target, limit, used):
    """Return the indices of the first `limit` rows labelled `target` whose `used` flag is 0."""
    picked = []
    for idx, value in enumerate(label_values):
        if len(picked) >= limit:
            break
        if value == target and not used[idx]:
            picked.append(idx)
    return picked


## mask[i] == 1 marks row i as belonging to the train set
## (sized from the data instead of a hard-coded row count)
mask = [0] * len(labels)

train_idx = []
for target in (0, 1):
    class_idx = _take_first(labels, target, N_TRAIN_PER_CLASS, mask)
    for idx in class_idx:
        mask[idx] = 1
    train_idx.extend(class_idx)

## test rows only need to avoid the train rows, so mask is read but not updated here
test_idx = []
for target in (0, 1):
    test_idx.extend(_take_first(labels, target, N_TEST_PER_CLASS, mask))

## sanity check: no row may end up in both subsets
assert not set(train_idx) & set(test_idx)

X_train_true = [records_selection[idx] for idx in train_idx]
y_train_true = [labels[idx] for idx in train_idx]
X_test_true = [records_selection[idx] for idx in test_idx]
y_test_true = [labels[idx] for idx in test_idx]


In [82]:
## turn the four collected python lists into numpy arrays (same names, same order)
X_train_true, y_train_true, X_test_true, y_test_true = (
    np.array(part)
    for part in (X_train_true, y_train_true, X_test_true, y_test_true)
)

In [83]:
## write every subset to its own .npy file in the working directory
for npy_name, split_array in (
    ('X_train.npy', X_train_true),
    ('y_train.npy', y_train_true),
    ('X_test.npy', X_test_true),
    ('y_test.npy', y_test_true),
):
    np.save(npy_name, split_array)

In [84]:
## reload the saved test labels from disk
## NOTE(review): the recorded output of this cell is an empty array, and its
## execution count (84) is lower than that of the cell building the subsets (87);
## presumably the saved files are stale -- re-run the notebook in order and confirm
x2 = np.load('y_test.npy')
# print the array
print(x2)

[]


In [85]:
## shape of the test feature array
X_test_true.shape

(0,)