In [1]:
## install the libraries needed
!pip install -U numpy pandas sklearn

^C


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif

In [3]:
def process_data(sms_data_str):
    """
    convert `sms_data_str` into a pandas dataframe
    """
    data_arr = []

    data_records = sms_data_str.split('\n')[:-1]
    for data in data_records:
        label = None
        sample = None
        match data[:3]:
            case 'ham':
                label = 'legitimate'
                sample = data[4:] 
            case 'spa':
                label = 'spam'
                sample = data[5:] 
            case _:
                label = 'N/A'
            
        data_arr.append([label, sample])
        
    data_arr = np.array(data_arr)
    data_label = data_arr[:, 0]
    data_records = data_arr[:, 1]
    
    return data_records, data_label

def tfidf_vectorizer(records):
    """
    Fit a TF-IDF model on `records` and return the dense matrix with its vocabulary.

    The text is lower-cased, tokens are runs of ASCII letters only, and no
    normalisation is applied to the resulting rows (``norm=None``).

    returns: (dense TF-IDF array, feature names of the fitted vectorizer)
    """
    tfidf_options = {
        'lowercase': True,
        'token_pattern': r'\b[A-Za-z]+\b',
        'norm': None,
    }
    tfidf_model = TfidfVectorizer(**tfidf_options)

    ## the sparse result is expanded to a dense array before it is returned
    dense_matrix = tfidf_model.fit_transform(records).toarray()
    vocabulary = tfidf_model.get_feature_names_out()

    return dense_matrix, vocabulary

def feature_extraction(X, n_components=5, random_state=0):
    """
    Reduce `X` to `n_components` dimensions with PCA (no whitening).

    X: feature matrix handed directly to ``PCA.fit_transform``
    n_components: number of principal components to keep
    random_state: seed forwarded to ``PCA`` so repeated runs give the same
        projection when a randomised SVD solver is selected; pass ``None``
        to leave the estimator unseeded
    returns: the transformed data produced by ``PCA.fit_transform``
    """
    reduction_pca = PCA(
        n_components=n_components,
        whiten=False,
        random_state=random_state,
    )
    data_reduced = reduction_pca.fit_transform(X)
    return data_reduced

def feature_selection(df_records, labels, n_components=5, random_state=0):
    """
    Keep the `n_components` features that score highest on mutual information.

    df_records: feature table passed to ``SelectKBest.fit_transform``
    labels: class label of each row of `df_records`
    n_components: number of features to keep (the ``k`` of ``SelectKBest``)
    random_state: seed forwarded to ``mutual_info_classif`` so the scores, and
        therefore the chosen features, are repeatable; pass ``None`` to leave
        the scorer unseeded
    returns: (selected feature values, names of the selected features)
    """
    from functools import partial

    ## SelectKBest calls score_func(X, y), so the seed is bound in advance
    seeded_mutual_info = partial(mutual_info_classif, random_state=random_state)
    feature_selection_model = SelectKBest(seeded_mutual_info, k=n_components)

    ## make a selection over the best features
    selected_record_features = feature_selection_model.fit_transform(df_records, labels)

    return selected_record_features, feature_selection_model.get_feature_names_out()

In [4]:
## read the whole corpus into memory
## the encoding is pinned so decoding does not depend on the platform default
## (assumes the file is UTF-8 - TODO confirm against the downloaded copy)
with open('SMSSpamCollection', encoding='utf-8') as file:
    sms_data_str = file.read()

In [5]:
## parse the raw text, then build the dense TF-IDF matrix of the messages
records, labels = process_data(sms_data_str)
records_vectorized, feature_names = tfidf_vectorizer(records)

## binary label encoding: 0 for 'legitimate', 1 for every other label
labels = np.where(labels == 'legitimate', 0, 1)

## reducing dimension with PCA
records_dim_reduced = feature_extraction(records_vectorized)

In [6]:
## preview the first five rows of the PCA-reduced records
records_dim_reduced[:5]

array([[-1.85636164,  0.28188535, -1.18452107,  0.821724  ,  0.72011984],
       [-2.78399376,  0.52092912, -1.74215256,  0.50191261, -0.73367916],
       [ 0.48305198, -0.04104113,  2.01228462, -6.52683892,  1.00366916],
       [-1.83558084,  1.13906569, -3.93093139, -0.18465917, -1.98317997],
       [ 0.27697309, -0.78039086,  0.11194451,  1.31737235, -0.77779754]])

In [7]:
## wrap the TF-IDF matrix in a DataFrame whose columns are the feature names
records_vectorized = pd.DataFrame(
    data=records_vectorized,
    columns=feature_names,
)

## keep only the best-scoring features (default count) and their names
records_selection, feature_name_selection = feature_selection(
    records_vectorized,
    labels=labels,
)

Collecting numpy
  Downloading numpy-1.24.3-cp310-cp310-win_amd64.whl (14.8 MB)
                                              0.0/14.8 MB ? eta -:--:--
                                              0.0/14.8 MB ? eta -:--:--
                                              0.0/14.8 MB ? eta -:--:--
                                              0.0/14.8 MB ? eta -:--:--
                                              0.0/14.8 MB ? eta -:--:--
                                             0.0/14.8 MB 119.1 kB/s eta 0:02:05
                                             0.0/14.8 MB 119.1 kB/s eta 0:02:05
                                             0.0/14.8 MB 119.1 kB/s eta 0:02:05
                                             0.0/14.8 MB 109.3 kB/s eta 0:02:16
                                             0.1/14.8 MB 148.8 kB/s eta 0:01:40
                                             0.1/14.8 MB 157.1 kB/s eta 0:01:34
                                             0.1/14.8 MB 187.0 kB/s eta 0:01:19


ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Users\\rrast\\AppData\\Local\\Programs\\Python\\Python310\\Lib\\site-packages\\~umpy\\.libs\\libopenblas64__v0.3.21-gcc_10_3_0.dll'
Consider using the `--user` option or check the permissions.



In [None]:
## show the first rows of the selected features as a table,
## using the selected feature names as column headers
pd.DataFrame(records_selection, columns=feature_name_selection).head()

In [48]:
## TODO: build a fuzzy rule-based model for (records, label)
