In [17]:
import pandas as pd
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mustufa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mustufa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
df = pd.read_csv('mail_data.csv')

In [19]:
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [21]:
df['Category'].value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [26]:
# label spam mail as 1;  ham mail as 0;

df.loc[df['Category'] == 'spam', 'Category',] = 1
df.loc[df['Category'] == 'ham', 'Category',] = 0
df.head()

Unnamed: 0,Category,Message
0,0,"Yep, by the pretty sculpture"
1,0,"Yes, princess. Are you going to make me moan?"
2,0,Welp apparently he retired
3,0,Havent.
4,0,I forgot 2 ask ü all smth.. There's a card on ...


In [27]:
df =df.sample(frac=1, random_state=1) 

df = df.reset_index(drop=True)

split_index = int(len(df) * 0.8)

train_df, test_df = df[:split_index], df[split_index:]

train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

train_df.shape, test_df.shape

((4457, 2), (1115, 2))

In [28]:
tokenizer = nltk.RegexpTokenizer(r"\w+")

# from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# from nltk.corpus import stopwords
stopwords = stopwords.words('english')

In [29]:
def message_to_token_list(s):
    tokens = tokenizer.tokenize(s)
    lowercased_tokens = [t.lower() for t in tokens]
    lemmatized_tokens = [lemmatizer.lemmatize(t) for t in lowercased_tokens]
    useful_tokens = [t for t in lemmatized_tokens if t not in stopwords]

    return useful_tokens
    
    

In [31]:
token_counter = {}

for message in train_df['Message']:
    message_as_token_list = message_to_token_list(message)

    for token in message_as_token_list:
        if token in token_counter:
            token_counter[token] += 1
        else:
            token_counter[token] = 1

token_counter



{'good': 192,
 'night': 101,
 'dear': 98,
 'sleepwell': 2,
 'amp': 72,
 'take': 122,
 'care': 58,
 'sen': 5,
 'told': 44,
 'going': 138,
 'join': 22,
 'uncle': 14,
 'finance': 1,
 'cbe': 4,
 'thank': 24,
 'baby': 27,
 'cant': 54,
 'wait': 59,
 'taste': 2,
 'real': 35,
 'thing': 97,
 'ü': 137,
 'come': 203,
 'wonderful': 10,
 'watching': 28,
 'telugu': 2,
 'movie': 23,
 'wat': 89,
 'abt': 23,
 'u': 989,
 'get': 317,
 'ready': 32,
 'moan': 6,
 'scream': 6,
 'babe': 71,
 'miiiiiiissssssssss': 1,
 'need': 152,
 'crave': 10,
 'geeee': 4,
 'sad': 16,
 'without': 24,
 'love': 180,
 'wan': 44,
 'lor': 122,
 'din': 6,
 'c': 132,
 'stripe': 1,
 'skirt': 1,
 'sent': 61,
 'wife': 20,
 'text': 169,
 'buy': 55,
 'tell': 120,
 'relax': 5,
 'go': 251,
 'wkend': 4,
 'want': 188,
 'send': 155,
 'something': 59,
 'sell': 11,
 'fast': 11,
 'lt': 224,
 'gt': 223,
 'k': 126,
 'easy': 24,
 'money': 51,
 'class': 42,
 'alright': 18,
 'omw': 6,
 'gotta': 12,
 'change': 23,
 'order': 16,
 'half8th': 1,
 'home':

In [33]:
def keep_token(proccessed_token, threshold):
    if proccessed_token not in token_counter:
        return False
    else:
        return token_counter[proccessed_token] > threshold
            
        

keep_token('html',400)

False

In [47]:
features = set()

for tokens in token_counter:
    if keep_token(tokens,225):
        features.add(tokens)

features

{'2', '4', 'call', 'free', 'get', 'go', 'ok', 'u', 'ur'}

In [48]:
features = list(features)
features

['get', 'u', 'go', 'ok', '4', 'call', 'ur', 'free', '2']

In [49]:
token_to_index_mapping = {t:i for t, i in zip(features, range(len(features)))}
token_to_index_mapping 

{'get': 0,
 'u': 1,
 'go': 2,
 'ok': 3,
 '4': 4,
 'call': 5,
 'ur': 6,
 'free': 7,
 '2': 8}

In [50]:
import numpy as np

def message_to_count_vector(message):
  count_vector = np.zeros(len(features))

  processed_list_of_tokens = message_to_token_list(message)

  for token in processed_list_of_tokens:
    if token not in features:
      continue
    index = token_to_index_mapping[token]
    count_vector[index] += 1
  
  return count_vector

In [51]:
message_to_count_vector(train_df['Message'].iloc[3])

array([0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [53]:
def df_to_X_y(dff):
  y = dff['Category'].to_numpy().astype(int)

  message_col = dff['Message']
  count_vectors = []

  for message in message_col:
    count_vector = message_to_count_vector(message)
    count_vectors.append(count_vector)

  X = np.array(count_vectors).astype(int)

  return X, y

In [54]:
X_train, y_train = df_to_X_y(train_df)

X_test, y_test = df_to_X_y(test_df)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((4457, 9), (4457,), (1115, 9), (1115,))

In [55]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler().fit(X_train)

X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

X_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [56]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

lr = LogisticRegression().fit(X_train, y_train)
print(classification_report(y_test, lr.predict(X_test)))

              precision    recall  f1-score   support

           0       0.90      0.99      0.94       962
           1       0.79      0.29      0.43       153

    accuracy                           0.89      1115
   macro avg       0.84      0.64      0.68      1115
weighted avg       0.88      0.89      0.87      1115



In [57]:
# Compare logistic regression to random forest

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier().fit(X_train, y_train)
print(classification_report(y_test, rf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.93      0.96      0.95       962
           1       0.69      0.56      0.62       153

    accuracy                           0.91      1115
   macro avg       0.81      0.76      0.78      1115
weighted avg       0.90      0.91      0.90      1115

