In [54]:
import pandas as pd
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mustufa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mustufa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [55]:
df = pd.read_csv('mail_data.csv')

In [56]:
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [57]:
df['Category'].value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [58]:
# label spam mail as 1;  ham mail as 0;

df.loc[df['Category'] == 'spam', 'Category',] = 1
df.loc[df['Category'] == 'ham', 'Category',] = 0
df.head()

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [59]:
df =df.sample(frac=1, random_state=1) 

df = df.reset_index(drop=True)

split_index = int(len(df) * 0.8)

train_df, test_df = df[:split_index], df[split_index:]

train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

train_df.shape, test_df.shape

((4457, 2), (1115, 2))

In [60]:
tokenizer = nltk.RegexpTokenizer(r"\w+")

# from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# from nltk.corpus import stopwords
stopwords = stopwords.words('english')

In [61]:
def message_to_token_list(s):
    tokens = tokenizer.tokenize(s)
    lowercased_tokens = [t.lower() for t in tokens]
    lemmatized_tokens = [lemmatizer.lemmatize(t) for t in lowercased_tokens]
    useful_tokens = [t for t in lemmatized_tokens if t not in stopwords]

    return useful_tokens
    
    

In [62]:
token_counter = {}

for message in train_df['Message']:
    message_as_token_list = message_to_token_list(message)

    for token in message_as_token_list:
        if token in token_counter:
            token_counter[token] += 1
        else:
            token_counter[token] = 1

token_counter



{'yep': 9,
 'pretty': 12,
 'sculpture': 1,
 'yes': 81,
 'princess': 24,
 'going': 146,
 'make': 103,
 'moan': 5,
 'welp': 3,
 'apparently': 4,
 'retired': 1,
 'havent': 20,
 'forgot': 25,
 '2': 427,
 'ask': 76,
 'ü': 128,
 'smth': 15,
 'card': 13,
 'da': 113,
 'present': 8,
 'lei': 17,
 'want': 170,
 'write': 7,
 'sign': 6,
 'ok': 236,
 'thk': 40,
 'got': 176,
 'u': 1006,
 'wan': 46,
 'come': 205,
 'wat': 80,
 'kfc': 1,
 'tuesday': 6,
 'buy': 57,
 'meal': 5,
 'gravy': 1,
 'mark': 6,
 'dear': 96,
 'wa': 191,
 'sleeping': 18,
 'p': 11,
 'pa': 28,
 'nothing': 26,
 'problem': 39,
 'ill': 36,
 'lt': 252,
 'gt': 255,
 'uncle': 15,
 'atlanta': 3,
 'wish': 55,
 'guy': 54,
 'great': 93,
 'semester': 10,
 'phone': 103,
 'another': 33,
 'number': 91,
 'greatest': 2,
 'test': 23,
 'courage': 2,
 'earth': 6,
 'bear': 4,
 'defeat': 2,
 'without': 23,
 'losing': 3,
 'heart': 39,
 'gn': 13,
 'tc': 16,
 'dai': 2,
 'send': 168,
 'resume': 1,
 'id': 17,
 'late': 42,
 'freemsg': 11,
 'replied': 4,
 'text'

In [63]:
def keep_token(proccessed_token, threshold):
    if proccessed_token not in token_counter:
        return False
    else:
        return token_counter[proccessed_token] > threshold
            
        

keep_token('html',400)

False

In [64]:
features = set()

for tokens in token_counter:
    if keep_token(tokens,225):
        features.add(tokens)

features

{'2', '4', 'call', 'day', 'get', 'go', 'gt', 'lt', 'ok', 'u', 'ur'}

In [65]:
features = list(features)
features

['2', 'go', 'lt', 'ur', 'get', 'ok', 'day', 'gt', 'call', '4', 'u']

In [66]:
token_to_index_mapping = {t:i for t, i in zip(features, range(len(features)))}
token_to_index_mapping 

{'2': 0,
 'go': 1,
 'lt': 2,
 'ur': 3,
 'get': 4,
 'ok': 5,
 'day': 6,
 'gt': 7,
 'call': 8,
 '4': 9,
 'u': 10}

In [67]:
import numpy as np

def message_to_count_vector(message):
  count_vector = np.zeros(len(features))

  processed_list_of_tokens = message_to_token_list(message)

  for token in processed_list_of_tokens:
    if token not in features:
      continue
    index = token_to_index_mapping[token]
    count_vector[index] += 1
  
  return count_vector

In [68]:
message_to_count_vector(train_df['Message'].iloc[3])

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [69]:
def df_to_X_y(dff):
  y = dff['Category'].to_numpy().astype(int)

  message_col = dff['Message']
  count_vectors = []

  for message in message_col:
    count_vector = message_to_count_vector(message)
    count_vectors.append(count_vector)

  X = np.array(count_vectors).astype(int)

  return X, y

In [70]:
X_train, y_train = df_to_X_y(train_df)

X_test, y_test = df_to_X_y(test_df)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((4457, 11), (4457,), (1115, 11), (1115,))

In [71]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler().fit(X_train)

X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

X_train

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.33333333, 0.        ,
        0.11111111],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

### Model Training

Using Logistic Regression

In [72]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

In [73]:
# prediction on training data

prediction_on_training_data = lr.predict(X_train)
accuracy_on_training_data = accuracy_score(y_train, prediction_on_training_data)

print('Accuracy on training data : ', accuracy_on_training_data)

Accuracy on training data :  0.8849001570563159


In [74]:
# prediction on test data

prediction_on_test_data = lr.predict(X_test)
accuracy_on_test_data = accuracy_score(y_test, prediction_on_test_data)

print('Accuracy on test data : ', accuracy_on_test_data)

Accuracy on test data :  0.8834080717488789


In [75]:
#Confusion Matrix
from sklearn.metrics import confusion_matrix
print('Confusion Matrix:\n',confusion_matrix(y_test, prediction_on_test_data))
from sklearn.metrics import classification_report
print('Calssification Report:\n',classification_report(y_test, prediction_on_test_data))

Confusion Matrix:
 [[954  14]
 [116  31]]
Calssification Report:
               precision    recall  f1-score   support

           0       0.89      0.99      0.94       968
           1       0.69      0.21      0.32       147

    accuracy                           0.88      1115
   macro avg       0.79      0.60      0.63      1115
weighted avg       0.86      0.88      0.86      1115



Using Random Forest Classifier

In [76]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

In [77]:
# prediction on training data

rf_prediction_on_training_data = rf.predict(X_train)
rf_accuracy_on_training_data = accuracy_score(y_train, rf_prediction_on_training_data)

print('Accuracy on training data : ', rf_accuracy_on_training_data)

Accuracy on training data :  0.90643930895221


In [78]:
# prediction on test data

rf_prediction_on_test_data = rf.predict(X_test)
rf_accuracy_on_test_data = accuracy_score(y_test, rf_prediction_on_test_data)

print('Accuracy on test data : ', rf_accuracy_on_test_data)

Accuracy on test data :  0.9040358744394619


In [79]:
#Confusion Matrix
from sklearn.metrics import confusion_matrix
print('Confusion Matrix:\n',confusion_matrix(y_test, rf_prediction_on_test_data))
from sklearn.metrics import classification_report
print('Calssification Report:\n',classification_report(y_test, rf_prediction_on_test_data))

Confusion Matrix:
 [[929  39]
 [ 68  79]]
Calssification Report:
               precision    recall  f1-score   support

           0       0.93      0.96      0.95       968
           1       0.67      0.54      0.60       147

    accuracy                           0.90      1115
   macro avg       0.80      0.75      0.77      1115
weighted avg       0.90      0.90      0.90      1115

