## Spam-Ham Detection

In [2]:
import warnings
warnings.simplefilter('ignore')

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

# from sklearn.model_selection import train_test_split 
# from sklearn.feature_extraction.text import TfidfTransformer 
# from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# from sklearn.linear_model import LogisticRegression 
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.svm import SVC 
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.ensemble import RandomForestClassifier, StackingClassifier

## Data Importing

In [3]:
df = pd.read_csv("../dataset/SPAM.csv")

In [4]:
df

Unnamed: 0,Category,Message,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


## Data Cleaning and Preprocessing

In [5]:
df = df.drop(labels=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [6]:
## checking missing values
df.isnull().sum()

Category    0
Message     0
dtype: int64

Observation : No missing values in the dataset

In [7]:
df.dtypes

Category    object
Message     object
dtype: object

Observation : Both features are of string datatype

In [8]:
df = pd.get_dummies(df, columns=['Category'], drop_first=True)

In [9]:
df

Unnamed: 0,Message,Category_spam
0,"Go until jurong point, crazy.. Available only ...",False
1,Ok lar... Joking wif u oni...,False
2,Free entry in 2 a wkly comp to win FA Cup fina...,True
3,U dun say so early hor... U c already then say...,False
4,"Nah I don't think he goes to usf, he lives aro...",False
...,...,...
5567,This is the 2nd time we have tried 2 contact u...,True
5568,Will Ì_ b going to esplanade fr home?,False
5569,"Pity, * was in mood for that. So...any other s...",False
5570,The guy did some bitching but I acted like i'd...,False


In [10]:
df['Category_spam'] = df['Category_spam'].astype(int)

In [11]:
X = df['Message']

In [12]:
y = df['Category_spam']

In [13]:
y

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: Category_spam, Length: 5572, dtype: int64

## Data Splitting

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

In [15]:
print(X.shape)
print(X_train.shape)
print(X_test.shape)

(5572,)
(4457,)
(1115,)


## Feature extraction

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

In [17]:
X_train_feature = feature_extraction.fit_transform(X_train)
X_test_feature = feature_extraction.transform(X_test)

## Model training

In [18]:
from sklearn.linear_model import LogisticRegression 
from sklearn.naive_bayes import GaussianNB 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier,GradientBoostingClassifier
from sklearn.svm import SVC

In [19]:
models = {
    "LogisticRegression" : LogisticRegression(),
    "KNeighborsClassifier" : KNeighborsClassifier(),
    "DecisionTreeClassifier" : DecisionTreeClassifier(),
    "RandomForestClassifier" : RandomForestClassifier(),
    "AdaBoostClassifier" : AdaBoostClassifier(),
    "GradientBoostingClassifier" : GradientBoostingClassifier(),
    "SVC" : SVC(),
    "GaussianNB" : GaussianNB(),
}

In [20]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, classification_report, f1_score

def evaluate_model(true, predicted):
    accuracy = accuracy_score(true, predicted)
    precision = precision_score(true, predicted)
    recall = recall_score(true, predicted)
    f1 = f1_score(true, predicted)
    return accuracy, precision, recall, f1

In [21]:
X_train_feature

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 34794 stored elements and shape (4457, 7472)>

In [25]:
Y_test = y_test.astype('int')


In [26]:
Y_test

3245    0
944     0
1044    1
2484    0
812     1
       ..
4264    0
2439    0
5556    0
4205    0
4293    1
Name: Category_spam, Length: 1115, dtype: int64

In [27]:
lr = LogisticRegression()
lr.fit(X_train_feature, y_train)

lr_train = lr.predict(X_test_feature)


accuracy_score(y_test, lr_train)

TypeError: 'list' object is not callable

In [22]:
accuracy_score = []
models_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model_name = list(models.keys())[i]

    # Predictions
    model.fit(X_train_feature, y_train)
    y_pred = model.predict(X_train_feature)




    # accuracy = accuracy_score(y_test, y_pred)
    # print(accuracy)


    # model score with training data 
    # accuracy, precision, recall, f1 = evaluate_model(y_test, y_pred)
    print("Model Training Performance")
    # print(model_name)
    # print("Accuracy : ",accuracy)
    # print("Precision : ", precision)
    # print("Recall : ", recall)
    # print("F1 Score : ", f1)
    # print(""*35)
    # print("\n")

    # accuracy_score.append(accuracy)
    # models_list.append(model_name)



Model Training Performance
Model Training Performance
Model Training Performance
Model Training Performance
Model Training Performance
Model Training Performance
Model Training Performance


TypeError: Sparse data was passed for X, but dense data is required. Use '.toarray()' to convert to a dense numpy array.