In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


In [2]:
# Load the tab-separated SMS corpus; column names are supplied explicitly.
message = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    names=['label', 'message'],
    encoding="latin1",
)

In [3]:
# Show the raw frame as loaded (label column + message text).
message

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
# Build a working copy of the raw frame in which every missing cell is
# replaced by an empty string; `message` itself is left untouched.
data = message.where(message.notna(), '')

In [5]:
# Show the frame after missing values were replaced with ''.
data

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [6]:
# Preview the first rows of the working frame.
data.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# Class balance: number of rows per label value.
data['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [8]:
# (rows, columns) of the working frame.
data.shape

(5572, 2)

In [9]:
# Encode the text labels as integers: spam -> 0, ham -> 1.
# The mapping is applied to the untouched raw column in `message` rather than
# to `data['label']` itself, so this cell can be re-run safely: mapping the
# already-encoded column a second time would turn every label into NaN.
data['label'] = message['label'].map({"spam": 0, "ham": 1})

In [10]:
# Show the frame after the label column was encoded (0 = spam, 1 = ham).
data

Unnamed: 0,label,message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,0,This is the 2nd time we have tried 2 contact u...
5568,1,Will ü b going to esplanade fr home?
5569,1,"Pity, * was in mood for that. So...any other s..."
5570,1,The guy did some bitching but I acted like i'd...


In [11]:
# Features are the raw message texts; targets are the encoded labels.
x, y = data['message'], data['label']

In [12]:
# Show the target series (encoded labels).
y

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: label, Length: 5572, dtype: int64

In [13]:
# Hold out 20% of the rows for testing; the seed fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=3,
)

In [14]:
# Shapes of the four splits, in the order: x_train, x_test, y_test, y_train.
tuple(part.shape for part in (x_train, x_test, y_test, y_train))

((4457,), (1115,), (1115,), (4457,))

In [15]:
# TF-IDF vectorizer that lowercases text and drops English stop words.
categorical_to_numerical = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)

In [16]:
# Learn the vocabulary/IDF weights from the training texts only and
# vectorize them in the same step.
new_xtrain = categorical_to_numerical.fit_transform(x_train)

In [17]:
# Show the summary of the vectorized training matrix.
new_xtrain

<4457x7435 sparse matrix of type '<class 'numpy.float64'>'
	with 34824 stored elements in Compressed Sparse Row format>

In [18]:
# Vectorize the test texts with the vectorizer already fitted on the
# training texts (transform only, no refit).
new_xtest = categorical_to_numerical.transform(x_test)

In [19]:
# Baseline classifier: logistic regression with library defaults.
model = LogisticRegression()

In [20]:
# Train the baseline on the TF-IDF training matrix.
model.fit(new_xtrain, y_train)

LogisticRegression()

In [21]:
# Accuracy of the baseline on the data it was trained on.
accuracy_score(y_train, model.predict(new_xtrain))

0.9679156383217411

In [22]:
# Accuracy of the baseline on the held-out test split.
accuracy_score(y_test, model.predict(new_xtest))

0.9659192825112107

In [23]:
# Baseline predictions for every test row.
y_pred = model.predict(new_xtest)

In [24]:
# Show the predicted labels (0 = spam, 1 = ham).
y_pred

array([0, 1, 1, ..., 1, 1, 1])

In [25]:
# (rows, features) of the vectorized test matrix.
new_xtest.shape

(1115, 7435)

In [26]:
# Spot check: predict the label for the single test row at position 1110.
model.predict(new_xtest[1110])

array([1])

In [27]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the baseline on the test split. Rows are the true
# labels and columns the predicted labels, ordered 0 (spam) then 1 (ham).
baseline_confusion = confusion_matrix(y_test, y_pred)
print(baseline_confusion)

[[117  38]
 [  0 960]]


In [28]:
# Collects one entry per model: model name -> test-set accuracy.
result = dict()

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
# NOTE: KNeighborsClassifier, GaussianNB and svm are not used in this cell;
# the imports are kept in case other cells rely on them.

# Candidate classifiers. Every estimator with a random component is seeded so
# the comparison is repeatable; the decision tree previously had no seed, so
# its score could change from run to run.
LR = LogisticRegression()
RF = RandomForestClassifier(n_estimators=50, max_depth=5, random_state=1)
NN = MLPClassifier(hidden_layer_sizes=(18,), random_state=1, max_iter=1000)
DTC = DecisionTreeClassifier(random_state=1)

# Fit each candidate on the training matrix and record its accuracy on the
# held-out test matrix under the estimator's class name.
for clf_name, clf in (
    ('LogisticRegression', LR),
    ('RandomForestClassifier', RF),
    ('MLPClassifier', NN),
    ('DecisionTreeClassifier', DTC),
):
    clf.fit(new_xtrain, y_train)
    result[clf_name] = clf.score(new_xtest, y_test)

In [30]:
pip install tabulate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [31]:
from tabulate import tabulate

# Order the models from best to worst test accuracy, keep that ordering in
# `result`, and print it as a grid table.
ranked_scores = sorted(result.items(), key=lambda entry: entry[1], reverse=True)
result = dict(ranked_scores)
print(tabulate(result.items(), headers=['NAME', 'VALUE'], tablefmt="grid"))

+------------------------+----------+
| NAME                   |    VALUE |
| MLPClassifier          | 0.98565  |
+------------------------+----------+
| LogisticRegression     | 0.965919 |
+------------------------+----------+
| DecisionTreeClassifier | 0.963229 |
+------------------------+----------+
| RandomForestClassifier | 0.863677 |
+------------------------+----------+


In [32]:
from sklearn.neural_network import MLPClassifier

In [33]:
# Neural-network classifier with library-default architecture. The seed is
# fixed (same value as the other seeded estimators in this notebook) because
# the weight initialisation is random; without it the train/test scores below
# are not reproducible across runs.
model2 = MLPClassifier(random_state=1)

In [34]:
# Train the neural network on the TF-IDF training matrix.
model2.fit(new_xtrain, y_train)

MLPClassifier()

In [35]:
# Accuracy of the neural network on the data it was trained on.
accuracy_score(y_train, model2.predict(new_xtrain))

1.0

In [36]:
# Accuracy of the neural network on the held-out test split.
accuracy_score(y_test, model2.predict(new_xtest))

0.9856502242152466