In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Load the raw mail dataset from 'mail_data.csv' (path is relative to the working directory).
raw_data = pd.read_csv('mail_data.csv')

In [3]:
# Display the loaded frame; the notebook elides the middle rows in its rendering.
raw_data


Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
# Replace every missing value with an empty string; non-null cells are kept unchanged.
mail_data = raw_data.where(raw_data.notna(), '')

In [5]:
# Preview the first five rows (five is the default for head()).
mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Check the number of rows and columns.
mail_data.shape

(5572, 2)

In [7]:
# Encode the labels numerically: spam -> 0, ham -> 1 (applied in that order).
for label, code in (('spam', 0), ('ham', 1)):
    mail_data.loc[mail_data['Category'] == label, 'Category'] = code

In [8]:
# Preview the first five rows after the labels were encoded.
mail_data.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
# Separate the message text (model input) from the category labels (target).
X, Y = mail_data['Message'], mail_data['Category']


In [33]:
# Split the data into training and testing sets: 40% is held out for testing,
# and the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.4,
    random_state=3,
)

In [34]:
# Number of messages held out for testing.
x_test.shape

(2229,)

In [35]:
# Configure the TF-IDF vectorizer that turns raw message text into numeric
# feature vectors for the logistic regression model.
# - min_df=1 keeps every term that appears in at least one document.
# - stop_words='english' uses scikit-learn's built-in English stop-word list.
# - lowercase must be the boolean True: the string 'True' only worked because
#   non-empty strings are truthy, and scikit-learn releases that validate
#   estimator parameters reject a non-boolean value here.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

In [36]:
# Fit the vectorizer on the training messages only, then apply that same fitted
# vectorizer to the test messages so both matrices share one feature space.
X_train_features = feature_extraction.fit_transform(x_train)
X_test_features = feature_extraction.transform(x_test)

In [37]:
# Cast both label series to an integer dtype.
Y_train, Y_test = y_train.astype('int'), y_test.astype('int')

In [38]:
# Inspect the integer-encoded training labels.
Y_train

3026    1
3749    1
4361    1
1999    1
455     0
       ..
789     0
968     1
1667    1
3321    1
1688    0
Name: Category, Length: 3343, dtype: int32

In [39]:
# Print the training feature matrix; each output line is a (row, column) index pair and its value.
print(X_train_features)

  (0, 5833)	0.47301266759662697
  (0, 3001)	0.5930609208621515
  (0, 1974)	0.651565622511854
  (1, 2606)	0.28098328655723337
  (1, 2405)	0.27089961373415494
  (1, 1339)	0.18684991543758142
  (1, 6135)	0.2295860509768233
  (1, 3369)	0.14729557583426567
  (1, 3359)	0.1919885792961897
  (1, 2755)	0.4105804790733068
  (1, 5175)	0.28098328655723337
  (1, 5516)	0.2630781094434785
  (1, 2021)	0.23544153831792122
  (1, 2874)	0.28098328655723337
  (1, 5113)	0.21277632072528896
  (1, 5872)	0.40617436382169175
  (1, 1104)	0.20027434794604718
  (2, 5135)	0.35324635377667984
  (2, 2886)	0.35324635377667984
  (2, 4755)	0.30716571773325513
  (2, 2115)	0.35324635377667984
  (2, 2217)	0.35324635377667984
  (2, 3164)	0.1557079233579834
  (2, 5923)	0.21740192116360693
  (2, 5083)	0.47931761222359576
  :	:
  (3340, 2669)	0.3292178140178824
  (3340, 3924)	0.26907193452031153
  (3341, 2094)	0.3543816009282524
  (3341, 6050)	0.30859287299145854
  (3341, 4007)	0.31097601210364706
  (3341, 1903)	0.360415420031

In [40]:
# Print the test feature matrix; each output line is a (row, column) index pair and its value.
print(X_test_features)

  (0, 6186)	0.1908191940113333
  (0, 5875)	0.2022387220198144
  (0, 4559)	0.23606589604701453
  (0, 4421)	0.19904876897174445
  (0, 3733)	0.18203630172607957
  (0, 1310)	0.26465228204998065
  (0, 1185)	0.32219418292510915
  (0, 1150)	0.2536463502195762
  (0, 925)	0.2505841344316126
  (0, 891)	0.28016427173155534
  (0, 341)	0.23606589604701453
  (0, 258)	0.24510948626685214
  (0, 16)	0.30668219324353446
  (0, 11)	0.26915833990115084
  (0, 6)	0.28016427173155534
  (0, 1)	0.2322369757641426
  (1, 6276)	0.3009379550013267
  (1, 5724)	0.4307408633775453
  (1, 5600)	0.32791153064443523
  (1, 5528)	0.2655032512678621
  (1, 5284)	0.3676913466782936
  (1, 4019)	0.23184178320825352
  (1, 3764)	0.3511812313098286
  (1, 2964)	0.4833269198021534
  (2, 6128)	0.22002917039490233
  :	:
  (2225, 1919)	0.33902458241025546
  (2225, 1647)	0.4404397426231508
  (2225, 288)	0.3938867001833902
  (2226, 5322)	0.5393863553598005
  (2226, 4207)	0.5880598753264416
  (2226, 2872)	0.6027005414654286
  (2227, 5862)	

In [50]:
# Create the classifier: logistic regression with an L1 penalty and the liblinear solver.
model = LogisticRegression(penalty='l1', solver='liblinear')

In [51]:
# Train the logistic regression model on the training features and labels.
model.fit(X_train_features, Y_train)

LogisticRegression(penalty='l1', solver='liblinear')

In [52]:
# Evaluate the model: predict labels for the same features it was trained on.
prediction_training_data = model.predict(X_train_features)


In [53]:
# Show the predicted labels for the training data.
print(prediction_training_data)

[1 1 1 ... 1 1 0]


In [54]:
# Compare the training predictions against the true training labels.
accuracy_training_data = accuracy_score(Y_train, prediction_training_data)

In [55]:
# Report the accuracy measured on the training data.
print('Accuracy on training data: ', accuracy_training_data)

Accuracy on training data:  0.9611127729584206


In [56]:
# Predict on the held-out test features and score the predictions against the true labels.
predicting_testing_data = model.predict(X_test_features)
accuracy_test_data = accuracy_score(Y_test, predicting_testing_data)

# Print the predictions first, then the accuracy.
print(predicting_testing_data)
print(accuracy_test_data)

[0 1 1 ... 1 1 1]
0.9533423059668013


In [67]:

# Classify one hand-written sample message with the fitted vectorizer and model.
inputval = ["flat 50% .This is to inform you you win a lottery of 1000000 rs and we would like you to know how you an retrieve your prize"]
input_data = feature_extraction.transform(inputval)

# Stored as `sample_prediction` rather than `prediction`: this notebook also
# defines a helper function named `prediction`, and binding the result array to
# the same name would replace that function depending on cell execution order.
sample_prediction = model.predict(input_data)
print(sample_prediction)

[0]


In [65]:
# Read the message to classify from the user; execution blocks until text is entered.
val = input()

Member ID 197696522 : Celebrate with the brightest offers on latest tech


In [66]:
# Wrap the entered message in a one-element list and vectorize it with the fitted vectorizer.
li = [val]
input_data = feature_extraction.transform(li)

def prediction(x):
    """Print a human-readable label for each encoded class prediction.

    Args:
        x: A scalar class code or an array-like of class codes, such as the
            array returned by ``model.predict``. A code equal to 0 is printed
            as "Spam"; any other code is printed as "Ham".

    Returns:
        None. One label is printed per element of ``x``.
    """
    # np.ravel accepts scalars, lists and arrays alike. Iterating element by
    # element avoids the "truth value of an array is ambiguous" ValueError that
    # a bare ``if x == 0`` raises when x holds more than one prediction.
    for code in np.ravel(x):
        print("Spam" if code == 0 else "Ham")
        
# Predict the class of the entered message and print its label.
pre = model.predict(input_data)
prediction(pre)

Ham
