In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [2]:
# Load the mail dataset from a CSV file located next to the notebook.
DATA_PATH = 'mail_data.csv'
df = pd.read_csv(DATA_PATH)

### Exploratory Data Analysis (EDA)

In [3]:
# Preview the first rows of the loaded frame (label column and message text).
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Summarize the frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [5]:
# (rows, columns) of the loaded frame.
df.shape

(5572, 2)

In [6]:
# Number of distinct values per column; fewer distinct messages than rows
# means some message texts are repeated.
df.nunique()

Category       2
Message     5157
dtype: int64

In [7]:
# Distinct labels present in the Category column.
df['Category'].unique()

array(['ham', 'spam'], dtype=object)

In [8]:
# Class balance: number of messages per label.
df['Category'].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

### Preprocessing

In [9]:
# Count missing values per column before the labels are encoded.
df.isna().sum()

Category    0
Message     0
dtype: int64

In [10]:
# Encode the target as integers: spam -> 0, ham -> 1.
#
# An explicit mapping replaces the original "anything that is not 'spam' is
# ham" rule, which silently counted unexpected labels (typos, different
# casing, missing values) as ham. The dtype guard makes the cell safe to
# re-run: the original lambda turned every already-encoded 0 into 1 on a
# second execution, wiping out the spam class.
LABEL_TO_ID = {'spam': 0, 'ham': 1}

if not pd.api.types.is_integer_dtype(df['Category']):
    encoded = df['Category'].map(LABEL_TO_ID)
    # Labels missing from the mapping come back as NaN; fail loudly instead
    # of training on mislabeled rows.
    unknown = df.loc[encoded.isna(), 'Category'].unique()
    if len(unknown):
        raise ValueError(f"Unexpected Category labels: {list(unknown)}")
    df['Category'] = encoded.astype('int64')

In [11]:
# Check the encoded labels: the per-class counts should match the counts
# seen before encoding (1 = ham, 0 = spam).
df.Category.value_counts()

1    4825
0     747
Name: Category, dtype: int64

### Splitting

In [12]:
# Features are the raw message texts; the target is the encoded category.
x, y = df['Message'], df['Category']

In [13]:
# Preview the feature series (message text).
x.head()

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: Message, dtype: object

In [14]:
# Preview the target series (integer-encoded category).
y.head()

0    1
1    1
2    0
3    1
4    1
Name: Category, dtype: int64

In [15]:
# Hold out 20% of the messages for testing; the fixed random_state makes the
# split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Show the size of the training split, then the test split, one per line.
print(x_train.shape, x_test.shape, sep='\n')

(4457,)
(1115,)


### Feature extraction

In [17]:
# Turn the raw message text into TF-IDF feature vectors.
# `lowercase` must be a real boolean: the original passed the string 'True',
# which only worked because a non-empty string is truthy and which is rejected
# by scikit-learn's parameter validation (InvalidParameterError) in recent
# releases.
fe = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

# Fit the vectorizer on the training messages only, then reuse that fitted
# vectorizer to transform the test messages.
x_train_features = fe.fit_transform(x_train)
x_test_features = fe.transform(x_test)

# Make sure the targets are plain integers for the classifier.
y_train = y_train.astype('int')
y_test = y_test.astype('int')

In [18]:
# Print the training feature matrix; the output lists (row, column) index
# pairs followed by the stored weight.
print(x_train_features)

  (0, 5818)	0.22682143517864364
  (0, 2497)	0.2442158912653505
  (0, 694)	0.3171299579602537
  (0, 6264)	0.1898892037332199
  (0, 5800)	0.17558937755823417
  (0, 3262)	0.33791755486732394
  (0, 2049)	0.3034375179183143
  (0, 7300)	0.24288153842988894
  (0, 2724)	0.3544175987866074
  (0, 354)	0.3544175987866074
  (0, 7162)	0.2550284465664535
  (0, 258)	0.2379428657041507
  (0, 7222)	0.2173884735352799
  (0, 5512)	0.1898892037332199
  (1, 2555)	0.3840709491751004
  (1, 3804)	0.1902902346515268
  (1, 3932)	0.24325511357721427
  (1, 4509)	0.4028245991060671
  (1, 2440)	0.33870544648398715
  (1, 3333)	0.20665394084233096
  (1, 5650)	0.360444144470318
  (1, 2335)	0.2162321275166079
  (1, 6738)	0.28986069568918
  (1, 6109)	0.3239762634465801
  (1, 3267)	0.2678713077029217
  :	:
  (4452, 2438)	0.4574160733416501
  (4452, 7280)	0.3968991650168732
  (4452, 3978)	0.4574160733416501
  (4452, 3290)	0.26370969643076225
  (4452, 3084)	0.22948428918295163
  (4452, 2236)	0.2676662072392096
  (4453, 387

### Model

In [19]:
# Logistic regression classifier, constructed with no arguments (library defaults).
model = LogisticRegression()

In [20]:
# Fit on the TF-IDF training features; the call is the cell's last expression,
# so the estimator it returns is echoed as the cell output.
model.fit(x_train_features,y_train)

LogisticRegression()

In [21]:
# Disabled experiment (random forest): the code is wrapped in a string
# literal, so nothing is trained and the cell only echoes the string.
"""model_rf = RandomForestClassifier()
model_rf.fit(x_train_features,y_train)"""

'model_rf = RandomForestClassifier()\nmodel_rf.fit(x_train_features,y_train)'

In [22]:
# Disabled experiment (decision tree): the code is wrapped in a string
# literal, so nothing is trained and the cell only echoes the string.
"""model_dt = DecisionTreeClassifier()
model_dt.fit(x_train_features,y_train)"""

'model_dt = DecisionTreeClassifier()\nmodel_dt.fit(x_train_features,y_train)'

In [23]:
# Disabled experiment (multinomial naive Bayes): the code is wrapped in a
# string literal, so nothing is trained and the cell only echoes the string.
"""model_nb = MultinomialNB()
model_nb.fit(x_train_features,y_train)"""

'model_nb = MultinomialNB()\nmodel_nb.fit(x_train_features,y_train)'

In [24]:
# Disabled experiment (gradient boosting): the code is wrapped in a string
# literal, so nothing is trained and the cell only echoes the string.
'''model_gb = GradientBoostingClassifier()
model_gb.fit(x_train_features,y_train)'''

'model_gb = GradientBoostingClassifier()\nmodel_gb.fit(x_train_features,y_train)'

In [25]:
# Disabled experiment (support vector classifier): the code is wrapped in a
# string literal, so nothing is trained and the cell only echoes the string.
'''model_svc = SVC()
model_svc.fit(x_train_features,y_train)'''

'model_svc = SVC()\nmodel_svc.fit(x_train_features,y_train)'

### Prediction and Accuracy

In [26]:
# Predict on both splits first, then score each set of predictions against
# its true labels.
prediction = model.predict(x_train_features)
prediction_test = model.predict(x_test_features)

accuracy = accuracy_score(y_train, prediction)
accuracy_test = accuracy_score(y_test, prediction_test)

# Training accuracy is printed first, test accuracy second.
print(accuracy, accuracy_test, sep='\n')

# BEST MODEL (author's note on this logistic-regression run)

0.9661207089970832
0.967713004484305


In [27]:
# Disabled evaluation of the random-forest experiment: kept as a string
# literal, so no prediction or scoring runs and the cell only echoes the text.
'''prediction = model_rf.predict(x_train_features)
accuracy = accuracy_score(y_train,prediction)
print(accuracy)
prediction_test = model_rf.predict(x_test_features)
accuracy_test = accuracy_score(y_test,prediction_test)
print(accuracy_test)'''

'prediction = model_rf.predict(x_train_features)\naccuracy = accuracy_score(y_train,prediction)\nprint(accuracy)\nprediction_test = model_rf.predict(x_test_features)\naccuracy_test = accuracy_score(y_test,prediction_test)\nprint(accuracy_test)'

In [28]:
# Disabled evaluation of the decision-tree experiment: kept as a string
# literal, so no prediction or scoring runs and the cell only echoes the text.
'''prediction = model_dt.predict(x_train_features)
accuracy = accuracy_score(y_train,prediction)
print(accuracy)
prediction_test = model_dt.predict(x_test_features)
accuracy_test = accuracy_score(y_test,prediction_test)
print(accuracy_test)'''

'prediction = model_dt.predict(x_train_features)\naccuracy = accuracy_score(y_train,prediction)\nprint(accuracy)\nprediction_test = model_dt.predict(x_test_features)\naccuracy_test = accuracy_score(y_test,prediction_test)\nprint(accuracy_test)'

In [29]:
# Disabled evaluation of the naive-Bayes experiment: kept as a string
# literal, so no prediction or scoring runs and the cell only echoes the text.
'''prediction = model_nb.predict(x_train_features)
accuracy = accuracy_score(y_train,prediction)
print(accuracy)
prediction_test = model_nb.predict(x_test_features)
accuracy_test = accuracy_score(y_test,prediction_test)
print(accuracy_test)'''

'prediction = model_nb.predict(x_train_features)\naccuracy = accuracy_score(y_train,prediction)\nprint(accuracy)\nprediction_test = model_nb.predict(x_test_features)\naccuracy_test = accuracy_score(y_test,prediction_test)\nprint(accuracy_test)'

In [30]:
# Disabled evaluation of the SVC experiment: kept as a string literal, so no
# prediction or scoring runs and the cell only echoes the text.
'''prediction = model_svc.predict(x_train_features)
accuracy = accuracy_score(y_train,prediction)
print(accuracy)
prediction_test = model_svc.predict(x_test_features)
accuracy_test = accuracy_score(y_test,prediction_test)
print(accuracy_test)'''

'prediction = model_svc.predict(x_train_features)\naccuracy = accuracy_score(y_train,prediction)\nprint(accuracy)\nprediction_test = model_svc.predict(x_test_features)\naccuracy_test = accuracy_score(y_test,prediction_test)\nprint(accuracy_test)'

In [31]:
# Disabled evaluation of the gradient-boosting experiment: kept as a string
# literal, so no prediction or scoring runs and the cell only echoes the text.
'''prediction = model_gb.predict(x_train_features)
accuracy = accuracy_score(y_train,prediction)
print(accuracy)
prediction_test = model_gb.predict(x_test_features)
accuracy_test = accuracy_score(y_test,prediction_test)
print(accuracy_test)'''

'prediction = model_gb.predict(x_train_features)\naccuracy = accuracy_score(y_train,prediction)\nprint(accuracy)\nprediction_test = model_gb.predict(x_test_features)\naccuracy_test = accuracy_score(y_test,prediction_test)\nprint(accuracy_test)'

In [32]:
# One hand-written message to classify, wrapped in a list so it is passed to
# the vectorizer as a collection of documents.
input_mail = [
    "Hello,Are you ready to demonstrate your SQL mastery? Join the electrifying DataRush Coding Contest hosted by Newton School, where top data scientists and SQL aficionados clash!"
]

# Reuse the vectorizer fitted on the training data (transform only, no refit),
# then classify the resulting feature row with the trained model.
input_data_features = fe.transform(input_mail)
pred = model.predict(input_data_features)

In [33]:
# Report one verdict per input message (label 1 = ham, anything else = spam).
# `pred` holds one prediction per entry of `input_mail`; iterating over it
# avoids the "truth value of an array is ambiguous" ValueError that
# `if pred == 1:` raises as soon as more than one message is classified.
# For a single message the printed output is unchanged.
for label in pred:
    print('Ham Mail' if label == 1 else 'Spam Mail')

Ham Mail
