In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the tab-separated SMS spam collection into a DataFrame.
DATA_PATH = "./smsspamcollection.tsv"
df = pd.read_csv(DATA_PATH, sep="\t")

In [4]:
df.head()  # confirm that the data is loaded by previewing the first rows

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [5]:
# Now we should take the raw text and vectorize it,
# but before that we need to check whether the data contains any null values.

In [6]:
# Count the missing values in each column.
df.isna().sum()

label      0
message    0
length     0
punct      0
dtype: int64

In [9]:
# Check the class balance: how many messages carry each label.
df['label'].value_counts()


ham     4825
spam     747
Name: label, dtype: int64

In [10]:
#split data into training set and test set

In [11]:
from sklearn.model_selection import train_test_split

In [14]:
# Features: the raw message text (not yet vectorized).
X = df['message']
X

Unnamed: 0,message
0,"Go until jurong point, crazy.. Available only ..."
1,Ok lar... Joking wif u oni...
2,Free entry in 2 a wkly comp to win FA Cup fina...
3,U dun say so early hor... U c already then say...
4,"Nah I don't think he goes to usf, he lives aro..."
...,...
5567,This is the 2nd time we have tried 2 contact u...
5568,Will ü b going to esplanade fr home?
5569,"Pity, * was in mood for that. So...any other s..."
5570,The guy did some bitching but I acted like i'd...


In [13]:
# Target: the label column (the values seen in this data are 'ham' and 'spam').
y= df['label']
y

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: label, Length: 5572, dtype: object

In [15]:
 # Hold out 33% of the messages for testing; random_state fixes the shuffle so the split is repeatable.
 # NOTE(review): the labels are imbalanced (4825 ham vs 747 spam) - consider stratify=y;
 # confirm before changing, since it would alter every result recorded further down.
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [16]:
# Perform count vectorization.
# Text preprocessing, tokenizing, and the ability to filter out stop words are all
# included in the vectorizer,
# which builds a dictionary of features and transforms the documents into feature vectors.

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

In [18]:
# Vectorizer with default settings; it has not seen any data yet.
count_vect = CountVectorizer()

In [26]:
# X_train is still raw text; CountVectorizer turns it into a matrix of token counts.
#
# This can be done in two explicit steps:
#   count_vect.fit(X_train)                         # build the vocabulary from the training messages
#   X_train_counts = count_vect.transform(X_train)  # convert each message into a vector of counts
#
# Because fit and transform are almost always run together on the training data,
# scikit-learn offers the convenience method fit_transform, which does both in one call.
X_train_counts = count_vect.fit_transform(X_train)

In [29]:
X_train_counts
# The result cannot be read directly, because it is a huge sparse matrix.

# scikit-learn stores it in Compressed Sparse Row (CSR) format.


<3733x7082 sparse matrix of type '<class 'numpy.int64'>'
	with 49992 stored elements in Compressed Sparse Row format>

In [33]:
X_train.shape
# X_train is still the 1-D Series of raw training messages, so its shape only
# gives the number of messages. The vocabulary size (7082 unique tokens) is the
# number of columns of X_train_counts instead.
# Most of those tokens do not occur in any given message, so most counts are 0;
# that is why the count matrix can be stored compactly in sparse form.

(3733,)

In [34]:
# Shape is (number of training messages, number of vocabulary terms).
X_train_counts.shape


(3733, 7082)

In [36]:
# Next, we want to transform the counts into frequencies with TF-IDF,
# combine that step with the vectorizer,
# and then train the classifier and build the pipeline.

# So far, we have read and imported the data,
# done a train/test split,
# and done count vectorization (fit_transform) on the training data.
# In the next lecture, we will combine these with term frequency-inverse document frequency (TF-IDF) analysis.

In [37]:
# As discussed, we can use TF-IDF to give more weight to the words that are more important.
# One way to do this is simply to pass our count-vectorized data into a TfidfTransformer.


In [38]:
from sklearn.feature_extraction.text import TfidfTransformer

In [39]:
# TF-IDF transformer with default settings; it is applied to the count matrix, not to raw text.
tfidf_transformer = TfidfTransformer()

In [40]:
# Learn the IDF weights from the training counts and re-weight those counts in one call.
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [41]:
# Same shape as the count matrix: TF-IDF re-weights the entries, it does not add or drop columns.
X_train_tfidf.shape

(3733, 7082)

In [42]:
# Since it is so common to feed the output of a CountVectorizer into a TfidfTransformer,
# scikit-learn combines the two steps into a single class, TfidfVectorizer.

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [44]:
# Single-step alternative that goes from raw text straight to TF-IDF features.
vectorizer = TfidfVectorizer()

In [45]:
# Fit on the raw training messages. Note: this overwrites the X_train_tfidf that was
# computed earlier via CountVectorizer + TfidfTransformer.
X_train_tfidf = vectorizer.fit_transform(X_train)

In [46]:
# Next, we are going to train a classifier.

In [47]:
from sklearn.svm import LinearSVC


In [48]:
# Linear support vector classifier with default hyperparameters.
clf = LinearSVC()

In [50]:
# Fit the classifier on the vectorized training data together with the y_train labels.
clf.fit(X_train_tfidf,y_train)

LinearSVC()

In [51]:
# So far only our training set has been vectorized against the vocabulary.
# In order to perform analysis on the test data, we would need to repeat the same steps on it,
# which becomes tiresome.
# scikit-learn provides a Pipeline class that behaves as a compound classifier:
# it performs both vectorization and classification.


In [52]:
# So instead of calling fit_transform for the vectorization separately before we can predict,
# we can combine everything into a single pipeline step.

In [53]:
from sklearn.pipeline import Pipeline

In [54]:
# A Pipeline takes a list of (name, step) tuples; each name is an arbitrary
# string label that we choose ourselves.
# Here: TF-IDF vectorization followed by a linear support vector classifier.
text_clf = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', LinearSVC()),
    ]
)

In [55]:
# This pipeline object behaves like a normal scikit-learn classifier (a normal ML model),
# and what is convenient about it is that we can run all of these steps in one go.

In [56]:
# That means we provide the raw X_train directly; the pipeline vectorizes it
# and runs the classifier on the result.


In [57]:
# Sometimes we can have a long pipeline, e.g. removing stop words, lemmatization,
# tokenization, text feature extraction, etc.

In [58]:
# So the pipeline is a convenient way to do this, although right now we only have a small pipeline.

In [59]:
# The way to use a pipeline is the same as for a normal ML model.

In [60]:
# Fit the whole pipeline on the raw training messages; vectorization happens inside the pipeline.
text_clf.fit(X_train,y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

In [65]:
# Test the classifier: predict a label for each held-out raw message.
predictions = text_clf.predict(X_test)


In [68]:
X_test
# These are the raw text messages from the test split.

3245    Squeeeeeze!! This is christmas hug.. If u lik ...
944     And also I've sorta blown him off a couple tim...
1044    Mmm thats better now i got a roast down me! i...
2484        Mm have some kanji dont eat anything heavy ok
812     So there's a ring that comes with the guys cos...
                              ...                        
4944    Check mail.i have mailed varma and kept copy t...
3313    I know you are serving. I mean what are you do...
3652         Want to send me a virtual hug?... I need one
14                    I HAVE A DATE ON SUNDAY WITH WILL!!
4758    hey, looks like I was wrong and one of the kap...
Name: message, Length: 1839, dtype: object

In [66]:
# Predicted label for each message in X_test.
predictions

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'ham'], dtype=object)

In [69]:
from sklearn.metrics import confusion_matrix,classification_report

In [70]:
# Rows are the true labels and columns the predicted labels (ham first, then spam).
cm = confusion_matrix(y_test, predictions)
print(cm)

[[1586    7]
 [  12  234]]


In [71]:
# Per-class precision, recall and F1, plus overall accuracy.
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1593
        spam       0.97      0.95      0.96       246

    accuracy                           0.99      1839
   macro avg       0.98      0.97      0.98      1839
weighted avg       0.99      0.99      0.99      1839



In [72]:
from sklearn import metrics

In [73]:
# Fraction of test messages whose predicted label matches the true label.
metrics.accuracy_score(y_true=y_test, y_pred=predictions)

0.989668297988037

In [74]:
# If we want to predict on a new message, we use the same predict method.

In [75]:
# predict is given a list of documents, so a single message is wrapped in a one-element list.
# An ordinary conversational message.
text_clf.predict(["Hi how are you doing today?"])

array(['ham'], dtype=object)

In [77]:
# A message written in the style of prize/contest spam.
text_clf.predict(["Congratulations! You've been selected as a winner. TEXT won to 44255 congratulations free entry to contest"])

array(['spam'], dtype=object)

In [83]:
# A short message that reuses a couple of words from the spam example ("to contest");
# the model still labels it ham.
text_clf.predict([" hi  to contest"])

array(['ham'], dtype=object)