In [2]:
# Feature Extraction From Text

# Most machine learning algorithms cannot take in raw text directly.
# Instead, we perform feature extraction on the raw text so that we can
# pass numerical features to the machine learning algorithm.

In [3]:
import numpy as np
import pandas as pd

In [4]:
# Load the SMS spam collection; the file is tab-separated, hence sep="\t".
df = pd.read_csv("./004 UPDATED-NLP-COURSE/UPDATED_NLP_COURSE/TextFiles/smsspamcollection.tsv" , sep = "\t")

In [5]:
# Preview the first rows to check that the columns were parsed as expected.
df.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [6]:
# Check the class balance: how many messages carry each label.
df["label"].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [7]:
from sklearn.model_selection import train_test_split

In [10]:
# Features: the raw message text, one entry per SMS.
X = df["message"]

In [11]:
# Target: the ham/spam label of each message.
y = df["label"]

In [12]:
# Hold out 33% of the messages for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
# Bag-of-words vectorizer with default settings; its vocabulary is learned
# later, when it is fitted on the training messages.
count_vect = CountVectorizer()

In [18]:
# Fit the vectorizer to the training data (build the vocabulary and count
# the words), then transform the original text messages into count vectors.
#
# The two-step form would be:
#     count_vect.fit(X_train)
#     X_train_counts = count_vect.transform(X_train)
#
# fit_transform performs both steps in a single call.

X_train_counts = count_vect.fit_transform(X_train)

In [19]:
# Inspect the resulting sparse document-term matrix
# (one row per training message, one column per vocabulary term).
X_train_counts

<3733x7082 sparse matrix of type '<class 'numpy.int64'>'
	with 49992 stored elements in Compressed Sparse Row format>

In [20]:
# Number of training messages; this should match the row count of X_train_counts.
X_train.shape

(3733,)

In [21]:
from sklearn.feature_extraction.text import TfidfTransformer

In [22]:
# Transformer that re-weights a matrix of raw term counts into TF-IDF values.
tfidf_transformer = TfidfTransformer()

In [23]:
# Learn the IDF weights from the training counts and apply them in one call.
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [24]:
# The TF-IDF matrix keeps the shape of the count matrix; only the values change.
X_train_tfidf.shape

(3733, 7082)

In [25]:
# The two steps above (CountVectorizer followed by TfidfTransformer) can also
# be done in a single step with TfidfVectorizer.

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [27]:
# Single-step alternative: goes from raw text straight to TF-IDF features.
vectorizer = TfidfVectorizer()

In [28]:
# Fit on the raw training messages and return their TF-IDF matrix in one call.
# Note: this rebinds X_train_tfidf, replacing the matrix produced earlier by
# the two-step CountVectorizer + TfidfTransformer approach.
X_train_tfidf = vectorizer.fit_transform(X_train)

In [29]:
from sklearn.svm import LinearSVC

In [30]:
# Linear support vector classifier with default hyperparameters.
clf  = LinearSVC()

In [31]:
# Train the classifier on the TF-IDF features of the training messages.
clf.fit(X_train_tfidf , y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [32]:
# We can create a Pipeline that bundles the vectorization and the classifier,
# so the same preprocessing is applied to both the training and the test data.
# Multiple steps are then run with a single fit/predict call.

In [33]:
from sklearn.pipeline import Pipeline

In [34]:
# Chain TF-IDF vectorization and the linear SVM into a single estimator so
# that raw text can be passed directly to fit() and predict().
text_clf = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("clf", LinearSVC()),
    ]
)

In [35]:
# Fit the whole pipeline on the raw training text; the vectorizer therefore
# learns its vocabulary from the training messages only.
text_clf.fit(X_train , y_train)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [37]:
# Predict labels for the raw test messages; the pipeline applies the fitted
# vectorizer before handing the features to the classifier.
pred = text_clf.predict(X_test)

In [38]:
from sklearn.metrics import confusion_matrix, classification_report

In [39]:
# Rows are the true labels and columns the predicted labels, in sorted label
# order (ham, spam), so the off-diagonal entries are the misclassifications.
print(confusion_matrix(y_test , pred))

[[1586    7]
 [  12  234]]


In [40]:
# Per-class precision, recall and F1-score on the test set.
print(classification_report(y_test , pred))

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1593
        spam       0.97      0.95      0.96       246

   micro avg       0.99      0.99      0.99      1839
   macro avg       0.98      0.97      0.98      1839
weighted avg       0.99      0.99      0.99      1839



In [41]:
from sklearn import metrics

In [42]:
# Overall accuracy on the held-out test set: we are getting almost 99%.
# Keep the class imbalance in mind: ham is the large majority of the data,
# so accuracy alone flatters the model. The per-class spam metrics in the
# classification report are the more informative numbers.

metrics.accuracy_score(y_test , pred)

0.989668297988037

In [43]:
# Sanity check: classify a single hand-written, ordinary message.
# predict() expects an iterable of documents, so the string is wrapped in a list.

text_clf.predict(["Hi, how are you doing today"])

array(['ham'], dtype=object)

In [47]:
# Sanity check with a prize-style promotional message, to exercise the spam side.
text_clf.predict(["Congratulations! you have been selected as a winner. And you won a free ticket to contest"])

array(['spam'], dtype=object)