# TF - IDF Implementation

######  TF-IDF (Term Frequency-Inverse Document Frequency) is a numerical statistic used to reflect how important a word is to a document in a collection or corpus.
# 
######  - Term Frequency (TF): Measures how frequently a term appears in a document.
######  - Inverse Document Frequency (IDF): Measures how important a term is by weighing down the frequent terms while scaling up the rare ones.
# 
######  The TF-IDF value increases proportionally to the number of times a word appears in the document but is offset by the frequency of the word in the corpus.
 
###### TF-IDF is widely used in information retrieval and text mining for tasks like keyword extraction and document similarity.

In [3]:
import pandas as pd
message = pd.read_csv('ham_spam_1000.csv',names=['label','message'])

In [4]:
message

Unnamed: 0,label,message
0,label,message
1,ham,Can you help me with the math homework?
2,spam,"Dear user, get rich quick with our crypto system!"
3,ham,Your order has been shipped and will arrive soon.
4,spam,Free entry to the VIP club for today only.
...,...,...
996,spam,You’ve been chosen for a free cruise to the Ba...
997,ham,Can we reschedule our call to tomorrow?
998,ham,Happy birthday! Hope you have a great day.
999,spam,This message is from the IRS. Pay now or face ...


In [7]:
# Data cleaning
import re
import nltk
from nltk.stem import  WordNetLemmatizer
from nltk.corpus import stopwords

wnl = WordNetLemmatizer()

In [9]:
corpus = []
for i in range(len(message)):
    review = re.sub('[^a-zA-Z]',' ', message['message'][i])
    review = review.lower()
    review = review.split()
    review = [ wnl.lemmatize(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

In [10]:
corpus

['message',
 'help math homework',
 'dear user get rich quick crypto system',
 'order shipped arrive soon',
 'free entry vip club today',
 'lunch new cafe',
 'claim free reward point',
 'pm',
 'message irs pay face arrest',
 'congratulation lucky winner ipad',
 'meeting postponed pm',
 'exclusive deal buy one get one free today',
 'lottery claim',
 'hey still tonight',
 'forget submit assignment',
 'help math homework',
 'earn money sleep start today',
 'dear user get rich quick crypto system',
 'lowest price medicine order without prescription',
 'happy birthday hope great day',
 'let finish project today',
 'act fast hour left claim prize',
 'exclusive deal buy one get one free today',
 'congratulation lucky winner ipad',
 'lowest price medicine order without prescription',
 'exclusive deal buy one get one free today',
 'exclusive deal buy one get one free today',
 'chosen free cruise bahamas',
 'netflix account suspended login fix',
 'free entry vip club today',
 'act fast hour left

# create TF IDF and N-gram


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=1000)

# Use fit_transform() to learn vocabulary and transform corpus
X = tfidf.fit_transform(corpus).toarray()


In [14]:
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(1001, 100))

In [17]:
for i in X:
    print(i)


[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.57735027 0.         0.
 0.57735027 0.         0.         0.         0.         0

In [18]:
tfidf = TfidfVectorizer(max_features=1000,ngram_range=(2,2))
x = tfidf.fit_transform(corpus).toarray()
x

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(1001, 124))

In [19]:
tfidf.vocabulary_

{'help math': np.int64(52),
 'math homework': np.int64(71),
 'dear user': np.int64(27),
 'user get': np.int64(116),
 'get rich': np.int64(49),
 'rich quick': np.int64(99),
 'quick crypto': np.int64(94),
 'crypto system': np.int64(24),
 'order shipped': np.int64(86),
 'shipped arrive': np.int64(102),
 'arrive soon': np.int64(4),
 'free entry': np.int64(44),
 'entry vip': np.int64(35),
 'vip club': np.int64(117),
 'club today': np.int64(19),
 'lunch new': np.int64(70),
 'new cafe': np.int64(81),
 'claim free': np.int64(15),
 'free reward': np.int64(45),
 'reward point': np.int64(98),
 'message irs': np.int64(76),
 'irs pay': np.int64(59),
 'pay face': np.int64(89),
 'face arrest': np.int64(37),
 'congratulation lucky': np.int64(21),
 'lucky winner': np.int64(69),
 'winner ipad': np.int64(121),
 'meeting postponed': np.int64(74),
 'postponed pm': np.int64(90),
 'exclusive deal': np.int64(36),
 'deal buy': np.int64(26),
 'buy one': np.int64(9),
 'one get': np.int64(85),
 'get one': np.int6