In [15]:
import pandas as pd
import numpy as np
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from wordcloud import WordCloud
import nltk
import joblib

In [16]:
# Fetch the NLTK resources used below (the stop-word corpus and the
# 'punkt_tab' tokenizer data); quiet=True suppresses the download log.
for resource in ('stopwords', 'punkt_tab'):
    nltk.download(resource, quiet=True)

# English stop words are kept in a set so membership tests are cheap.
stop_words = set(stopwords.words('english'))
# One shared Porter stemmer instance, reused for every token.
stemmer = PorterStemmer()

In [17]:
# Load the serialized model from disk (presumably already fitted, since it is
# used for prediction further down without any training step).
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code — only load "model.pkl" if it comes from a trusted source.
model = joblib.load("model.pkl")
# Print the loaded object to check its steps and hyper-parameters.
print(model)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_df=0.95, max_features=5000, min_df=2,
                                 ngram_range=(1, 2))),
                ('clf',
                 CalibratedClassifierCV(cv=5,
                                        estimator=LinearSVC(C=1, max_iter=5000,
                                                            random_state=42)))])


In [18]:
# Read the email dataset from CSV; the first column of the file becomes the index.
data=pd.read_csv("DataSet_Emails.csv",index_col=0)

In [19]:
# Drop the rows whose 'text' value is missing.
# `subset` is passed as a list: a bare column label is only accepted by newer
# pandas releases (1.5+), whereas the list form works on every version.
data = data.dropna(subset=['text'])

In [20]:
# Renumber the rows from 0 after the dropna above, discarding the old index
# (drop=True) so it is not kept as an extra column.
data = data.reset_index(drop=True)

In [21]:
# Preview the first rows of the cleaned frame.
data.head()

Unnamed: 0,message_id,text,label,label_text,subject,message,date
0,33214,any software just for 15 $ - 99 $ understandin...,1,spam,any software just for 15 $ - 99 $,understanding oem software\nlead me not into t...,2005-06-18
1,11929,perspective on ferc regulatory action client c...,0,ham,perspective on ferc regulatory action client c...,"19 th , 2 : 00 pm edt\nperspective on ferc reg...",2001-06-19
2,19784,wanted to try ci 4 lis but thought it was way ...,1,spam,wanted to try ci 4 lis but thought it was way ...,viagra at $ 1 . 12 per dose\nready to boost yo...,2004-09-11
3,2209,"enron / hpl actuals for december 11 , 2000 tec...",0,ham,"enron / hpl actuals for december 11 , 2000",teco tap 30 . 000 / enron ; 120 . 000 / hpl ga...,2000-12-12
4,15880,looking for cheap high - quality software ? ro...,1,spam,looking for cheap high - quality software ? ro...,"water past also , burn , course . gave country...",2005-02-13


In [22]:
# Translation table that deletes every ASCII punctuation character. Built once,
# instead of re-escaping and re-assembling a regex pattern for every token.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text_func(text):
    """Turn a raw text into a space-separated string of stemmed tokens.

    The text is lower-cased, tokenised with ``word_tokenize``, stripped of
    ASCII punctuation, filtered against the module-level ``stop_words`` and
    stemmed with the module-level ``stemmer``.

    Args:
        text: The value to clean. Non-string values are converted with
            ``str``; ``None`` and float NaN are treated as missing text.

    Returns:
        str: The stemmed tokens joined by single spaces. An empty string is
        returned when the input is missing or no token survives filtering.

    Raises:
        ValueError: If the module-level ``stop_words`` is ``None``.
    """
    # Fail fast, before any tokenisation work is done.
    if stop_words is None:
        raise ValueError(
            "stop_words is None; initialise the stop-word set before preprocessing"
        )

    # Missing values would otherwise be stringified into the bogus tokens
    # "none" / "nan" (NaN is the only float that differs from itself).
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Tokenisation: split the lower-cased text into tokens.
    tokens = word_tokenize(str(text).lower())

    stemmed_tokens = []
    for token in tokens:
        # Remove punctuation / special characters from the token.
        token = token.translate(_PUNCTUATION_TABLE)
        # Skip tokens left empty by the punctuation removal, and stop words.
        if token and token not in stop_words:
            # Apply stemming to the surviving token.
            stemmed_tokens.append(stemmer.stem(token))

    # Join the tokens back into a single string.
    return ' '.join(stemmed_tokens)

In [23]:
# Run the cleaning function over every email text and store the result in a
# 'text_preprocessed' column next to the raw text, then preview the frame.
data = data.assign(text_preprocessed=data['text'].apply(preprocess_text_func))
data.head()

Unnamed: 0,message_id,text,label,label_text,subject,message,date,text_preprocessed
0,33214,any software just for 15 $ - 99 $ understandin...,1,spam,any software just for 15 $ - 99 $,understanding oem software\nlead me not into t...,2005-06-18,softwar 15 99 understand oem softwar lead temp...
1,11929,perspective on ferc regulatory action client c...,0,ham,perspective on ferc regulatory action client c...,"19 th , 2 : 00 pm edt\nperspective on ferc reg...",2001-06-19,perspect ferc regulatori action client conf ca...
2,19784,wanted to try ci 4 lis but thought it was way ...,1,spam,wanted to try ci 4 lis but thought it was way ...,viagra at $ 1 . 12 per dose\nready to boost yo...,2004-09-11,want tri ci 4 li thought way expens viagra 1 1...
3,2209,"enron / hpl actuals for december 11 , 2000 tec...",0,ham,"enron / hpl actuals for december 11 , 2000",teco tap 30 . 000 / enron ; 120 . 000 / hpl ga...,2000-12-12,enron hpl actual decemb 11 2000 teco tap 30 00...
4,15880,looking for cheap high - quality software ? ro...,1,spam,looking for cheap high - quality software ? ro...,"water past also , burn , course . gave country...",2005-02-13,look cheap high qualiti softwar rotat napoleon...


In [24]:
# Row label of the sample email that is looked up with data.loc[n, ...] below.
n=789

In [25]:
# Raw text of the sample email at row label n.
message_1 = data.at[n, "text"]
print(message_1)

wellhead desk - new headcount this week , the wellhead desk has done more than 150 deals . we need to discuss adding a headcount ( maybe 2 ) , as originally we were just integrating the book with the daily business . at the current pace , this is no longer an option . bob , will this require an additional headcount for logistics as well ? and , who needs to sign off on this on the commercial side ?
thanks for your help ,
jg


In [26]:
# Preprocessed version of the same email, read from the 'text_preprocessed' column.
message = data.at[n, "text_preprocessed"]
print(message)

wellhead desk new headcount week wellhead desk done 150 deal need discuss ad headcount mayb 2 origin integr book daili busi current pace longer option bob requir addit headcount logist well need sign commerci side thank help jg


In [27]:
# Value of the 'label' column for the sample email, to compare with the prediction.
label = data.at[n, "label"]
print(label)

0


In [28]:
# Classify the preprocessed sample email. The single message is wrapped in a
# list so that predict receives a collection of documents and returns one
# prediction per element.
prediction = model.predict([message])
print(prediction)

[0]
