In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from imblearn.over_sampling import SMOTE
import joblib

# Load the raw SMS corpus (read as latin-1) and keep only the two columns used
# below: 'v1' is renamed to 'label' and 'v2' to 'text'. Rows that repeat an
# earlier (label, text) pair are dropped; the index is left as-is (not reset).
df=(
    pd.read_csv('../data/spam.csv',encoding='latin-1')[['v1','v2']]
    .rename(columns={'v1':'label','v2':'text'})
    .drop_duplicates()
)

In [3]:
# Character count of each message. This runs before the later cells overwrite
# df['text'] with cleaned text, so the length is that of the text as loaded.
df['text_size']=df['text'].map(len)


In [4]:
# Replace the 'label' column with integer codes produced by a LabelEncoder
# fitted on that same column.
# NOTE(review): presumably this yields ham -> 0 and spam -> 1 (the majority
# class is printed as 0 further down) — confirm via la.classes_.
la=LabelEncoder()
df['label']=la.fit_transform(df['label'])


In [5]:
def clean_text(text):
    """Normalise a message into lowercase, single-space-separated word tokens.

    Runs of digits and every non-word character (anything the regex class
    ``\\W`` matches, i.e. not a letter, digit or underscore) are turned into
    spaces; whitespace runs are then collapsed to one space and the ends are
    trimmed.

    Args:
        text: The message as a ``str``.

    Returns:
        The cleaned string. It is empty when the input held nothing but
        digits, punctuation and whitespace.
    """
    cleaned=text.lower()
    # The whitespace pattern must come last: it collapses the padding spaces
    # that the digit and non-word substitutions introduce.
    for pattern in (r'\d+',r'\W',r'\s+'):
        cleaned=re.sub(pattern,' ',cleaned)
    return cleaned.strip()
# Overwrite the text column with its cleaned form (element-wise clean_text).
df['text']=df['text'].map(clean_text)

In [6]:
# Fetch the two NLTK packages this cell relies on, then build the English
# stop-word set and the WordNet lemmatiser that lemmatise() reads as globals.
for _resource in ('stopwords','wordnet'):
    nltk.download(_resource)
stopword=set(stopwords.words('english'))
lemmetiser=WordNetLemmatizer()

def lemmatise(text):
    """Remove stop words from ``text`` and lemmatise the tokens that remain.

    The text is split on whitespace. Tokens found in the module-level
    ``stopword`` set are skipped; every other token is passed through the
    module-level ``lemmetiser``.

    Args:
        text: The message as a ``str``.

    Returns:
        The surviving lemmas joined by single spaces, in their original order.
    """
    kept=[]
    for token in text.split():
        if token in stopword:
            continue
        kept.append(lemmetiser.lemmatize(token))
    return " ".join(kept)
# Overwrite the text column with its stop-word-free, lemmatised form.
df['text']=df['text'].map(lemmatise)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
# Feature matrix: TF-IDF weights of the processed text with one extra trailing
# column holding the message length; the target is the encoded label column.
vectorizer=TfidfVectorizer()
text=vectorizer.fit_transform(df['text'])
# Shape the lengths as a single column so they can be stacked beside the
# TF-IDF block.
# NOTE(review): the length is appended unscaled next to the TF-IDF weights —
# confirm that is intended, since the next cell resamples on this matrix.
size=df['text_size'].to_numpy().reshape(-1,1)
X=hstack([text,size])
y=df['label']

In [8]:
# Oversample with SMOTE so that both labels end up with the same number of rows
# (see the counts printed by the next cell). random_state is pinned to 2.
# NOTE(review): resampling is applied to the whole of X; no train/test split is
# visible before this point — confirm that any later evaluation does not score
# on synthetic rows.
smote=SMOTE(random_state=2)
x_sm,y_sm=smote.fit_resample(X,y)


In [9]:
# Class-balance check: label counts before and after resampling.
for _heading,_labels in (("Before SMOTE:\n",y),("After SMOTE:\n",y_sm)):
    print(_heading,_labels.value_counts())


Before SMOTE:
 label
0    4516
1     653
Name: count, dtype: int64
After SMOTE:
 label
0    4516
1    4516
Name: count, dtype: int64


In [10]:
# Write the resampled feature matrix, the resampled labels and the fitted
# TF-IDF vectorizer to ../data as joblib pickles. The final call is the cell's
# last expression, so its return value is what the cell displays.
# NOTE(review): the fitted LabelEncoder `la` is not saved here — confirm that
# consumers of these files do not need the original label names.
joblib.dump(x_sm,'../data/feature.pkl')
joblib.dump(y_sm,'../data/label.pkl')
joblib.dump(vectorizer,'../data/vectorizer.pkl')

['../data/vectorizer.pkl']