## Tratamento dos dados

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_csv("../data/Phishing_Email.csv", encoding='latin-1')

df

Unnamed: 0.1,Unnamed: 0,Email Text,Email Type
0,0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email
1,1,the other side of * galicismos * * galicismo *...,Safe Email
2,2,re : equistar deal tickets are you still avail...,Safe Email
3,3,\nHello I am your hot lil horny toy.\n I am...,Phishing Email
4,4,software at incredibly low prices ( 86 % lower...,Phishing Email
...,...,...,...
18645,18646,date a lonely housewife always wanted to date ...,Phishing Email
18646,18647,request submitted : access request for anita ....,Safe Email
18647,18648,"re : important - prc mtg hi dorn & john , as y...",Safe Email
18648,18649,press clippings - letter on californian utilit...,Safe Email


In [2]:
df = df.drop(columns=["Unnamed: 0"])
df = df.rename(columns={"Email Text": "texto"})
print(df["Email Type"].value_counts())
print("Total: ", df["Email Type"].value_counts().sum())

Email Type
Safe Email        11322
Phishing Email     7328
Name: count, dtype: int64
Total:  18650


In [3]:
df["texto"].replace('empty',np.nan,inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["texto"].replace('empty',np.nan,inplace=True)


In [4]:
df = df.dropna()
print(df.value_counts().sum())
df

18101


Unnamed: 0,texto,Email Type
0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email
1,the other side of * galicismos * * galicismo *...,Safe Email
2,re : equistar deal tickets are you still avail...,Safe Email
3,\nHello I am your hot lil horny toy.\n I am...,Phishing Email
4,software at incredibly low prices ( 86 % lower...,Phishing Email
...,...,...
18644,\nRick Moen a ÃÂ©crit:> > I'm confused. I th...,Safe Email
18645,date a lonely housewife always wanted to date ...,Phishing Email
18646,request submitted : access request for anita ....,Safe Email
18647,"re : important - prc mtg hi dorn & john , as y...",Safe Email


In [5]:
df["texto"] = df["texto"].str.lower()

df = df.drop_duplicates(subset='texto', keep='first')
print(df.value_counts().sum())

17536


In [6]:
df["Email Type"] = df["Email Type"].map({"Safe Email": 0, "Phishing Email": 1})


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Email Type"] = df["Email Type"].map({"Safe Email": 0, "Phishing Email": 1})


In [7]:
import unicodedata
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

def remove_accents(input_str):
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    return ''.join([c for c in nfkd_form if not unicodedata.category(c) == 'Mn'])

def remove_stopwords(text):
    words = text.split()
    return " ".join([word for word in words if word not in ENGLISH_STOP_WORDS])

def preprocess_text(text):
    if pd.isna(text):
        return ""
    text = text.lower()
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = remove_accents(text)
    text = remove_stopwords(text)
    return text.strip()

df["texto"] = df["texto"].apply(preprocess_text)

print(df.head())


                                               texto  Email Type
0  6 1100 disc uniformitarianism 1086 sex lang di...           0
1  galicismos galicismo spanish term names improp...           0
2  equistar deal tickets available assist robert ...           0
3  hello hot lil horny toy dream open minded pers...           1
4  software incredibly low prices 86 lower draper...           1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["texto"] = df["texto"].apply(preprocess_text)


In [8]:
from gensim.models import Word2Vec

def word2vec(textos_tokenizados, vector_size=200, window=6, min_count=2):
    model = Word2Vec(
        sentences=textos_tokenizados,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        sg=1,
        workers=4
    )
    return model

def vetor_medio(texto, model):
    palavras = texto.split()
    vetores = [model.wv[p] for p in palavras if p in model.wv]
    if vetores:
        return np.mean(vetores, axis=0)
    else:
        return np.zeros(model.vector_size)

def word2vec_transform(df, model):
    vetores = df["texto"].apply(lambda x: vetor_medio(x, model))
    matriz = np.vstack(vetores.values)

    df_w2v = pd.DataFrame(matriz, columns=[f"w2v_{i}" for i in range(model.vector_size)])
    df_w2v["Email Type"] = df["Email Type"].values

    return df_w2v, model

df["tokens"] = df["texto"].apply(lambda x: x.lower().split())

modelo_w2v = word2vec(df["tokens"].tolist())

df_final, modelo_w2v = word2vec_transform(df, modelo_w2v)

print(df_final.head())


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["tokens"] = df["texto"].apply(lambda x: x.lower().split())


      w2v_0     w2v_1     w2v_2     w2v_3     w2v_4     w2v_5     w2v_6  \
0  0.139022  0.029635  0.072710  0.083856  0.139819 -0.202052  0.078245   
1  0.208390  0.044874 -0.061496  0.114951  0.121517 -0.093282  0.142455   
2 -0.092116  0.152545 -0.059041 -0.110457  0.024706 -0.400563  0.198103   
3  0.121433  0.040305  0.033057  0.004016  0.264080 -0.244418 -0.002423   
4  0.218585  0.069286  0.118228 -0.022735  0.248860 -0.198565  0.078934   

      w2v_7     w2v_8     w2v_9  ...   w2v_191   w2v_192   w2v_193   w2v_194  \
0  0.491132 -0.107505 -0.063613  ... -0.018377 -0.078192 -0.147558 -0.027903   
1  0.448644 -0.159832 -0.109935  ... -0.092825 -0.045952 -0.137861 -0.078458   
2  0.521491  0.017115 -0.213225  ...  0.063414 -0.064814 -0.432174 -0.110270   
3  0.366948 -0.020339 -0.048855  ...  0.078867 -0.101572 -0.197652 -0.003938   
4  0.485689 -0.111653 -0.027804  ...  0.059646 -0.078572 -0.229225 -0.013285   

    w2v_195   w2v_196   w2v_197   w2v_198   w2v_199  Email Type  
0 

In [9]:
# Save the final DataFrame to a CSV file
df_final.to_csv("../data/df_processed.csv", index=False)
modelo_w2v.save("../models/models_w2v/vmodelo_w2v.model")
