In [26]:
import pandas as pd

## Dataset #1

In [27]:
BASE_PATH = r"C:\Users\prati\Downloads\wiseconnect datasets"
df1 = pd.read_csv(BASE_PATH + r"\Dataset_5971.csv")[["LABEL", "TEXT"]]

In [28]:
df1["LABEL"] = df1["LABEL"].str.replace("Spam", "spam").replace("Smishing", "smishing")
df1 = df1[df1["LABEL"] != "smishing"]
df1.columns = ["category", "text"]
df1.head()

Unnamed: 0,category,text
0,ham,Your opinion about me? 1. Over 2. Jada 3. Kusr...
1,ham,What's up? Do you want me to come online? If y...
2,ham,So u workin overtime nigpun?
3,ham,"Also sir, i sent you an email about how to log..."
6,ham,Sorry dude. Dont know how i forgot. Even after...


## Dataset #2

In [29]:
df2 = pd.read_csv(BASE_PATH + r"\text-messages.csv")[["category", "text"]]

In [30]:
df2 = df2[df2["text"] != "<REDACTED>"]
df2["category"] = df2["category"].apply(lambda x: "spam" if x == "spam" else "ham")
df2.head()

Unnamed: 0,category,text
0,spam,"Make money while watching YouTube, earn 500P p..."
1,ham,Get up to P125K extra funds for emergencies wi...
9,ham,G to upgrade your home! Just use GCredit to pa...
10,ham,Good news! You can still pay with GGives at yo...
11,ham,Get up to P125K extra funds for emergencies wi...


## Dataset #3

In [31]:
df3 = pd.read_csv(BASE_PATH + r"\spam.csv")[["v1", "v2"]]

In [32]:
df3.columns = ["category", "text"]

## Merge all datasets

In [33]:
dfs = [df1, df2, df3]

In [34]:
final_df = pd.concat(dfs, axis=0)
final_df.shape

(11726, 2)

In [35]:
final_df.groupby("category").count()

Unnamed: 0_level_0,text
category,Unnamed: 1_level_1
ham,10047
spam,1679


In [36]:
import string
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def clean_text(text: str) -> str:
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    text = text.split()
    text = " ".join([token for token in text if token not in stop_words])
    return text

final_df["text"] = final_df["text"].apply(clean_text)
final_df.head()

Unnamed: 0,category,text
0,ham,opinion 1 2 jada 3 kusruthi 4 lovable 5 silent...
1,ham,whats want come online free talk sometime�
2,ham,u workin overtime nigpun
3,ham,also sir sent email log usc payment portal ill...
6,ham,sorry dude dont know forgot even dan reminded ...


In [37]:
final_df.to_csv("fraud_text.csv", index=False)

## Train models

In [38]:
final_df["category"] = final_df["category"].apply(lambda x: 1 if x == "spam" else 0)
final_df.columns = ["is_fraud", "text"]
final_df.head()

Unnamed: 0,is_fraud,text
0,0,opinion 1 2 jada 3 kusruthi 4 lovable 5 silent...
1,0,whats want come online free talk sometime�
2,0,u workin overtime nigpun
3,0,also sir sent email log usc payment portal ill...
6,0,sorry dude dont know forgot even dan reminded ...


In [39]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(final_df["text"], final_df["is_fraud"], test_size=0.33)

In [40]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [41]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

classifier = LogisticRegression()
classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = classifier.predict(X_test)

# Evaluate the accuracy of the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9775193798449613


In [43]:
import pickle
with open("./LogisticReg.pkl", "wb") as fp:
    pickle.dump((vectorizer, classifier), fp)