#### 1) Import Data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import cross_val_score, train_test_split
from pandas.plotting import scatter_matrix
import gzip
import shutil

with open("fiverr_data.csv", "rb") as f_in:
    with gzip.open("fiverr_data.csv.gz", "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)
        
# Import Dataset
fiverr_df = pd.read_csv("fiverr_data.csv.gz", compression='gzip')

#### 2) Perform basic EDA

In [None]:
fiverr_df.head()

In [None]:
fiverr_df.info()

In [None]:
fiverr_df.describe()

##### 2.1) Missing Value Check

In [None]:
# Check missing values
fiverr_df.isnull().sum().sort_values(ascending=False)

In [None]:
# X13 has missing value fill with mode
fiverr_df.X13.fillna(fiverr_df.X13.mode()[0],inplace=True)

##### 2.2) Dublicate Check

In [None]:
# Checking for duplicate values
fiverr_df.duplicated().sum()

No dublicate data 

#### 2.3) Remove user id column as that is unique id

In [None]:
fiverr_df.drop(columns=['user_id'], inplace=True)

In [None]:
fiverr_df.head()

### 3) Data Visualization
#### 3.1) Check feature correlation

In [None]:
# Using heatmap to see the relationships
plt.figure(figsize=(30, 16))
sns.heatmap(fiverr_df.corr(), annot = True, cmap = "coolwarm", center=0,linewidths=0.2,cbar_kws={"shrink": 0.8})
plt.xticks(rotation=90, fontsize=8)
plt.yticks(rotation=0, fontsize=8)
plt.tight_layout()
plt.show()

Column X27, X29, X30, X33, X46, X47, X48 are constant column and thus we can remove

In [None]:
fiverr_df = fiverr_df.drop(columns=['X27', 'X29', 'X30','X33', 'X46', 'X47', 'X48'])

#### 3.2) Feature distribution to check skewness

In [None]:
x = fiverr_df.drop("label", axis=1)
y = fiverr_df["label"]

x.hist(figsize=(30, 20))
plt.suptitle("Feature Distributions", fontsize=16)
plt.show()

In [None]:
fiverr_df.skew().sort_values(ascending=False)

#### 3.3) Check outlier using box-plot

In [None]:
# Check/Treat the outliers and do the feature scaling if required.
# Using box plot to see the distribution of each feature
import math

plt.figure(figsize=(20, 20))

numerical_cols = x.select_dtypes(include = ['number']).columns
n_cols = len(numerical_cols)

rows = math.ceil(n_cols / 3) # Calculate rows needed for 3 columns

for i, col in enumerate(numerical_cols):
    plt.subplot(rows, 3, i + 1)
    sns.boxplot(x=x[col])
    plt.title(col)

plt.tight_layout()
plt.show()

#### 5.2) Feature Scaling using StandardScaler

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

#### 5.3) Split data into train/test sets (use stratified sampling while splitting)

In [None]:
y = fiverr_df["label"]

# Train And Test split
# x_train, x_test, y_train, y_test - This is used for Tree based model
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, train_size = 0.75, random_state = 1)

#### 5.3) Model Building and Evaluation

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


# Initialize
models = {
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

# Training And Testing
train_test_results = []

for name, model in models.items():
    model.fit(x_train, y_train)
       
    # Training performance
    y_train_pred = model.predict(x_train)
    train_acc = accuracy_score(y_train, y_train_pred)

    # Testing performance
    y_test_pred = model.predict(x_test)
    test_acc = accuracy_score(y_test, y_test_pred)
   
    # Overfitting condition (threshold = 5%)
    overfitting = "Overfitting" if (train_acc - test_acc) > 0.05 else "Good Generalization"

    # Classification report (macro average)
    report = classification_report(y_test, y_test_pred, output_dict=True)
    
    train_test_results.append({
        "Model": name,
        "Training Accuracy": round(train_acc, 4),
        "Testing Accuracy": round(test_acc, 4),
        "Precision (Macro Avg)": round(report["macro avg"]["precision"], 4),
        "Recall (Macro Avg)": round(report["macro avg"]["recall"], 4),
        "F1-Score (Macro Avg)": round(report["macro avg"]["f1-score"], 4),
        "Overfitting Status": overfitting
    })

# Create summary DataFrame
evaluation_table = pd.DataFrame(train_test_results)

evaluation_table

#### Overall Best Model: Random Forest

1) Highest F1-Score (0.8432) â†’ best balance
2) Highest Precision
3) Very strong Test Accuracy
4) Good generalization (no overfitting

#### 5.4) Confusion Matrix

In [None]:
# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred_test)

print("Confusion Matrix:\n", cm)
# Plot
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=np.unique(y_test),
    yticklabels=np.unique(y_test)
)

plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix")
plt.tight_layout()
plt.show()

In [24]:
import joblib
# Save the model
joblib.dump(final_model, 'spam_detection_model.pkl', compress=7)
# Save the scaler
joblib.dump(scaler, 'scaler.pkl')


['scaler.pkl']