In [38]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

In [39]:
df = pd.read_csv("../DATA/SMSSpamCollection.csv")

In [40]:
print(df.head())

  Label                                        SMS_Message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [41]:
print(df.columns)

Index(['Label', 'SMS_Message'], dtype='object')


In [42]:
outdir = Path("../OUTPUT/01_Exploratory_Plots")
outdir.mkdir(parents=True, exist_ok=True)

In [43]:
missing_counts = df.isnull().sum()
plt.figure(figsize=(6,4))
sns.barplot(x=missing_counts.index, y=missing_counts.values, palette="Set2")
plt.title("Count of Missing Values per Column")
plt.ylabel("Number of Missing Values")
plt.xlabel("Columns")
plt.tight_layout()
plt.savefig(outdir / "Missing_Values.png", dpi=300)
plt.close()


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=missing_counts.index, y=missing_counts.values, palette="Set2")


In [44]:
plt.figure(figsize=(6,4))
sns.countplot(x="Label", data=df, palette="Set1")
plt.title("Class Distribution of SMS Messages")
plt.xlabel("Message Type")
plt.ylabel("Count")
plt.tight_layout()
plt.savefig(outdir / "Class_Distribution.png", dpi=300)
plt.close()


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(x="Label", data=df, palette="Set1")


In [45]:
msg_lengths = df["SMS_Message"].apply(len)

plt.figure(figsize=(8,5))
sns.histplot(x=msg_lengths, hue=df["Label"], bins=50, kde=True, palette="Set1")
plt.title("Distribution of Message Lengths by Class")
plt.xlabel("Message Length (characters)")
plt.ylabel("Frequency")
plt.xlim(0, 300)
plt.tight_layout()
plt.savefig(outdir / "Message_Length_Distribution.png", dpi=300)
plt.close()