In [1]:
from google.colab import drive

In [2]:
drive.mount('/content/drive/')

Mounted at /content/drive/


In [3]:
!pip install pgmpy

from sklearn.gaussian_process import GaussianProcessClassifier,GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.pipeline import Pipeline
import pandas as pd  # Import pandas library
from sklearn.decomposition import LatentDirichletAllocation
from pgmpy.models import BayesianNetwork
from pgmpy.estimators import ParameterEstimator, BayesianEstimator
from sklearn.compose import ColumnTransformer
import numpy as np
from sklearn.linear_model import ElasticNet





# Load the dataset
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/email.csv")



# Split the data into features (X) and labels (Y)
X = df['Message']
Y = (df['Category'] == 'spam').astype(int)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Define the Gaussian Process Classification pipeline
clf_gp = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('gp', GaussianProcessClassifier(kernel=RBF()))
])

# Convert sparse matrix to dense array
X_train_dense = clf_gp.named_steps['vectorizer'].fit_transform(X_train).toarray()

# Fit the Gaussian Process Classification model
clf_gp.named_steps['gp'].fit(X_train_dense, y_train)

# Convert test data to dense array
X_test_dense = clf_gp.named_steps['vectorizer'].transform(X_test).toarray()

# Make predictions on X_test
y_pred_GP = clf_gp.named_steps['gp'].predict(X_test_dense)

# Evaluate and print accuracy
gp_acc = accuracy_score(y_test, y_pred_GP)
print("Gaussian Process Classification Accuracy:", gp_acc)


Collecting pgmpy
  Downloading pgmpy-0.1.24-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pgmpy
Successfully installed pgmpy-0.1.24
Gaussian Process Classification Accuracy: 0.9479820627802691


In [None]:
df.info()
df['spam'] = (df['Category'] == 'spam').astype(int)

# Split the data into features (X) and target variable (y)
X_regression = df.drop(['IsSpam'], axis=1)  # Assuming 'Category' is the only categorical feature
y_regression = df['IsSpam']

# Split the data into training and testing sets
X_train_regression, X_test_regression, y_train_regression, y_test_regression = train_test_split(
    X_regression, y_regression, test_size=0.2, random_state=42
)

# Define the ColumnTransformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['spam']),  # Replace with actual numeric features
        ('cat', OneHotEncoder(), ['Category', 'Gender'])  # One-hot encode 'Category' and 'Gender'
    ]
)

# Define the Gaussian Process Regression pipeline
regression_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('gp', GaussianProcessRegressor(kernel=C(1.0, (1e-3, 1e3)) * RBF(1.0, (1e-2, 1e2))))
])

# Fit the Gaussian Process Regression model
regression_pipeline.fit(X_train_regression, y_train_regression)


# Make predictions on X_test
y_pred_regression = regression_pipeline.predict(X_test_regression)


# Evaluate and print mean squared error
mse = mean_squared_error(y_test_regression, y_pred_regression)
print("Gaussian Process Regression Mean Squared Error:", mse)

# Define the Elastic Net Regression pipeline
elastic_net_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('elastic_net', ElasticNet(alpha=1.0, l1_ratio=0.5))  # Adjust alpha and l1_ratio as needed
])

# Fit the Elastic Net Regression model
elastic_net_pipeline.fit(X_train_regression, y_train_regression)

# Make predictions on X_test for Elastic Net Regression
y_pred_elastic_net = elastic_net_pipeline.predict(X_test_regression)

# Evaluate and print mean squared error for Elastic Net Regression
mse_elastic_net = mean_squared_error(y_test_regression, y_pred_elastic_net)
print("Elastic Net Regression Mean Squared Error:", mse_elastic_net)



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5573 entries, 0 to 5572
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5573 non-null   object
 1   Message   5573 non-null   object
 2   IsSpam    5573 non-null   bool  
 3   Gender    5573 non-null   object
dtypes: bool(1), object(3)
memory usage: 136.2+ KB
Gaussian Process Regression Mean Squared Error: 6.729052904020631e-24
Elastic Net Regression Mean Squared Error: 0.12105210005318745


In [None]:


vectorizer = CountVectorizer()
X_lda = vectorizer.fit_transform(X)

# Apply Latent Dirichlet Allocation
lda = LatentDirichletAllocation(n_components=5, random_state=42)  # You can adjust the number of topics
lda.fit(X_lda)

# Print the top words for each topic
feature_names = vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(lda.components_):
    top_words_idx = topic.argsort()[:-10 - 1:-1]
    top_words = [feature_names[i] for i in top_words_idx]
    print(f"Topic #{topic_idx + 1}: {', '.join(top_words)}")



Topic #1: you, and, the, is, to, that, in, it, have, he
Topic #2: to, the, call, is, you, my, your, ok, lor, of
Topic #3: you, to, my, me, your, the, in, call, are, and
Topic #4: to, gt, lt, free, for, the, you, or, in, on
Topic #5: to, it, you, the, me, and, is, but, not, will


In [None]:
# Create a Bayesian network model
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/email.csv")
model = BayesianNetwork([('Category','Message'),('Message','IsSpam'),('Category','IsSpam'),('Gender','IsSpam'),('Gender','Category')])

# Ensure that the variable names in the model match the column names in the dataset with proper case sensitivity
model_columns = [col.lower() for col in model.nodes()]
df_columns = [col.lower() for col in df.columns]

print(model.nodes())
if set(model_columns) != set(df_columns):
    raise ValueError("Variable names in the model must be identical to column names in the dataset.")

# Fit the model using Bayesian parameter estimation
model.fit(df, estimator=BayesianEstimator, prior_type="BDeu")

# Print the CPDs (Conditional Probability Distributions)
for cpd in model.get_cpds():
    print(cpd)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
| ... | 0.00012117099651026801   |
+-----+--------------------------+
| ... | 0.00012117099651026801   |
+-----+--------------------------+
| ... | 0.00012117099651026801   |
+-----+--------------------------+
| ... | 0.00012117099651026801   |
+-----+--------------------------+
| ... | 0.00012117099651026801   |
+-----+--------------------------+
| ... | 0.00012117099651026801   |
+-----+--------------------------+
| ... | 0.00012117099651026801   |
+-----+--------------------------+
| ... | 0.00012117099651026801   |
+-----+--------------------------+
| ... | 0.00012117099651026801   |
+-----+--------------------------+
| ... | 0.00012117099651026801   |
+-----+--------------------------+
| ... | 0.00012117099651026801   |
+-----+--------------------------+
| ... | 0.00012117099651026801   |
+-----+--------------------------+
| ... | 0.00012117099651026801   |
+-----+--------------------------+
| ... | 0.000121170996510