# Support Vector Machine (SVM) Classification

Task 1: Exploratory Data Analysis (EDA)

In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn import svm
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix


In [None]:
# Load the mushroom dataset from the local CSV file and preview its first rows.
csv_path = r"C:\Users\dorkar\Documents\conda\DS\Naive Bayes\mushroom.csv"
data = pd.read_csv(csv_path)
data.head()

EDA: Exploratory Data Analysis

In [None]:
# Print a concise summary of the frame: column names, dtypes and non-null counts.
data.info()

In [None]:
# Show the column labels of the freshly loaded frame.
data.columns

In [None]:
# Descriptive statistics (count, mean, spread, quartiles) for the frame's columns.
data.describe()

In [None]:
# Count the missing values in each column.
data.isnull().sum()

In [None]:
# Count the rows that are exact repeats of an earlier row.
data.duplicated().sum()

In [None]:
# Keep only the first occurrence of every duplicated row.
data = data.drop_duplicates()

In [None]:
# Pairwise correlation of the numeric (and boolean) columns only.
# Label-encoding happens in a later cell, so any non-numeric column is still
# raw text here; recent pandas versions raise instead of silently skipping
# such columns when computing correlations.
data.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Show the column labels again after duplicate removal.
data.columns

Data Visualization

In [None]:
# Draw a histogram for each numeric column of the frame.
data.hist()

In [None]:
# Draw box plots of the frame's numeric columns to eyeball potential outliers.
data.boxplot()

In [None]:
# Outlier removal
# Detect outliers using the IQR method on the numeric columns only: quartiles
# are undefined for non-numeric columns (label-encoding only happens in a
# later cell) and recent pandas versions raise instead of skipping them.
numeric_columns = data.select_dtypes(include='number').columns
Q1 = data[numeric_columns].quantile(0.25)
Q3 = data[numeric_columns].quantile(0.75)
IQR = Q3 - Q1


# Define a function to identify outliers
def detect_outliers_iqr(data):
    """Flag every cell that lies outside the 1.5 * IQR fences.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the columns indexed by the module-level
        ``Q1`` / ``Q3`` / ``IQR`` series.

    Returns
    -------
    pandas.DataFrame
        Boolean mask with the same index and columns as ``data``; ``True``
        marks an outlier. Columns that have no quartiles (the non-numeric
        ones) are ``False`` everywhere.
    """
    lower_fence = Q1 - 1.5 * IQR
    upper_fence = Q3 + 1.5 * IQR
    # Compare only the columns the fences were computed for, so both operands
    # of the comparison carry identical labels.
    numeric = data[Q1.index]
    mask = (numeric < lower_fence) | (numeric > upper_fence)
    # Restore the full column set so the summary still has one entry per column.
    return mask.reindex(columns=data.columns, fill_value=False)


# Apply the function to the dataframe
outliers = detect_outliers_iqr(data)

# Print outliers summary (number of flagged cells per column)
print(outliers.sum())

# Remove every row holding at least one outlier. The copy makes the result an
# independent frame, so later column assignments do not write into a slice.
data = data[~outliers.any(axis=1)].copy()
data

In [None]:
# Annotated heatmap of the correlations between numeric (and boolean) columns.
# Non-numeric columns are excluded because they are not encoded yet at this
# point and recent pandas versions raise when asked to correlate them.
sns.heatmap(data.select_dtypes(include=['number', 'bool']).corr(), annot=True)

Data Encoding

In [None]:
# Label-encode every object-typed column in place.
# A separate encoder is fitted for each column and stored in
# ``label_encoders`` (keyed by column name), so any integer code can later be
# mapped back to its original category via ``inverse_transform``. Refitting a
# single shared encoder would keep only the last column's mapping.
label=LabelEncoder()  # kept so ``label`` exists even when no column needs encoding
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    label = LabelEncoder()
    data[column] = label.fit_transform(data[column])
    label_encoders[column] = label
data

In [None]:
# Bar chart of how many samples fall into each value of the 'class' column.
plt.figure(figsize=(12, 6))
class_axes = sns.countplot(x='class', data=data)
class_axes.set_title('Class Distribution')
plt.show()

In [None]:
# Class balance plot.
# NOTE(review): this cell repeats the preceding class-distribution cell
# verbatim — confirm whether both are needed.
plt.figure(figsize=(12, 6))
sns.countplot(data=data, x='class')
plt.title(label='Class Distribution')
plt.show()

In [None]:
# Histogram of the encoded frame's values.
# NOTE(review): the whole frame is passed at once; presumably each column is
# drawn as its own series sharing one set of bins — confirm this is intended.
plt.hist(data)
plt.show()

In [None]:
# Box plot of the encoded frame's values.
# NOTE(review): the whole frame is passed at once; presumably one box is
# drawn per column — confirm this is intended.
plt.boxplot(data)
plt.show()

In [None]:
# Grid of pairwise relationships between the columns of the encoded frame.
sns.pairplot(data)

In [None]:
# Cast the two measurement columns to integers.
# NOTE(review): if these columns hold fractional values the cast discards the
# fractional part — confirm that loss of precision is intended.
data = data.astype({'stalk_height': int, 'cap_diameter': int})
data

In [None]:
# Show the column labels of the prepared frame before splitting it.
data.columns

Split data

In [None]:
# Splitting the data: 'class' is the target, every other column is a feature.
y = data['class']
x = data.drop(columns=['class'])
x, y

In [None]:
# Use 80% of the rows for training and the rest for testing; the fixed seed
# keeps the split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.8,
    random_state=42,
)

In [None]:
# Standardise the features. The scaler is fitted on the training split only
# and the same fitted transform is then applied to both splits.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
x_train, x_test

In [None]:
# Baseline classifier: a support vector machine with a linear kernel.
svm_model=svm.SVC(kernel='linear')

In [None]:
# Train the baseline SVM on the scaled training split.
svm_model.fit(x_train,y_train)

In [None]:
# Predict the class of every sample in the scaled test split.
y_pred=svm_model.predict(x_test)
y_pred

In [None]:
# Per-class precision, recall and F1 for the baseline model.
# scikit-learn expects the ground truth first and the predictions second;
# with the arguments reversed, precision and recall are swapped and the
# support column counts predictions instead of true samples.
print(classification_report(y_test, y_pred))

In [None]:
# Fraction of test samples the baseline model classified correctly
# (ground truth first, predictions second).
print(accuracy_score(y_test, y_pred))

In [None]:
# Confusion matrix with rows = true labels and columns = predicted labels.
# The ground truth must be the first argument; reversing the arguments
# transposes the matrix and contradicts the axis labels used when it is
# plotted as a heatmap.
cm = confusion_matrix(y_test, y_pred)
cm

Task 5: Visualization of SVM Results

In [None]:
# Render the confusion matrix as an annotated heatmap with integer counts.
plt.figure(figsize=(8, 6))
cm_axes = sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False)
cm_axes.set_xlabel('Predicted Labels')
cm_axes.set_ylabel('True Labels')
cm_axes.set_title('Confusion Matrix')
plt.show()

Hyperparameter Tuning

In [None]:
# Search space for the SVM hyperparameters: regularisation strength C,
# kernel type and the kernel coefficient gamma.
param_grid = dict(
    C=[0.1, 1, 10, 100],
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
    gamma=['scale', 'auto'],
)
# Exhaustive search over the grid, scored by accuracy with 5-fold
# cross-validation, fitted on the training data.
clf = GridSearchCV(
    estimator=svm_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
clf.fit(x_train, y_train)

In [None]:
# Predict the test split with the fitted grid-search object.
y_pred=clf.predict(x_test)

In [None]:
# Per-class precision, recall and F1 for the tuned model.
# `classification` is not defined anywhere; the imported helper is
# `classification_report`, which takes the ground truth first.
print(classification_report(y_test, y_pred))

In [None]:
# Fraction of test samples the tuned model classified correctly.
# `accuracy` is not a function in this notebook; the imported helper is
# `accuracy_score`, which takes the ground truth first.
print(accuracy_score(y_test, y_pred))

In [None]:
# Experiment with different SVM hyperparameters
# Example: train one SVC per kernel type (all other settings left at their
# defaults) and report its metrics on the test split.
from sklearn.svm import SVC
kernels = ['linear', 'poly', 'rbf']
for kernel in kernels:
    # fit() returns the estimator itself, so construction and training chain.
    svm_model = SVC(kernel=kernel).fit(x_train, y_train)
    y_pred = svm_model.predict(x_test)

    # Compute every metric first, then print them in a fixed order.
    accuracy = accuracy_score(y_test, y_pred)
    kernel_report = classification_report(y_test, y_pred)
    kernel_confusion = confusion_matrix(y_test, y_pred)

    print("Kernel:", kernel)
    print("Accuracy:", accuracy)
    print(kernel_report)
    print("Confusion Matrix:")
    print(kernel_confusion)