# Visualizations

## Understanding and preparing data

In [None]:
# Import required libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load seaborn's bundled Titanic passenger dataset (fetched and cached by seaborn)
df = sns.load_dataset('titanic')

# View the first 10 rows of the DataFrame.
# head() defaults to 5 rows, so the row count is passed explicitly.
df.head(10)

In [None]:
# Basic statistics to understand the data.
# With no arguments, pandas' describe() summarizes only the numeric columns
# (count, mean, std, min, quartiles, max); non-numeric columns are left out.
df.describe()

In [None]:
# Missing-data overview: one heatmap row per DataFrame row, one column per field.
# df.isnull() yields a boolean mask, so cells that are NaN show up in the contrasting colour.
fig, ax = plt.subplots(figsize=(10, 6))
missing_mask = df.isnull()
sns.heatmap(missing_mask, cbar=False, cmap='viridis', ax=ax)
ax.set_title('Hiányzó adatok vizualizációja (heatmap)')
plt.show()

In [None]:
# Age vs. survival: stacked histogram of passenger age, split by the `survived` flag,
# with a kernel density estimate drawn on top of the bars.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    data=df,
    x='age',
    hue='survived',
    multiple='stack',
    kde=True,
    ax=ax,
)
ax.set(
    title='Kor szerinti túlélés (Histplot)',
    xlabel='Kor',
    ylabel='Előfordulás',
)
plt.show()

In [None]:
# Gender vs. survival: passenger counts per sex, with separate bars for each `survived` value
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=df, x='sex', hue='survived', ax=ax)
ax.set(
    title='Nemek szerinti túlélés (Countplot)',
    xlabel='Nem',
    ylabel='Előfordulás',
)
plt.show()

In [None]:
# Visualization to examine the relationship between fare and class
fig, ax = plt.subplots(figsize=(10, 6))

# seaborn >= 0.13 deprecates passing `palette` without `hue`. Mapping `hue` to the
# same column as `x` keeps one colour per class. dodge=False keeps every box centred
# on its tick on seaborn versions that would otherwise offset hue levels.
sns.boxplot(
    data=df,
    x='class',
    y='fare',
    hue='class',
    palette='Set3',
    dodge=False,
    ax=ax,
)

# Some seaborn versions add a legend that only repeats the x-axis labels; remove it if present.
legend = ax.get_legend()
if legend is not None:
    legend.remove()

ax.set_title('Viteldíjak osztály szerint (Boxplot)')
ax.set_xlabel('Osztály')
ax.set_ylabel('Viteldíj')
plt.show()

## Evaluation of model performance

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, auc, classification_report
from sklearn.model_selection import learning_curve
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

In [None]:
# Load the Iris dataset (feature matrix in .data, integer class labels in .target)
iris = load_iris()

# Reduce the task to binary classification (class 0 vs class 1):
# build one boolean mask that drops every class-2 sample, then apply it
# to both the features and the labels so they stay aligned.
binary_mask = iris.target != 2
X = iris.data[binary_mask]
y = iris.target[binary_mask]

In [None]:
# Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize the data for better neural network performance.
# The per-feature maxima come from the TRAINING split only and are applied to both
# splits. Scaling the test split by its own maxima would put the two splits on
# different scales and leak test-set statistics into preprocessing.
# Scaled test values may therefore slightly exceed 1.0.
feature_max = np.max(X_train, axis=0)
X_train = X_train / feature_max
X_test = X_test / feature_max

In [None]:
# Seed Python, NumPy and TensorFlow so weight initialisation and batch shuffling
# are repeatable when the notebook is re-run from a fresh kernel.
tf.keras.utils.set_random_seed(42)

# Build a simple neural network model in TensorFlow/Keras.
# The input shape is declared with an explicit Input object; passing `input_shape=`
# to the first Dense layer is deprecated in current Keras.
model = Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),  # One input per feature column
    Dense(8, activation='relu'),  # First hidden layer
    Dense(1, activation='sigmoid')  # Output layer with sigmoid activation for binary classification
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.01), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model. The held-out test split doubles as validation data here, so the
# per-epoch validation metrics in `history` describe that same split.
history = model.fit(X_train, y_train, epochs=100, batch_size=8, validation_data=(X_test, y_test))

In [None]:
# Make predictions on the test set.
# Inference runs once and both outputs derive from the same array: this avoids a
# redundant second forward pass and guarantees labels and probabilities match.
y_scores = model.predict(X_test)
y_pred = (y_scores > 0.5).astype("int32")  # Hard 0/1 labels, same shape as the model output
y_prob = y_scores.flatten()  # Probabilities for ROC curve

In [None]:
# 1. Confusion Matrix
# Tabulate true labels against predicted labels
conf_matrix = confusion_matrix(y_test, y_pred)

# Render the counts as an annotated heatmap (integer cell labels, no colour bar)
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
ax.set(
    title='Confusion Matrix',
    xlabel='Predicted Label',
    ylabel='True Label',
)
plt.show()

In [None]:
# 2. ROC Curve and AUC
# Compute false/true positive rates from the predicted probabilities
# (the thresholds returned by roc_curve are not used), then integrate for the AUC.
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

# Draw the ROC curve together with the diagonal y = x reference line
fig, ax = plt.subplots(figsize=(6, 5))
ax.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], color='grey', lw=2, linestyle='--')
ax.set(
    xlim=[0.0, 1.0],
    ylim=[0.0, 1.05],
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='Receiver Operating Characteristic (ROC)',
)
ax.legend(loc='lower right')
plt.show()

In [None]:
# 3. Learning Curve
# Plot the per-epoch training/validation accuracy recorded by model.fit()
train_acc = history.history['accuracy']
val_acc = history.history['val_accuracy']

# Derive the epoch axis from the recorded history instead of hard-coding 100,
# so the plot still works if the epoch count changes or training stops early.
epochs = range(1, len(train_acc) + 1)

# Plot learning curve
plt.figure(figsize=(8, 6))
plt.plot(epochs, train_acc, label='Training Accuracy', color='r')
plt.plot(epochs, val_acc, label='Validation Accuracy', color='g')
plt.title('Learning Curve')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend(loc='best')
plt.grid(True)
plt.show()

In [None]:
# 4. Classification Report
# Build the text report for the test predictions, then print it under a heading line
report_text = classification_report(y_test, y_pred)
print("Classification Report:", report_text, sep="\n")

## Detection of anomalies

In [None]:
# Import necessary libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Fetch seaborn's Titanic passenger table again so this section starts from a fresh copy
df = sns.load_dataset(name='titanic')

# Preview the top of the table (five rows, which is also the pandas default)
df.head(n=5)

In [None]:
# 1. Detecting Missing Data
# Heatmap of the boolean null-mask: each column is a field, each row a DataFrame row.
# Row tick labels are switched off.
fig, ax = plt.subplots(figsize=(10, 6))
null_mask = df.isnull()
sns.heatmap(null_mask, cbar=False, cmap='viridis', yticklabels=False, ax=ax)
ax.set_title('Missing Data Heatmap')
plt.show()

In [None]:
# 2. Anomaly Detection in Age Distribution
# Histogram (30 bins) with a KDE overlay of the non-missing ages; isolated bars far
# from the bulk of the distribution are candidates for outliers.
known_ages = df['age'].dropna()

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(known_ages, bins=30, kde=True, ax=ax)
ax.set(
    title='Age Distribution with Potential Outliers',
    xlabel='Age',
    ylabel='Count',
)
plt.show()

## Decision support

In [None]:
# How to work the weights?