<a href="https://colab.research.google.com/github/rickic45/Data_Mining_Titanic/blob/main/data_mining_titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Data Understanding & Preparation


In [1]:
# Import all necessary libraries for our analysis. We will use:

import numpy as np              # For mathematical operations and array manipulation
import pandas as pd             # For handling and analyzing tabular data
import matplotlib.pyplot as plt # For creating plots
import seaborn as sns           # For advanced statistical visualizations

# Specific machine learning libraries from scikit-learn
from sklearn import datasets                                    # To load datasets
from sklearn.model_selection import train_test_split            # To split data into training and test sets
from sklearn.model_selection import GridSearchCV, cross_val_score # For hyperparameter tuning and validation
from sklearn.preprocessing import StandardScaler, MinMaxScaler  # For data normalization
from sklearn.decomposition import PCA                           # For dimensionality reduction
from sklearn.pipeline import Pipeline                           # To create preprocessing+model pipelines

# Evaluation metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Classification models
from sklearn.linear_model import LogisticRegression    # Logistic Regression
from sklearn.neighbors import KNeighborsClassifier     # K-Nearest Neighbors
from sklearn.svm import SVC                            # Support Vector Machine
from sklearn.tree import DecisionTreeClassifier        # Decision Tree
from sklearn.ensemble import (                         # Ensemble methods
    RandomForestClassifier,
    GradientBoostingClassifier
)
from sklearn.neural_network import MLPClassifier      # Multi-layer Perceptron (neural networks)

# To save models
import joblib

# To ignore warnings (optional)
import warnings
warnings.filterwarnings('ignore')


In [2]:
# Load Titanic
import os

titanic_path = "titanic.csv"

# Only ask for an interactive upload when the CSV is not already in the working
# directory. An unconditional upload prompt blocks "Restart & Run All" and has to
# be interrupted by hand when the file is already present.
uploaded = {}
if not os.path.exists(titanic_path):
    from google.colab import files  # Colab-only helper; imported lazily on purpose
    uploaded = files.upload()

df = pd.read_csv(titanic_path)

# Create new features
# FamilySize = SibSp + Parch + 1 (the +1 presumably counts the passenger themself)
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
# IsAlone flags rows whose FamilySize is exactly 1
df['IsAlone'] = (df['FamilySize'] == 1).astype(int)
# HasCabin is 1 when a Cabin value is present, 0 when it is null
df['HasCabin'] = df['Cabin'].notnull().astype(int)

# Extract Title from Name: the word that follows a space and ends with a period.
# A raw string is used so that `\.` reaches the regex engine as written instead of
# being parsed as an (invalid) Python string escape.
df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# Collapse spelling variants and infrequent titles into a handful of groups.
title_groups = {
    'Miss': ['Mlle', 'Ms'],
    'Mrs': ['Mme', 'Lady', 'Countess', 'Dona'],
    'Rare': ['Capt', 'Col', 'Major', 'Dr', 'Rev', 'Jonkheer', 'Don', 'Sir'],
}
title_mapping = {
    variant: group
    for group, variants in title_groups.items()
    for variant in variants
}
df['Title'] = df['Title'].replace(title_mapping)

# Drop unused columns (their information now lives in Title / HasCabin, or is not used)
df = df.drop(['Name', 'Ticket', 'Cabin'], axis=1)


KeyboardInterrupt: 

In [None]:
# Count of null entries in every column
missing_per_column = df.isnull().sum()
print("Missing values:")
print(missing_per_column)

# Number of rows that are exact repeats of an earlier row
n_duplicates = df.duplicated().sum()
print("Duplicates:", n_duplicates)

# Columns that hold exactly one distinct non-null value
distinct_counts = df.nunique()
constant_cols = distinct_counts[distinct_counts == 1].index.tolist()
print("Constant columns:", constant_cols)


In [None]:

# Histogram grid for the columns of df (20 bins per panel)
hist_options = {'bins': 20, 'figsize': (15, 10)}
df.hist(**hist_options)
plt.suptitle("Histograms of Numeric Variables")
plt.tight_layout()
plt.show()


In [None]:

# List of numerical columns to plot
numerical_columns = ['Age', 'Fare', 'SibSp', 'Parch']

# One subplot row per column, stacked vertically. squeeze=False keeps `axes`
# two-dimensional even for a single column; it is flattened right after so that
# `axes[i]` still addresses the i-th panel.
fig, axes = plt.subplots(
    len(numerical_columns), 1,
    figsize=(8, 4 * len(numerical_columns)),
    squeeze=False,
)
axes = axes.ravel()

# Use the first 'Set2' colour explicitly: passing `palette` without `hue` is
# deprecated in recent seaborn releases.
box_color = sns.color_palette('Set2')[0]

# Draw one boxplot per numerical column instead of repeating the same stanza.
for ax, column in zip(axes, numerical_columns):
    sns.boxplot(data=df, y=column, color=box_color, ax=ax)
    ax.set_title(f'Boxplot for {column}')
    ax.set_xlabel('Values')
    ax.set_ylabel(column)

# Adjust layout for better spacing
plt.tight_layout()

# Show the plot
plt.show()


In [None]:

# Set the correlation threshold (e.g., 0.9)
threshold = 0.9

# Create a correlation matrix over the numeric columns
corr = df.corr(numeric_only=True)

# Keep only the strict upper triangle of |corr|: this drops the diagonal (every
# column correlates 1.0 with itself) and looks at each pair of columns once.
upper_mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)
upper_corr = corr.abs().where(upper_mask)

# A column is flagged when it is highly correlated with a column that comes
# before it in the matrix, so one member of each correlated pair is kept.
# (The previous filter also required `column not in corr.columns`, which is
# never true, so no column could ever be flagged.)
high_corr_columns = [
    column for column in upper_corr.columns
    if (upper_corr[column] > threshold).any()
]

# Report which columns exceed the threshold
print(f"Columns with correlation higher than {threshold}: {high_corr_columns}")

# If no columns were flagged, continue with the original df
if high_corr_columns:
    df_reduced = df.drop(columns=high_corr_columns)
else:
    print("No columns were removed. The dataset may already be sparse.")
    df_reduced = df

# Plot the correlation matrix for the reduced dataframe (only if it is not empty)
if not df_reduced.empty:
    # Create the correlation matrix for the reduced dataframe
    corr_reduced = df_reduced.corr(numeric_only=True)

    # Set figure size for better readability
    plt.figure(figsize=(12, 8))

    # Annotated heatmap with two-decimal cell labels
    sns.heatmap(corr_reduced, annot=True, fmt=".2f", cmap='viridis', annot_kws={'size': 10},
                cbar_kws={'shrink': 0.8}, linewidths=0.5, square=True)

    # Title and display
    plt.title(f"Reduced Correlation Matrix (Threshold > {threshold})", fontsize=16)
    plt.show()
else:
    print("No data to plot. The dataframe is empty after applying the threshold.")


In [None]:
# Remaining null count per column (displayed as the cell's output)
df.isna().sum()


In [None]:
# One-hot encode df into df_encoded; drop_first=True omits the first level of each
# encoded column so the indicator columns are not redundant.
df_encoded = pd.get_dummies(data=df, drop_first=True)


In [None]:
# Example transform on Fare: log1p computes log(1 + x), so a fare of 0 maps to 0
fare_values = df['Fare']
df['log_Fare'] = np.log1p(fare_values)


In [None]:
# Write the current df to disk without the index column
df.to_csv(path_or_buf="titanic_cleaned.csv", index=False)


In [None]:
# Pairwise scatter plots of the selected columns, with KDE curves on the diagonal
pairplot_columns = ['Age', 'Fare', 'Pclass', 'SibSp', 'Parch']
sns.pairplot(df[pairplot_columns], diag_kind='kde')


In [None]:
# Work on a copy so df itself is left untouched
df_corr = df.copy()

# Re-attach the target from the raw file only if it is actually missing; when it is
# already present there is no reason to re-read the CSV and overwrite the column.
if 'Survived' not in df_corr.columns:
    df_corr['Survived'] = pd.read_csv("titanic.csv")['Survived']

# Correlation of every numeric column with Survived, strongest positive first
corr_target = df_corr.corr(numeric_only=True)['Survived'].sort_values(ascending=False)
print(corr_target)


Clustering

In [None]:

# Reload the passenger data from titanic.csv (this rebinds df)
df = pd.read_csv("titanic.csv")

# Columns used for clustering; rows with a null in any of them are discarded
cluster_features = ['Age', 'Fare', 'Pclass', 'SibSp', 'Parch']
df_clust = df[cluster_features].dropna()

# Standardise the features (important for distance-based clustering)
scaler = StandardScaler()
scaler.fit(df_clust)
X_scaled = scaler.transform(df_clust)


In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Candidate cluster counts: 1 through 10
k_range = range(1, 11)
inertia = []

# Fit one K-Means model per k and record its inertia_
for k in k_range:
    km = KMeans(n_clusters=k, random_state=42).fit(X_scaled)
    inertia.append(km.inertia_)

# Elbow curve: inertia against number of clusters
plt.plot(k_range, inertia, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.title('Elbow Method For Optimal k')
plt.show()


In [None]:
# Fit K-Means with 3 clusters and store each row's cluster label
kmeans = KMeans(n_clusters=3, random_state=42).fit(X_scaled)
df_clust['kmeans_cluster'] = kmeans.labels_

# Per-cluster mean of every column in df_clust
cluster_means = df_clust.groupby('kmeans_cluster').mean()
print(cluster_means)


In [None]:
from sklearn.neighbors import NearestNeighbors


# Query the 5 nearest neighbours of every point to help pick a DBSCAN epsilon
neigh = NearestNeighbors(n_neighbors=5)
nbrs = neigh.fit(X_scaled)
neighbor_distances, _ = nbrs.kneighbors(X_scaled)

# Take the last (5th) neighbour column and sort it ascending.
# NOTE(review): the query points are the fitted points themselves, so the first
# column is presumably each point's zero distance to itself - confirm the axis label.
distances = np.sort(neighbor_distances[:, -1])

plt.plot(distances)
plt.xlabel("Points")
plt.ylabel("Distance to 5th nearest neighbor")
plt.title("K-distance Graph for DBSCAN")
plt.show()


In [None]:
# DBSCAN is not imported anywhere else in the notebook; without this import the
# cell fails with NameError.
from sklearn.cluster import DBSCAN

# Density-based clustering on the scaled features
db = DBSCAN(eps=0.5, min_samples=5)
df_clust['dbscan_cluster'] = db.fit_predict(X_scaled)

# Count cluster labels (DBSCAN assigns the label -1 to noise points)
print(df_clust['dbscan_cluster'].value_counts())


In [None]:
from sklearn.decomposition import PCA
import seaborn as sns

# Project the scaled features onto two principal components for plotting
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
component_1, component_2 = X_pca[:, 0], X_pca[:, 1]

# (label column, palette, title) for each clustering result to draw
cluster_plots = [
    ('kmeans_cluster', 'Set2', "K-Means Clustering (2D PCA)"),   # K-Means plot
    ('dbscan_cluster', 'Set1', "DBSCAN Clustering (2D PCA)"),    # DBSCAN plot
]

for label_column, palette_name, plot_title in cluster_plots:
    plt.figure()
    sns.scatterplot(x=component_1, y=component_2,
                    hue=df_clust[label_column], palette=palette_name)
    plt.title(plot_title)
    plt.show()


In [None]:
# Number of rows assigned to each K-Means cluster
sns.countplot(x='kmeans_cluster', data=df_clust)
plt.title("K-Means Cluster Distribution")
# Render explicitly, like the other plotting cells, so the cell does not end on a
# bare Text(...) repr.
plt.show()


In [None]:
# Distribution of each listed feature within every K-Means cluster, one figure each
violin_features = ['Age', 'Fare']
for col in violin_features:
    ax = sns.violinplot(data=df_clust, x='kmeans_cluster', y=col)
    ax.set_title(f"{col} by Cluster")
    plt.show()


Classification

In [None]:
from sklearn.model_selection import train_test_split

# Target plus the predictor columns kept for classification
model_columns = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
categorical_columns = ['Sex', 'Embarked']

# Load the dataset, keep those columns, and discard rows with any missing value
df = pd.read_csv("titanic.csv")[model_columns].dropna()

# One-hot encode the categorical features (first level of each is dropped)
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Separate predictors from the target
y = df['Survived']
X = df.drop(columns='Survived')

# Hold out 20% of the rows for testing, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)


In [None]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

# Fit on the training split (fit returns the estimator itself)
tree = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict on the held-out split and report per-class metrics
y_pred_tree = tree.predict(X_test)
tree_report = classification_report(y_test, y_pred_tree)
print("Decision Tree:\n", tree_report)


In [None]:
# K-Nearest Neighbors (KNN)
from sklearn.neighbors import KNeighborsClassifier

# KNN is distance-based, so features on a large numeric scale would dominate the
# neighbour search if left unscaled. Standardise inside a pipeline so the scaler is
# fit on the training split only and applied consistently at prediction time.
knn = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=5)),
])
knn.fit(X_train, y_train)

y_pred_knn = knn.predict(X_test)
print("KNN:\n", classification_report(y_test, y_pred_knn))


In [None]:
# Naive Bayes
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian Naive Bayes model on the training split
nb = GaussianNB().fit(X_train, y_train)

# Predict on the held-out split and report per-class metrics
y_pred_nb = nb.predict(X_test)
nb_report = classification_report(y_test, y_pred_nb)
print("Naive Bayes:\n", nb_report)


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc


# Confusion matrix for the Decision Tree predictions
cm = confusion_matrix(y_test, y_pred_tree)
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm)
cm_display.plot()
plt.title("Decision Tree Confusion Matrix")
plt.show()

# ROC curve, using the predicted probability of the second class column
y_proba_tree = tree.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_proba_tree)
roc_auc = auc(fpr, tpr)

roc_label = f'Decision Tree (AUC = {roc_auc:.2f})'
diagonal = [0, 1]  # dashed reference line from (0, 0) to (1, 1)
plt.plot(fpr, tpr, label=roc_label)
plt.plot(diagonal, diagonal, '--')
plt.title("ROC Curve")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()
plt.show()


In [None]:
from sklearn.tree import plot_tree

plt.figure(figsize=(12,6))
# Pass the feature names as a plain list: some scikit-learn releases validate
# `feature_names` as a list and reject a pandas Index.
plot_tree(tree, feature_names=list(X.columns), class_names=["No", "Yes"], filled=True)
plt.title("Decision Tree Structure")
plt.show()


Pattern Mining

In [None]:

# Load the dataset, keep only the categorical columns used for rule mining, discard
# rows with missing values, and cast every value to string so each distinct value
# becomes its own item.
item_columns = ['Survived', 'Pclass', 'Sex', 'Embarked']
df = pd.read_csv("titanic.csv")[item_columns].dropna().astype(str)

# One-hot encode: one indicator column per (column, value) pair
df_encoded = pd.get_dummies(df)


In [None]:
from mlxtend.frequent_patterns import apriori

# Frequent itemsets with support of at least 0.1, labelled by column name
frequent_itemsets = apriori(df_encoded, min_support=0.1, use_colnames=True)

# Record the size of each itemset, then list them by support and size, largest first
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(len)
itemsets_ranked = frequent_itemsets.sort_values(by=['support', 'length'], ascending=False)
print(itemsets_ranked)


In [None]:
from mlxtend.frequent_patterns import association_rules

# Derive rules from the frequent itemsets, keeping those with confidence >= 0.6
rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.6)

# Order by lift, then confidence (both descending), and show the key columns
report_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
rules_sorted = rules.sort_values(by=['lift', 'confidence'], ascending=False)
print(rules_sorted[report_columns])


In [None]:


# One 10-bin histogram per rule metric: (column, title, x-axis label)
metric_histograms = [
    ('confidence', "Confidence Distribution", "Confidence"),  # Histogram of confidence
    ('lift', "Lift Distribution", "Lift"),                    # Histogram of lift
]

for metric_column, hist_title, hist_xlabel in metric_histograms:
    plt.figure()
    sns.histplot(rules[metric_column], bins=10, kde=False)
    plt.title(hist_title)
    plt.xlabel(hist_xlabel)
    plt.show()

In [None]:

# Regression set-up: Fare is the target, the remaining columns are predictors
regression_columns = ['Fare', 'Pclass', 'Age', 'SibSp', 'Parch', 'Sex', 'Embarked']

# Load, keep the listed columns, and discard rows with any missing value
df = pd.read_csv("titanic.csv")[regression_columns].dropna()

# One-hot encode the categorical predictors (first level of each is dropped)
df = pd.get_dummies(df, columns=['Sex', 'Embarked'], drop_first=True)

# Separate the target from the predictors
y = df['Fare']
X = df.drop(columns='Fare')


In [None]:
# Each point is one rule; marker size and colour both encode its support
sns.scatterplot(data=rules, x='confidence', y='lift', size='support', hue='support', sizes=(20,200))
plt.title("Rules: Confidence vs Lift")
# Render explicitly, like the other plotting cells, so the cell does not end on a
# bare Text(...) repr.
plt.show()


In [None]:
# The ten rules with the highest lift, printed as "antecedents => consequents"
top_rules = rules.sort_values('lift', ascending=False).head(10)
rule_parts = zip(top_rules['antecedents'], top_rules['consequents'], top_rules['lift'])
for antecedents, consequents, lift in rule_parts:
    print(f"{set(antecedents)} => {set(consequents)} | Lift={lift:.2f}")


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Ordinary least-squares fit on the training split
reg = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out split
y_pred = reg.predict(X_test)

# Evaluation: goodness of fit, squared error, and one coefficient per predictor
print("R²:", r2_score(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))
print("Coefficients:", [(name, weight) for name, weight in zip(X.columns, reg.coef_)])


In [None]:
from sklearn.linear_model import Ridge, Lasso

# Ridge regression with alpha=1.0, scored on the test split
ridge = Ridge(alpha=1.0).fit(X_train, y_train)
ridge_r2 = ridge.score(X_test, y_test)
print("Ridge R²:", ridge_r2)

# Lasso regression with alpha=0.1, scored on the test split
lasso = Lasso(alpha=0.1).fit(X_train, y_train)
lasso_r2 = lasso.score(X_test, y_test)
print("Lasso R²:", lasso_r2)


In [None]:
from sklearn.tree import DecisionTreeRegressor

# Depth-limited regression tree. Note that this rebinds the name `tree`, which the
# classification section used for the decision-tree classifier.
tree = DecisionTreeRegressor(max_depth=5, random_state=42).fit(X_train, y_train)

# Predictions for the held-out split and their R² against the true values
y_tree_pred = tree.predict(X_test)
tree_r2 = r2_score(y_test, y_tree_pred)

print("Tree R²:", tree_r2)


In [None]:

# Predicted against actual values for the test split
plt.scatter(y_test, y_pred, alpha=0.5)

# Dashed red y = x reference line spanning the range of the actual values
actual_range = [y_test.min(), y_test.max()]
plt.plot(actual_range, actual_range, 'r--')

plt.xlabel("Actual Fare")
plt.ylabel("Predicted Fare")
plt.title("Prediction vs Actual")
# Render explicitly, like the other plotting cells, so the cell does not end on a
# bare Text(...) repr.
plt.show()


In [None]:
# Residual = actual minus predicted value on the test split
residuals = y_test - y_pred

# Histogram of the residuals with a KDE overlay
sns.histplot(residuals, bins=20, kde=True)
plt.title("Residuals Histogram")
# Render explicitly, like the other plotting cells, so the cell does not end on a
# bare Text(...) repr.
plt.show()


In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# ... (Your previous code for model training and prediction) ...

# Store evaluation metrics for each model
r2_linear = r2_score(y_test, y_pred)  # Assuming y_pred is from Linear Regression
mse_linear = mean_squared_error(y_test, y_pred)

r2_ridge = ridge.score(X_test, y_test)  # Assuming ridge is your Ridge model
mse_ridge = mean_squared_error(y_test, ridge.predict(X_test))

r2_lasso = lasso.score(X_test, y_test)