<a href="https://colab.research.google.com/github/roboy88/heidegger-ai-insights-dasein-machine-standing-reserve-detector-ontological-heideggerian-ai-miner/blob/main/A_machine_learning_and_topic_modeling_project_that_filters%2C_clusters%2C_and_interprets_Heideggerian_concepts%E2%80%94such_as_Dasein%2C_enframing%2C_and_standing_reserve%E2%80%94from_a_curated_philosophy_dataset_Using_tools_like_Random_Forests%2C_KMeans%2C_PCA%2C_and_LDA%2C_this_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# --- Step 1: Install Required Libraries ---
# IPython shell escape: quietly (-q) installs the plotting, ML and widget
# packages that the import section below relies on.
!pip install -q seaborn scikit-learn plotly ipywidgets

# --- Step 2: Imports ---
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import time
import math
import os
import matplotlib.image as mpimg
from matplotlib.offsetbox import AnnotationBbox, OffsetImage

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

import plotly.graph_objects as go
import ipywidgets as widgets
from ipywidgets import Layout

# --- Step 3: Upload and Load the CSV ---
# files.upload() opens Colab's upload dialog and returns a dict that maps each
# uploaded file name to its raw bytes.
from google.colab import files
import io

uploaded = files.upload()
if not uploaded:
    raise ValueError("No file was uploaded; please upload philosophy_data.csv.")

# The dict key is the name the file was saved under, which is not guaranteed
# to be exactly 'philosophy_data.csv' (a re-upload may be stored under a
# different name). Prefer the expected name when present and otherwise fall
# back to the first uploaded file instead of failing with a KeyError.
if 'philosophy_data.csv' in uploaded:
    csv_name = 'philosophy_data.csv'
else:
    csv_name = next(iter(uploaded))
df = pd.read_csv(io.BytesIO(uploaded[csv_name]))

# --- Step 4: Data Overview ---
# Report the table dimensions, preview the first rows, and count the missing
# values per column before any cleaning takes place.
print("Dataset shape:", df.shape)
preview_rows = df.head()
display(preview_rows)
null_counts = df.isnull().sum()
print("Null values:\n", null_counts)

# --- Step 5: Preprocessing ---
# Discard every row that contains at least one missing value.
df = df.dropna()

# Replace each object-dtype column with integer codes; a fresh LabelEncoder
# is fitted for every column.
# NOTE(review): this also encodes free-text columns as arbitrary integers --
# confirm that is intended for the downstream model.
object_columns = df.select_dtypes(include=['object']).columns
for column_name in object_columns:
    encoder = LabelEncoder()
    df[column_name] = encoder.fit_transform(df[column_name])

# --- Step 6: Split Features and Target ---
# If your dataset has no clear target column, choose one or create a dummy one.
target_col = df.columns[-1]  # Assuming last column is target
X = df.drop(target_col, axis=1)
y = df[target_col]

# Split BEFORE scaling so that the scaler's mean and variance are estimated
# from the training rows only. Fitting the scaler on the full table would leak
# test-set statistics into training and bias the evaluation.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Scaled version of the complete feature table, produced with the same
# train-fitted scaler, for the unsupervised steps (clustering / projection).
X_scaled = scaler.transform(X)

# --- Step 7: Train a Random Forest Classifier ---
# Fit a 100-tree forest (fixed seed) on the training split and predict the
# held-out rows; fit() returns the estimator itself.
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Text summary of the per-class metrics.
print("\n--- Classification Report ---\n")
report_text = classification_report(y_test, y_pred)
print(report_text)

# Confusion matrix drawn from the fitted estimator on the test split.
ConfusionMatrixDisplay.from_estimator(clf, X_test, y_test)
plt.title("Confusion Matrix: Random Forest")
plt.show()

# --- Step 8: K-Means Clustering with PCA ---
# Partition the scaled features into two clusters (fixed seed).
kmeans = KMeans(n_clusters=2, random_state=42)
clusters = kmeans.fit_predict(X_scaled)

# Project the same scaled features onto their first two principal components
# so the cluster assignment can be shown in a 2D scatter plot.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

pc1, pc2 = X_pca[:, 0], X_pca[:, 1]
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(pc1, pc2, c=clusters, cmap='coolwarm', alpha=0.7)
ax.set_title("K-Means Clusters (2D PCA Projection)")
ax.set_xlabel("Principal Component 1")
ax.set_ylabel("Principal Component 2")
ax.grid(True)
plt.show()

# --- Step 9: Feature Importance ---
# Pair each forest importance score with its feature (column) name.
feature_importances = pd.Series(clf.feature_importances_, index=X.columns)

# Horizontal bar chart of the ten highest scores; the y-axis is inverted so
# the largest bar ends up at the top.
top_ten = feature_importances.nlargest(10)
importance_ax = top_ten.plot(kind='barh', figsize=(8, 6), color='steelblue')
importance_ax.set_title("Top 10 Feature Importances")
importance_ax.set_xlabel("Importance Score")
importance_ax.set_ylabel("Feature")
importance_ax.invert_yaxis()
plt.tight_layout()
plt.show()

# --- Step 10: Optional Interactive Widget ---
def explore_feature_importance(n=10):
    """Draw a horizontal bar chart of the ``n`` largest feature importances.

    Reads the module-level ``feature_importances`` Series and shows the plot
    immediately; nothing is returned.
    """
    top_features = feature_importances.nlargest(n)
    axes = top_features.plot(kind='barh', figsize=(8, 6), color='orange')
    axes.set_title(f"Top {n} Features by Importance")
    axes.set_xlabel("Importance")
    # Flip the y-axis so the highest-scoring feature is drawn at the top.
    axes.invert_yaxis()
    plt.show()

# Clamp the slider bounds to the number of available features. With fewer
# than 5 feature columns the original fixed min=5 would exceed max and make
# IntSlider raise; for 10 or more features the bounds and the default are
# unchanged (min=5, max=min(30, n_features), value=10).
n_features = len(X.columns)
slider_max = max(1, min(30, n_features))
slider_min = min(5, slider_max)
slider_default = min(10, slider_max)
widgets.interact(
    explore_feature_importance,
    n=widgets.IntSlider(min=slider_min, max=slider_max, step=1, value=slider_default),
)


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25h

Saving philosophy_data.csv to philosophy_data.csv
Dataset shape: (360808, 11)


Unnamed: 0,title,author,school,sentence_spacy,sentence_str,original_publication_date,corpus_edition_date,sentence_length,sentence_lowered,tokenized_txt,lemmatized_str
0,Plato - Complete Works,Plato,plato,"What's new, Socrates, to make you leave your ...","What's new, Socrates, to make you leave your ...",-350,1997,125,"what's new, socrates, to make you leave your ...","['what', 'new', 'socrates', 'to', 'make', 'you...","what be new , Socrates , to make -PRON- lea..."
1,Plato - Complete Works,Plato,plato,Surely you are not prosecuting anyone before t...,Surely you are not prosecuting anyone before t...,-350,1997,69,surely you are not prosecuting anyone before t...,"['surely', 'you', 'are', 'not', 'prosecuting',...",surely -PRON- be not prosecute anyone before ...
2,Plato - Complete Works,Plato,plato,The Athenians do not call this a prosecution b...,The Athenians do not call this a prosecution b...,-350,1997,74,the athenians do not call this a prosecution b...,"['the', 'athenians', 'do', 'not', 'call', 'thi...",the Athenians do not call this a prosecution ...
3,Plato - Complete Works,Plato,plato,What is this you say?,What is this you say?,-350,1997,21,what is this you say?,"['what', 'is', 'this', 'you', 'say']",what be this -PRON- say ?
4,Plato - Complete Works,Plato,plato,"Someone must have indicted you, for you are no...","Someone must have indicted you, for you are no...",-350,1997,101,"someone must have indicted you, for you are no...","['someone', 'must', 'have', 'indicted', 'you',...","someone must have indict -PRON- , for -PRON- ..."


Null values:
 title                        0
author                       0
school                       0
sentence_spacy               0
sentence_str                 0
original_publication_date    0
corpus_edition_date          0
sentence_length              0
sentence_lowered             0
tokenized_txt                0
lemmatized_str               0
dtype: int64


In [None]:
# Step 0: Upload Data (already done). Here we drill down on
# heidegger_terms and homogeneity_completeness_v_measure to better
# understand the philosopher's contributions to the event of the age of AI:
# the event horizon of the possibilities of AI as such.
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from wordcloud import WordCloud
import seaborn as sns

# Load uploaded file
# The file is looked up in more than one place: the original
# '/mnt/data/...' location first, then the current working directory, which
# is where a notebook upload step normally saves the file. A clear error is
# raised if neither location has it.
import os

csv_candidates = ['/mnt/data/philosophy_data.csv', 'philosophy_data.csv']
csv_path = next((path for path in csv_candidates if os.path.exists(path)), None)
if csv_path is None:
    raise FileNotFoundError(
        f"philosophy_data.csv not found; looked in: {csv_candidates}"
    )
df = pd.read_csv(csv_path)
print("Initial Shape:", df.shape)

# --- Step 1: Filter Heideggerian Text ---
heidegger_terms = [
    'heidegger', 'being and time', 'dasein', 'being', 'presence', 'temporality',
    'authenticity', 'inauthentic', 'ready-to-hand', 'present-at-hand',
    'enframing', 'standing-reserve', 'disclosure', 'unconcealment',
    'phenomenology', 'existential', 'hermeneutics', 'thrownness', 'worldhood', 'care'
]

# Choose the column that holds the sentence text. In this dataset the first
# object column is 'title', so blindly taking the first one would filter and
# vectorize book titles instead of sentences. Prefer the known sentence
# columns and keep the "first object column" rule only as a fallback.
preferred_text_cols = ('sentence_lowered', 'sentence_str', 'sentence_spacy')
text_col = next((c for c in preferred_text_cols if c in df.columns), None)
if text_col is None:
    text_col = df.select_dtypes(include='object').columns[0]
df[text_col] = df[text_col].astype(str).str.lower()

# Filter rows mentioning Heideggerian themes.
# Terms are matched as whole words/phrases: a plain substring test lets
# 'care' match 'scared' or 'career'. The group is non-capturing so that
# str.contains does not warn about match groups.
# NOTE(review): whole-word matching also excludes inflections such as
# 'beings'; add them to heidegger_terms if they should count.
term_pattern = r'\b(?:' + '|'.join(re.escape(term) for term in heidegger_terms) + r')\b'
heidegger_mask = df[text_col].str.contains(term_pattern, regex=True)

# .copy() makes the subset an independent frame, so adding columns to it
# later does not trigger pandas' SettingWithCopyWarning.
df_heidegger = df[heidegger_mask].copy()
print("Filtered Heideggerian Rows:", df_heidegger.shape)

# --- Step 2: TF-IDF Vectorization ---
# Turn the filtered text into a TF-IDF matrix, ignoring English stop words.
heidegger_docs = df_heidegger[text_col]
vectorizer = TfidfVectorizer(stop_words='english')
X_tfidf = vectorizer.fit_transform(heidegger_docs)

# --- Step 3: K-Means Clustering ---
# Group the TF-IDF vectors into k clusters (fixed seed for repeatability).
k = 3
kmeans = KMeans(n_clusters=k, random_state=42)
clusters = kmeans.fit_predict(X_tfidf)

# df_heidegger comes from boolean-filtering df, so writing a new column into
# it in place can raise pandas' SettingWithCopyWarning. assign() returns a new
# frame with the cluster labels attached and avoids the ambiguity.
df_heidegger = df_heidegger.assign(cluster=clusters)

# --- Step 4: 2D Projection for Visualization ---
# The TF-IDF matrix is sparse; converting it with .toarray() for PCA
# allocates a dense rows x vocabulary array and can exhaust memory.
# TruncatedSVD works on the sparse matrix directly. It does not center the
# data, so this is an SVD/LSA projection rather than a true PCA; the plot
# labels say so. The `pca` / `X_pca` names are kept so later code that
# refers to them still works.
from sklearn.decomposition import TruncatedSVD

pca = TruncatedSVD(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_tfidf)

plt.figure(figsize=(10, 6))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters, cmap='viridis', alpha=0.7)
plt.title("Heideggerian Thought Clusters (Truncated SVD)")
plt.xlabel("Component 1")
plt.ylabel("Component 2")
plt.grid(True)
plt.show()

# --- Step 5: Top Terms per Cluster ---
# For each of the first k cluster centers, list the ten vocabulary terms with
# the largest centroid weight, highest first.
terms = np.array(vectorizer.get_feature_names_out())
for cluster_id, center in enumerate(kmeans.cluster_centers_[:k]):
    ranked = np.argsort(center)[::-1][:10]
    print(f"\nTop terms for Cluster {cluster_id}:")
    print(", ".join(terms[ranked]))

# --- Step 6: WordCloud for Heideggerian Themes ---
# Concatenate every filtered text entry into one string and render its word
# frequencies as an 800x400 cloud on a white background.
combined_text = " ".join(df_heidegger[text_col])
wc = WordCloud(width=800, height=400, background_color='white')
wc.generate(combined_text)

plt.figure(figsize=(12, 6))
plt.imshow(wc, interpolation='bilinear')
plt.title("Heideggerian Vocabulary in the Dataset")
plt.axis('off')
plt.show()
