# 1. Importing Required Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Render matplotlib figures inline in the notebook output.
# This is an IPython magic, so it only works inside IPython/Jupyter.
%matplotlib inline
# Apply seaborn's "whitegrid" style to the plots drawn below.
sns.set(style="whitegrid")

# 2. Load and Inspect the CSV Data

In [None]:
# Read the timeline CSV (path is relative to the working directory)
df_raw = pd.read_csv("timeline_data.csv")

# Keep df_raw untouched; the cleaning steps operate on this copy
df = df_raw.copy()

# Quick look at what was loaded: a preview of the rows, then the column names
print("Raw Data (first 5 rows):")
display(df_raw.head())

print("\nColumns Detected:", df_raw.columns.to_list())

# 3. Data Cleaning

## 3.1 Clean & Parse 'Month' Column

In [None]:
# Work on a fresh copy of the raw frame
df = df_raw.copy()

# Normalise the text: trim both ends, then drop any whitespace left inside
month_text = df["Month"].astype(str).str.strip()
df["Month"] = month_text.str.replace(r"\s+", "", regex=True)

# Parse "Jan-04"-style labels (%b = abbreviated month name, %y = 2-digit year).
# errors="coerce" turns anything that does not match into NaT instead of raising.
df["Month"] = pd.to_datetime(df["Month"], format="%b-%y", errors="coerce")

# Report the rows whose Month could not be parsed, if there are any
bad_rows = df.loc[df["Month"].isna()]
if bad_rows.empty:
    print("✅ All Month values parsed successfully.")
else:
    print("⚠️ Some rows could not be parsed as dates:")
    display(bad_rows)

df.head()

## 3.2 Replace "<1" and Convert Keyword Columns to Float

In [None]:
# Columns holding the search-interest series, one per keyword
keywords = [
    "Smartphone: (Worldwide)",
    "Artificial intelligence: (Worldwide)",
    "Wearable technology: (Worldwide)",
    "Online shopping: (Worldwide)",
    "Electric vehicle: (Worldwide)",
]

for col in keywords:
    # Trim stray whitespace before matching, so a padded cell such as " <1"
    # is still recognised instead of making the float conversion raise.
    # astype(str) lets the same code handle columns that pandas already
    # parsed as numbers (no "<1" present) as well as text columns.
    cleaned = df[col].astype(str).str.strip()
    # "<1" (interest below 1) is stored as 0.5; everything else is parsed as-is
    df[col] = cleaned.replace("<1", "0.5").astype(float)

df.head()

## 3.3 Create a Tech_Wave Label (Tech 2.0 if before 2016, Tech 3.0 otherwise)

In [None]:
# First month that counts as "Tech 3.0"; everything earlier is "Tech 2.0"
wave_cutoff = pd.Timestamp("2016-01-01")

# Vectorised labelling. A month that failed to parse is NaT, and
# NaT < cutoff evaluates to False, so without the mask below such rows would
# silently be labelled "Tech 3.0". Leave their label missing instead, which
# keeps them out of both per-wave subsets.
wave_labels = pd.Series(
    np.where(df["Month"] < wave_cutoff, "Tech 2.0", "Tech 3.0"),
    index=df.index,
)
df["Tech_Wave"] = wave_labels.where(df["Month"].notna())

print("\nCleaned Data (first 5 rows):")
display(df.head())


# 4. Exploratory Data Analysis

In [None]:
# Overlay every keyword's series on a single set of axes
fig, ax = plt.subplots(figsize=(12, 6))
for keyword in keywords:
    # The legend shows only the part of the column name before the colon
    ax.plot(df["Month"], df[keyword], label=keyword.partition(":")[0])

# Dashed red line at the boundary between the two tech waves
ax.axvline(pd.Timestamp("2016-01-01"), color="red", linestyle="--", label="Start Tech 3.0")

ax.set_title("Consumer Interest Over Time (Google Trends)")
ax.set_xlabel("Year")
ax.set_ylabel("Search Interest (0-100 scale)")
ax.legend()
plt.show()


# 5. Clustering and Classification for Tech 2.0 and Tech 3.0

In [None]:
def run_clustering_classification(data, wave_label):
    """Cluster one tech wave's rows, then check how well the clusters can be predicted.

    The keyword columns are standardized, grouped into three KMeans clusters,
    and a random forest is trained to predict each row's cluster from the same
    standardized features. Cluster averages, a classification report, a
    confusion matrix and feature importances are printed or plotted.

    Args:
        data: DataFrame containing the columns listed in the module-level
            ``keywords`` list. A ``"Cluster"`` column is added to it in place.
        wave_label: Text used in the printed heading and the plot titles.

    Returns:
        None. All results are printed, displayed or plotted.
    """
    print(f"Analyzing {wave_label} Data")

    n_clusters = 3
    cluster_ids = list(range(n_clusters))

    # Extract features and standardize them so every keyword has equal weight
    X = data[keywords].values
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Clustering.
    # n_init is pinned explicitly: scikit-learn changed its default from 10 to
    # 'auto' in version 1.4, so leaving it implicit makes the resulting
    # clusters depend on the installed version.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    data["Cluster"] = kmeans.fit_predict(X_scaled)

    # Display cluster averages (in original units) to understand each cluster
    cluster_means = data.groupby("Cluster")[keywords].mean()
    print("Cluster Averages:")
    display(cluster_means)

    # Bar chart of each cluster's keyword averages
    cluster_means.plot(kind="bar", figsize=(10, 6))
    plt.title(f"Average Consumer Interest per Cluster in {wave_label}")
    plt.xlabel("Cluster")
    plt.ylabel("Average Interest")
    plt.legend(loc="upper right")
    plt.show()

    # Classification.
    # The cluster labels are the target: how well can a classifier predict a
    # row's cluster from its (standardized) features?
    X_class = X_scaled
    y_class = data["Cluster"]

    X_train, X_test, y_train, y_test = train_test_split(
        X_class, y_class, test_size=0.2, random_state=42
    )

    clf = RandomForestClassifier(random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Pass the full list of cluster ids so the report and the matrix always
    # cover every cluster, even one that happens to be absent from the test
    # split; zero_division=0 reports 0 for such a cluster without a warning.
    print("\nClassification Report (Predicting Cluster)")
    print(classification_report(y_test, y_pred, labels=cluster_ids, zero_division=0))

    cm = confusion_matrix(y_test, y_pred, labels=cluster_ids)
    print("Confusion Matrix:")
    print(cm)

    # Feature importance plot, least important keyword at the bottom
    importances = pd.Series(clf.feature_importances_, index=keywords)
    importances.sort_values().plot(kind="barh", figsize=(8, 4), color="skyblue")
    plt.title(f"Feature Importance for Predicting Cluster in {wave_label}")
    plt.xlabel("Importance")
    plt.ylabel("Keyword")
    plt.show()

# Split the cleaned frame by wave; copies keep the per-wave "Cluster" column
# out of the shared df
df_tech2 = df.loc[df["Tech_Wave"].eq("Tech 2.0")].copy()
df_tech3 = df.loc[df["Tech_Wave"].eq("Tech 3.0")].copy()

# Run the same clustering + classification analysis on each wave in turn
for wave_df, wave_name in ((df_tech2, "Tech 2.0"), (df_tech3, "Tech 3.0")):
    run_clustering_classification(wave_df, wave_name)
