# IMPORTING NECESSARY LIBRARIES

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

# INITIAL ANALYSIS OF THE DATASET

In [None]:
# Load the AI-adoption dataset from the Kaggle input directory.
file_path = "/kaggle/input/global-ai-tool-adoption-across-industries/ai_adoption_dataset.csv"
df = pd.read_csv(file_path)

# Display basic information about the dataset.
# DataFrame.info() prints its report and returns None, so binding its return
# value left df_info empty. Capture the report in a text buffer instead so
# df_info really holds the summary, then echo it to keep the cell output.
import io

_info_buffer = io.StringIO()
df.info(buf=_info_buffer)
df_info = _info_buffer.getvalue()
print(df_info, end="")

df_head = df.head()                       # first five rows
df_shape = df.shape                       # (n_rows, n_columns)
df_summary = df.describe(include='all')   # summary stats for every column

# Last expression of the cell: rendered as the cell's output.
df_shape, df_head, df_summary

# The dataset contains 145,000 rows and 9 columns related to AI tool adoption across countries, industries, demographics, and feedback.

# 🧾 Dataset Overview
| Column Name | Description | Type |
| ----------- | ----------- | ---- |
| country | Name of the country | object |
| industry | Industry sector | object |
| ai_tool | Name of the AI tool (e.g., ChatGPT, Midjourney) | object |
| adoption_rate | Percentage adoption rate (%) | float64 |
| daily_active_users | Number of daily active users | int64 |
| year | Year of the data observation | int64 |
| user_feedback | Textual user feedback (possibly for NLP analysis) | object |
| age_group | Age group of users (e.g., 18–24, 35–44, 55+) | object |
| company_size | Size category of companies (Startup, SME, Enterprise) | object |

# 🔍 Key Observations
1. No missing values — every column is fully populated.

2. The user_feedback column contains unique strings — this could be used later for NLP sentiment analysis if needed.

3. adoption_rate ranges from 0% to 100%, with a mean ~50%, implying a balanced spread.

4. The dataset spans 2 years: 2023 and 2024.

## Most frequent:

1. Country: Australia

2. Industry: Manufacturing

3. AI Tool: ChatGPT

4. Age Group: 55+

5. Company Size: Startup



# DATA PREPROCESSING

In [None]:
# Use seaborn's white-grid look for every matplotlib figure that follows.
sns.set(style="whitegrid")

# Count fully duplicated rows (every column identical to an earlier row).
duplicates = df.duplicated().sum()

# Visual outlier check: one vertical boxplot per numeric column, side by side.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
boxplot_panels = (
    ('adoption_rate', "Boxplot of Adoption Rate"),
    ('daily_active_users', "Boxplot of Daily Active Users"),
)
for ax, (column, panel_title) in zip(axes, boxplot_panels):
    sns.boxplot(y=df[column], ax=ax)
    ax.set_title(panel_title)
fig.tight_layout()
plt.show()

# Last expression of the cell: shows the duplicate-row count.
duplicates

# 🧹 Preprocessing Summary
* ✅ No duplicate rows detected in the dataset.

# 📦 Outlier Check (Boxplots):

* Adoption Rate has a uniform distribution from 0% to 100%, no extreme outliers.

* Daily Active Users shows a slight right-skew with values concentrated below ~9,000 — but nothing alarming.

# UNIVARIATE, BIVARIATE AND MULTIVARIATE ANALYSIS

In [None]:
# Univariate Distribution Plots - Interactive with Plotly
# 1. Adoption Rate Distribution (histogram with a marginal boxplot)
fig_adoption = px.histogram(df, x="adoption_rate", nbins=50, title="Distribution of AI Adoption Rate (%)",
                            labels={"adoption_rate": "Adoption Rate (%)"}, marginal="box", color_discrete_sequence=["indianred"])

# 2. Daily Active Users Distribution (histogram with a marginal boxplot)
fig_dau = px.histogram(df, x="daily_active_users", nbins=50, title="Distribution of Daily Active Users",
                       labels={"daily_active_users": "Daily Active Users"}, marginal="box", color_discrete_sequence=["dodgerblue"])

# 3. Country-wise Frequency
# rename_axis + reset_index(name=...) pins the frame's columns to
# 'country' / 'count', so the x/y references below do not depend on how the
# installed pandas version names the value_counts() result.
country_counts = df['country'].value_counts().rename_axis('country').reset_index(name='count')
# The label keys must match the plotted columns: previously 'country' was
# mapped to "Count" (mislabelling the x-axis) and 'index' matched no column.
fig_country = px.bar(country_counts,
                     x='country', y='count',
                     title="Number of Records per Country",
                     labels={'country': 'Country', 'count': 'Count'},
                     color='country', color_discrete_sequence=px.colors.qualitative.Set2)

# 4. Industry Frequency (same construction as the country chart)
industry_counts = df['industry'].value_counts().rename_axis('industry').reset_index(name='count')
fig_industry = px.bar(industry_counts,
                      x='industry', y='count',
                      title="Number of Records per Industry",
                      labels={'industry': 'Industry', 'count': 'Record Count'},
                      color='industry',
                      color_discrete_sequence=px.colors.qualitative.Set3)


# 5. AI Tool Usage Frequency (share of records per tool)
fig_tool = px.pie(df, names='ai_tool', title='AI Tool Usage Distribution')

# Show all figures
fig_adoption.show()
fig_dau.show()
fig_country.show()
fig_industry.show()
fig_tool.show()

# 📊 Univariate Analysis Summary
1. Adoption Rate (%)
* Distribution is fairly uniform with a slight skew toward higher values.

* Median is ~50%, with a considerable number of records at very low (<10%) and very high (>90%) adoption.

2. Daily Active Users (DAU)
* Right-skewed distribution: many records cluster between 2,000–8,000 users/day.

* Indicates variability in tool usage intensity.

3. Country Representation
Top 3 countries by record count:

* Australia

* USA

* UK

4. Industry Representation
Leading industries:

* Manufacturing

* Technology

* Finance

5. AI Tool Popularity
* Dominated by ChatGPT, followed by Midjourney and Stable Diffusion.

In [None]:
# Bivariate Analysis: Adoption Rate by Categorical Features
# One horizontal boxplot panel per grouping column, laid out on a 3x2 grid.
# Each entry is (grouping column, seaborn palette, panel title).
boxplot_panels = [
    ('country', 'Set2', "Adoption Rate Distribution by Country"),
    ('industry', 'Set3', "Adoption Rate Distribution by Industry"),
    ('ai_tool', 'Set1', "Adoption Rate by AI Tool"),
    ('age_group', 'coolwarm', "Adoption Rate by Age Group"),
    ('company_size', 'viridis', "Adoption Rate by Company Size"),
    ('year', 'Pastel1', "Adoption Rate by Year"),
]

plt.figure(figsize=(16, 24))
for position, (group_column, palette, panel_title) in enumerate(boxplot_panels, start=1):
    plt.subplot(3, 2, position)
    # orient='h' states explicitly that the y column holds the groups and
    # adoption_rate holds the values. This matters for 'year': it is numeric,
    # so with two numeric columns seaborn has no categorical axis to infer
    # the grouping from and would otherwise group by adoption_rate instead.
    sns.boxplot(data=df, x='adoption_rate', y=group_column, palette=palette, orient='h')
    plt.title(panel_title)

plt.tight_layout()
plt.show()

# 📈 Bivariate Analysis – Key Insights
1. Adoption Rate by Country
* USA and UK show higher median adoption rates.

* Countries like India and Brazil show wider interquartile ranges, indicating varied levels of adoption across sectors.

2. Adoption Rate by Industry
* Technology and Finance industries have higher adoption medians.

* Education and Agriculture show more modest adoption trends.

3. Adoption Rate by AI Tool
* ChatGPT consistently has higher adoption rates across the board.

* Tools like Midjourney and Stable Diffusion have slightly lower and more varied adoption.

4. Adoption Rate by Age Group
* Users aged 35–44 and 25–34 show the highest median adoption.

* Younger (18–24) and older (55+) groups have greater variance but lower medians.

5. Adoption Rate by Company Size
* Startups lead in AI tool adoption.

* SMEs and Enterprises follow, but with tighter spreads—indicating more consistent adoption practices.

6. Adoption Rate by Year
* There’s a notable jump in adoption rates from 2023 to 2024, suggesting rapid acceleration in AI integration.



In [None]:
# --- Aggregations ---------------------------------------------------------
# 1. Mean adoption rate for every (country, industry) pair, first in long
#    form and then reshaped to a country x industry grid for the heatmap.
grouped_country_industry = (
    df.groupby(['country', 'industry'])['adoption_rate']
    .mean()
    .reset_index()
)
grouped_country_industry_pivot = grouped_country_industry.pivot(
    index='country',
    columns='industry',
    values='adoption_rate',
)

# 2. Mean adoption rate for every (year, ai_tool) pair.
yearly_trends = (
    df.groupby(['year', 'ai_tool'])['adoption_rate']
    .mean()
    .reset_index()
)

# 3. Pairwise correlation between the three numeric columns.
correlation_matrix = df.loc[:, ['adoption_rate', 'daily_active_users', 'year']].corr()

# --- Figures --------------------------------------------------------------
# Heatmap of the country x industry means, annotated to one decimal place.
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(grouped_country_industry_pivot, annot=True, fmt=".1f", cmap="YlGnBu", ax=ax)
ax.set_title("Average AI Adoption Rate by Country and Industry")
plt.xticks(rotation=45)
plt.yticks(rotation=0)
fig.tight_layout()
plt.show()

# One line per AI tool across the observed years.
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(data=yearly_trends, x='year', y='adoption_rate', hue='ai_tool', marker='o', ax=ax)
ax.set_title("AI Adoption Rate Trends by Tool (2023 vs 2024)")
ax.set_ylabel("Average Adoption Rate (%)")
fig.tight_layout()
plt.show()

# Annotated correlation matrix, two decimal places.
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title("Correlation Matrix of Numerical Features")
fig.tight_layout()
plt.show()

# 📊 Grouped Insights + Trends + Correlation
 1. Average Adoption Rate by Country and Industry (Heatmap)
* USA & UK show high adoption in Technology, Finance, and Manufacturing.

* Agriculture and Education have relatively lower adoption across most countries.

* India and Brazil have more moderate, diversified adoption rates.

2. AI Adoption Trends by Tool (2023 vs 2024)
* All tools show an increase in adoption from 2023 to 2024.

* ChatGPT leads in adoption, followed by Midjourney and Stable Diffusion.

* Tools like Bard and Claude also show notable growth in 2024.

3. Correlation Matrix

| Metric | Correlation with Adoption Rate |
| ------ | ------------------------------ |
| Daily Active Users | +0.72 → Strong Positive |
| Year | +0.34 → Mild Positive |

* 📈 More daily users are strongly associated with higher adoption.

* 📅 Year-over-year growth implies time is a key driver of adoption.

# ADVANCED ANALYSIS

In [None]:
# Clustering input: one row per country holding its mean adoption rate and
# mean daily active users.
cluster_data = (
    df.groupby('country')[['adoption_rate', 'daily_active_users']]
    .mean()
)

# Standardize both features (zero mean, unit variance) so neither dominates
# the distance computation.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(cluster_data)

# Elbow method: fit KMeans for k = 1..9 and record each model's inertia.
k_range = range(1, 10)
inertia = []
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(scaled_data)
    inertia.append(kmeans.inertia_)

# Inertia against k; the bend in the curve suggests the cluster count.
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(k_range, inertia, marker='o')
ax.set_title("Elbow Method for Optimal K (Country Clustering)")
ax.set_xlabel("Number of Clusters (k)")
ax.set_ylabel("Inertia")
fig.tight_layout()
plt.show()

# 🔍 Elbow Method Insight – Optimal K
1.  The Elbow Curve shows a noticeable bend at k = 3, suggesting that 3 clusters is likely optimal for segmenting countries based on:

* Average AI adoption rate

* Average daily active users

In [None]:
# Fit the final 3-cluster model on the standardized per-country features.
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(scaled_data)

# Attach each country's cluster id to the per-country summary table.
cluster_data['Cluster'] = cluster_labels

# Project the standardized features onto two principal components for plotting.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(scaled_data)

# Plotting frame: PCA coordinates plus the country name and cluster id.
pca_df = pd.DataFrame(pca_result, columns=['PCA1', 'PCA2']).assign(
    Country=cluster_data.index,
    Cluster=cluster_labels,
)

# Scatter of countries coloured by cluster, each point labelled with its name.
plt.figure(figsize=(10, 6))
sns.scatterplot(data=pca_df, x='PCA1', y='PCA2', hue='Cluster', palette='Set2', s=100)
for pc1, pc2, country_name in zip(pca_df['PCA1'], pca_df['PCA2'], pca_df['Country']):
    # Small x offset so the label does not sit on top of the marker.
    plt.text(pc1 + 0.02, pc2, country_name, fontsize=9)
plt.title("Country Clustering Based on AI Adoption and Usage Patterns")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.legend(title="Cluster")
plt.tight_layout()
plt.show()

# Last expression of the cell: per-country means with their cluster id.
cluster_data.reset_index()

# 🌍 Country Clustering Based on AI Usage Patterns
1. Using KMeans (k=3) on:

* Average AI adoption rate

* Daily active users per country

we obtained 3 distinct clusters, visualized through PCA above.

| Cluster | Characteristics                      | Countries                                      |
| ------- | ------------------------------------ | ---------------------------------------------- |
| **0**   | Balanced usage & moderate adoption   | Australia, Canada, Germany, India, South Korea |
| **1**   | Slightly lower adoption & higher DAU | China, France, USA                             |
| **2**   | Slightly lower DAU & adoption        | Brazil, UK                                     |


# ✨ Observations:
1. Cluster 0 includes tech-forward nations with balanced AI integration.

2. Cluster 1 shows intensive use (DAU) but not necessarily highest adoption rates.

3. Cluster 2 is more conservative in both adoption and usage.

In [None]:
# Clustering input at company-segment level: one row per
# (company_size, age_group) pair with its mean adoption rate and mean DAU.
company_grouped = (
    df.groupby(['company_size', 'age_group'])[['adoption_rate', 'daily_active_users']]
    .mean()
    .reset_index()
)

# Turn the two categorical keys into integer codes so they can be used as
# numeric features.
# NOTE(review): LabelEncoder assigns codes in sorted label order, so the codes
# for company_size presumably do not follow company size — confirm this is
# acceptable for a distance-based method such as KMeans.
le_size = LabelEncoder().fit(company_grouped['company_size'])
le_age = LabelEncoder().fit(company_grouped['age_group'])
company_grouped['company_size_encoded'] = le_size.transform(company_grouped['company_size'])
company_grouped['age_group_encoded'] = le_age.transform(company_grouped['age_group'])

# Feature matrix: the two averaged metrics plus the two encoded keys.
X_company = company_grouped.loc[
    :, ['adoption_rate', 'daily_active_users', 'company_size_encoded', 'age_group_encoded']
]

# Standardize so all four features share a common scale.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_company)

# Elbow method: fit KMeans for k = 1..9 and record each model's inertia.
candidate_ks = range(1, 10)
inertia = []
for k in candidate_ks:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_scaled)
    inertia.append(kmeans.inertia_)

# Inertia against k; the bend in the curve suggests the cluster count.
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(candidate_ks, inertia, marker='o')
ax.set_title("Elbow Method for Optimal K (Company Clustering)")
ax.set_xlabel("Number of Clusters (k)")
ax.set_ylabel("Inertia")
fig.tight_layout()
plt.show()

# 📉 Elbow Method for Company Clustering
1. The Elbow Curve indicates a clear bend at k = 3, suggesting that 3 clusters is optimal for grouping company types based on:

* Average adoption rate

* Average daily active users

* Company size

* Age group of users



In [None]:
# Fit the final 3-cluster model on the standardized segment features and
# store each segment's cluster id alongside its averages.
kmeans_company = KMeans(n_clusters=3, random_state=42, n_init=10)
company_grouped['Cluster'] = kmeans_company.fit_predict(X_scaled)

# Reduce the four standardized features to two principal components so the
# segments can be drawn on a plane.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(X_scaled)
pca_df = pd.DataFrame(pca_result, columns=['PCA1', 'PCA2']).assign(
    **{
        'Cluster': company_grouped['Cluster'],
        'Company Size': company_grouped['company_size'],
        'Age Group': company_grouped['age_group'],
    }
)

# Scatter of segments coloured by cluster; each point is labelled
# "<company size> | <age group>".
plt.figure(figsize=(10, 6))
sns.scatterplot(data=pca_df, x='PCA1', y='PCA2', hue='Cluster', palette='Set1', s=100)
segment_rows = zip(pca_df['PCA1'], pca_df['PCA2'], pca_df['Company Size'], pca_df['Age Group'])
for pc1, pc2, size_label, age_label in segment_rows:
    # Small x offset so the label does not sit on top of the marker.
    plt.text(pc1 + 0.03, pc2, f"{size_label} | {age_label}", fontsize=8)
plt.title("Clustering of Company Profiles Based on AI Usage Patterns")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.legend(title="Cluster")
plt.tight_layout()
plt.show()

# Last expression of the cell: segment averages with their cluster id.
company_grouped[['company_size', 'age_group', 'adoption_rate', 'daily_active_users', 'Cluster']]

# 🏢 Company-Level AI Adoption Clustering (k=3)
1. We clustered company segments based on:

* Company size (Startup, SME, Enterprise)

* Age group

* Average adoption rate

* Daily active users

| Cluster | Profile Characteristics                                                             | Example Segments                           |
| ------- | ----------------------------------------------------------------------------------- | ------------------------------------------ |
| **0**   | 🚀 **High adopters**, mainly **Startups & SMEs**, especially **older age groups**   | Startup 55+, SME 35–44, Startup 35–44      |
| **1**   | ⚖️ **Moderate usage**, present in all sizes but particularly **Enterprises**        | Enterprise 25–34, SME 18–24, Startup 25–34 |
| **2**   | 🧊 **Low to moderate usage**, skewed towards **Enterprises with mature age groups** | Enterprise 18–24 & 55+, SME 45–54          |


# 🎯 Insights:
1. Startups dominate the high adoption cluster, especially for mid to older age groups.

2. Enterprises show conservative patterns—mostly clustering in lower segments despite similar DAU.

3. Age groups 35–44 and 55+ consistently appear in higher adoption clusters across company types.