In [None]:
# Colab-specific setup: mount Google Drive at /content/drive so that files
# stored on Drive can be reached through the local filesystem.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
# Switch the working directory to the project folder on the mounted Drive so
# that the relative file names used later resolve against it.
os.chdir('/content/drive/My Drive/ThesisProject')

# IPython magic: list the folder contents as a quick sanity check.
%ls

In [None]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
!pip install pgeocode
import pgeocode
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.linear_model import Ridge, Lasso
from itertools import combinations
import statsmodels.formula.api as smf
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors

# openpyxl is installed right before read_excel; pandas relies on it as the
# engine for .xlsx workbooks.
!pip install openpyxl
# Load the raw startup dataset (file name is relative to the working directory).
df = pd.read_excel('british-ai-startups.xlsx')
# Bare expression: the whole frame is rendered as this cell's output.
df

In [None]:
# Show the column labels of the freshly loaded frame.
df.columns

In [None]:
# Treat a missing use-case text as an empty string, so that joining the two
# columns below never propagates NaN into the combined description.
for use_case_col in ('AIUseCase1', 'AIUseCase2'):
    df[use_case_col] = df[use_case_col].fillna('')

# One combined free-text description per row: both use cases, space-separated.
df['FullDescription'] = df['AIUseCase1'] + ' ' + df['AIUseCase2']

# Function to check for any form of 'personalization' or 'personalisation'
def check_personalization(row):
    """Flag a row whose purpose fields mention personalisation.

    The 'Purpose1', 'Purpose2' and 'Purpose3' entries of ``row`` are each
    converted to a lower-cased string and searched for the American and the
    British spelling of the noun.

    Returns:
        1 as soon as either spelling occurs in any purpose field, otherwise 0.
    """
    spellings = ('personalization', 'personalisation')
    purpose_fields = ('Purpose1', 'Purpose2', 'Purpose3')
    # Lazy generator: stops at the first hit, fields first, spellings second.
    mentioned = any(
        spelling in str(row[field]).lower()
        for field in purpose_fields
        for spelling in spellings
    )
    return 1 if mentioned else 0

# Apply the function to each row (axis=1) to build the 0/1 'Personalization' flag
df['Personalization'] = df.apply(check_personalization, axis=1)

# Convert 'IncorporationDate' to datetime
# NOTE(review): this relies on pandas' default date parsing; if the column
# holds day-first text dates, confirm they are not being read month-first.
df['IncorporationDate'] = pd.to_datetime(df['IncorporationDate'])

# Extract the year and create a new column 'FoundedYear'
df['FoundedYear'] = df['IncorporationDate'].dt.year

# Function to convert 1.0 to 1 and NaN to 0
def convert_to_binary(column):
    """Turn a column into integer flags: 1 where the value equals 1.0, else 0.

    Every value that does not compare equal to 1.0 (NaN included) becomes 0.
    """
    def _as_flag(value):
        if value == 1.0:
            return 1
        return 0

    return column.apply(_as_flag)

# Apply the function to the specified columns
# Investor indicators: anything equal to 1.0 becomes 1, everything else 0.
for investor_flag in ('prolific_investor', 'large_investor', 'us_investor'):
    df[investor_flag] = convert_to_binary(df[investor_flag])

# Founder indicators: cast to integers (True -> 1, False -> 0).
for founder_flag in ('academic_founders', 'expert_founders', 'serial_founders'):
    df[founder_flag] = df[founder_flag].astype(int)

In [None]:
# Function to standardize website URLs
def standardize_url(url):
    """Normalise a website address to the form ``www.<rest-of-address>``.

    Surrounding whitespace is trimmed, a leading ``http://`` or ``https://``
    is removed (matched case-insensitively), and a ``www.`` prefix is added
    when the address does not already start with one.

    Args:
        url: The raw website value. Expected to be a string; any other value
            (for example the NaN or None of an empty cell) is returned
            untouched instead of raising.

    Returns:
        The normalised address, or the input unchanged when it is not a
        string or contains nothing but whitespace.
    """
    # re.sub raises TypeError on non-strings such as float NaN or None.
    if not isinstance(url, str):
        return url

    cleaned = url.strip()
    if not cleaned:
        # Nothing to normalise; do not fabricate a bare 'www.' entry.
        return url

    # Remove the protocol (http, https), whatever its letter case.
    cleaned = re.sub(r'^https?://', '', cleaned, flags=re.IGNORECASE)
    # Compare in lower case so 'WWW.site.com' is not prefixed a second time.
    if not cleaned.lower().startswith('www.'):
        cleaned = 'www.' + cleaned
    return cleaned

# Apply the function to the Website column
df['Website'] = df['Website'].apply(standardize_url)

# Lookup from the raw 'customer' labels to the consolidated categories.
# Labels naming both b2b and b2c collapse to 'B2M', a trailing 'third' is
# dropped, and 'unclear' / a bare 'third' become the empty string (not NaN),
# so they still show up as their own category in value_counts().
replacement_dict = {
    'b2b': 'B2B',
    'b2c': 'B2C',
    'b2b b2c third': 'B2M',
    'b2b b2c': 'B2M',
    'unclear': '',
    'b2b third': 'B2B',
    'third': '',
    'b2c third': 'B2C'
}

# Replace the values in the 'customer' column; values that are not keys of
# the dictionary are left as they are.
df['customer'] = df['customer'].replace(replacement_dict)

In [None]:
# Remove the columns that are not carried forward: the two use-case texts
# (already merged into 'FullDescription'), the raw incorporation date
# ('FoundedYear' was derived from it), and the 'value' and 'platform' columns.
columns_to_drop = ['AIUseCase1', 'AIUseCase2', 'IncorporationDate', 'value', 'platform']
df = df.drop(columns=columns_to_drop)

In [None]:
# Initialize the geolocation database for the UK
nomi = pgeocode.Nominatim('GB')


def _lookup_postal_field(postal_code, field):
    """Return one field of the pgeocode record for ``postal_code``.

    Args:
        postal_code: The postal code to look up (a single value).
        field: Name of the record field to return, e.g. 'county_name'.

    Returns:
        The field value, or the string "Unknown" when the postal code is
        missing/blank or the record has no usable value for that field.
    """
    # Do not send empty cells (NaN / None / blank text) to the lookup at all.
    if pd.isna(postal_code) or not str(postal_code).strip():
        return "Unknown"

    location = nomi.query_postal_code(postal_code)
    if location is None:
        return "Unknown"

    value = location.get(field)
    # NaN is truthy in Python, so a plain truthiness test lets it through.
    # NOTE(review): pgeocode is expected to fill unmatched codes with NaN
    # fields - confirm against the installed version.
    if value is None or pd.isna(value) or value == '':
        return "Unknown"
    return value


# Function to get the county from a postal code
def get_county(postal_code):
    """Return the county name for a postal code, or "Unknown" if unavailable."""
    return _lookup_postal_field(postal_code, 'county_name')


# Apply the function to the DataFrame and create a new 'County' column
df['County'] = df['PostalCode'].apply(get_county)


# Function to get the place from a postal code
def get_place(postal_code):
    """Return the place name for a postal code, or "Unknown" if unavailable."""
    return _lookup_postal_field(postal_code, 'place_name')


# Apply the function to the DataFrame and create a new 'Place' column
df['Place'] = df['PostalCode'].apply(get_place)

In [None]:
# Write the cleaned frame to CSV in the working directory; index=False keeps
# the row index out of the file.
file_path = 'british-ai-startups-final.csv'
df.to_csv(file_path, index=False)
# Print a summary (columns, non-null counts, dtypes) of the exported frame.
df.info()

In [None]:
# Setting up the visualizations: five descriptive panels on a 3x3 grid
sns.set(style="whitegrid")
plt.figure(figsize=(18, 16))

# Distribution of Personalization
plt.subplot(3, 3, 1)
# value_counts() orders by frequency, whereas the bars are laid out by
# category. Sort by category and pass the same order to countplot so that
# annotation i is always drawn on bar i.
personalization_counts = df['Personalization'].value_counts().sort_index()
personalization_total = personalization_counts.sum()
sns.countplot(data=df, x='Personalization', order=list(personalization_counts.index))
for i, count in enumerate(personalization_counts):
    plt.text(i, count, f'{count}\n({count / personalization_total * 100:.1f}%)', ha='center', va='bottom')
plt.title('Distribution of Personalization', pad=20)
plt.xlabel('Personalization')
plt.ylabel('Count')

# Top 10 sectors by the number of startups
plt.subplot(3, 3, 2)
all_sector_counts = df['Sector1'].value_counts()
sector_counts = all_sector_counts.head(10)
# Percentages are shares of all startups that have a sector, not just of the
# ten sectors that happen to be displayed.
sector_total = all_sector_counts.sum()
sns.barplot(y=sector_counts.index, x=sector_counts.values, palette="viridis")
for i, count in enumerate(sector_counts.values):
    plt.text(count, i, f'{count}\n({count / sector_total * 100:.1f}%)', va='center')
plt.title('Top 10 Sectors by Number of Startups', pad=20)
plt.xlabel('Number of Startups')
plt.ylabel('Sector')

# Distribution of FoundedYear (without percentage annotations)
plt.subplot(3, 3, 3)
sns.histplot(data=df, x='FoundedYear', bins=15, kde=True, color='blue')
plt.title('Distribution of Founded Year', pad=20)
plt.xlabel('Founded Year')
plt.ylabel('Frequency')

# Customer Type Distribution (Pie Chart with percentages)
plt.subplot(3, 3, 4)
customer_counts = df['customer'].value_counts()
plt.pie(customer_counts, labels=customer_counts.index, autopct='%1.1f%%', startangle=140, colors=sns.color_palette("coolwarm", len(customer_counts)))
plt.title('Customer Type Distribution', pad=20)

# Count of Startups by County
plt.subplot(3, 3, 5)
all_county_counts = df['County'].value_counts()
county_counts = all_county_counts.head(10)
# Same denominator rule as for sectors: share of all rows with a county value.
county_total = all_county_counts.sum()
sns.barplot(y=county_counts.index, x=county_counts.values, palette="magma")
for i, count in enumerate(county_counts.values):
    plt.text(count, i, f'{count}\n({count / county_total * 100:.1f}%)', va='center')
plt.title('Top 10 Counties by Number of Startups', pad=20)
plt.xlabel('Number of Startups')
plt.ylabel('County')

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Rank sectors by median cumulative funding and by median latest pre-money
# valuation, then chart the ten highest and the ten lowest for each measure.

plt.figure(figsize=(16, 16))

median_spec = {
    'FundingCumulative2023Q2': 'median',
    'LatestPreMoneyValuationGBP': 'median'
}
sector_funding_valuation = df.groupby('Sector1').agg(median_spec).reset_index()

top10_funding = sector_funding_valuation.sort_values(by='FundingCumulative2023Q2', ascending=False).head(10)
bottom10_funding = sector_funding_valuation.sort_values(by='FundingCumulative2023Q2').head(10)
top10_valuation = sector_funding_valuation.sort_values(by='LatestPreMoneyValuationGBP', ascending=False).head(10)
bottom10_valuation = sector_funding_valuation.sort_values(by='LatestPreMoneyValuationGBP').head(10)

# One entry per stacked panel: (ranking, measure column, bar colour, x label, title).
sector_panels = [
    (top10_funding, 'FundingCumulative2023Q2', 'green',
     'Median Funding (GBP)', 'Top 10 Sectors by Median Funding'),
    (bottom10_funding, 'FundingCumulative2023Q2', 'red',
     'Median Funding (GBP)', 'Bottom 10 Sectors by Median Funding'),
    (top10_valuation, 'LatestPreMoneyValuationGBP', 'blue',
     'Median Valuation (GBP)', 'Top 10 Sectors by Median Valuation'),
    (bottom10_valuation, 'LatestPreMoneyValuationGBP', 'orange',
     'Median Valuation (GBP)', 'Bottom 10 Sectors by Median Valuation'),
]

for panel_no, (ranking, measure, bar_color, x_label, panel_title) in enumerate(sector_panels, start=1):
    plt.subplot(4, 1, panel_no)
    plt.barh(ranking['Sector1'], ranking[measure], color=bar_color)
    plt.xlabel(x_label)
    plt.ylabel('Sectors')
    plt.title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
# Identifying binary variables: columns whose non-missing values are all 0 or 1.
# Columns without any data are skipped, because the all() test on an empty
# selection is vacuously true.
binary_columns = [
    col for col in df.columns
    if df[col].notna().any() and df[col].dropna().isin([0, 1]).all()
]

# Exclude 'Personalization' by name instead of assuming it is the last entry.
plot_columns = [col for col in binary_columns if col != 'Personalization']

# Two panels per row; the number of rows follows the number of variables so
# the grid never overflows (three rows reproduce the original 18x12 figure).
n_rows = max(1, (len(plot_columns) + 1) // 2)

# Analyzing binary variables w.r.t. Personalization
plt.figure(figsize=(18, 4 * n_rows))
sns.set(style="whitegrid")

legend_names = {0: 'No', 1: 'Yes'}

# Create plots for each binary variable showing percentage of startups with and without Personalization
for i, col in enumerate(plot_columns):
    ax = plt.subplot(n_rows, 2, i + 1)
    counts = df.groupby([col, 'Personalization']).size().unstack(fill_value=0)
    # Row-normalise so each bar shows the split within one level of `col`.
    counts_percentage = counts.div(counts.sum(axis=1), axis=0) * 100

    # Plotting the data
    counts_percentage.plot(kind='bar', stacked=True, color=['#ff9999', '#66b3ff'], ax=ax)
    ax.set_title(f'{col.replace("_", " ").capitalize()}')
    ax.set_xlabel(col.replace("_", " ").capitalize())
    ax.set_ylabel('Percentage')
    ax.tick_params(axis='x', labelrotation=0)
    for p in ax.patches:
        # Anchor the label at the centre of the segment; get_x() alone is
        # the left edge of the bar.
        ax.annotate(
            f'{p.get_height():.1f}%',
            (p.get_x() + p.get_width() / 2, p.get_y() + p.get_height() / 2),
            ha='center', va='center',
        )
    # Label only the Personalization levels that are actually present.
    ax.legend(
        title='Personalization',
        labels=[legend_names.get(level, str(level)) for level in counts_percentage.columns],
    )

plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlations (DataFrame.corr defaults) between the two success
# metrics and the candidate explanatory factors.
correlation_columns = [
    'LatestPreMoneyValuationGBP',
    'FundingCumulative2023Q2',
    'Personalization',
    'expert_founders',
    'serial_founders',
    'large_investor',
    'us_investor',
    'FoundedYear',
]
correlation_data = df[correlation_columns]
correlations = correlation_data.corr()

# Annotated heatmap of the matrix, coefficients shown to two decimals.
plt.figure(figsize=(10, 8))
sns.heatmap(correlations, annot=True, fmt=".2f", cmap="coolwarm", cbar=True)
plt.title('Correlation Matrix of Success Metrics and Factors')
plt.tight_layout()
plt.show()


In [None]:
# Mean valuation and mean cumulative funding for each Personalization group.
personalization_effect = (
    df.groupby('Personalization')
    .agg({
        'LatestPreMoneyValuationGBP': 'mean',
        'FundingCumulative2023Q2': 'mean'
    })
    .reset_index()
)

# Two side-by-side bar charts, one per outcome.
plt.figure(figsize=(12, 6))

# One entry per panel: (outcome column, y-axis label, title).
effect_panels = (
    ('LatestPreMoneyValuationGBP', 'Average Valuation (GBP)', 'Average Valuation by Personalization'),
    ('FundingCumulative2023Q2', 'Average Funding (GBP)', 'Average Funding by Personalization'),
)

for slot, (outcome_col, y_label, heading) in enumerate(effect_panels, start=1):
    plt.subplot(1, 2, slot)
    plt.bar(personalization_effect['Personalization'], personalization_effect[outcome_col], color=['blue', 'orange'])
    plt.xticks([0, 1], ['No Personalization', 'Personalization'])
    plt.xlabel('Personalization')
    plt.ylabel(y_label)
    plt.title(heading)

plt.tight_layout()
plt.show()

In [None]:
# Work on a copy so the modelling cells do not touch the cleaned frame.
data = df.copy()

# Regressors whose collinearity is checked, plus an intercept column.
vif_features = ['Personalization', 'expert_founders', 'serial_founders', 'prolific_investor', 'large_investor', 'us_investor', 'FoundedYear']
X = sm.add_constant(data[vif_features])

# One variance inflation factor per column of X (the constant included).
vif_data = pd.DataFrame({
    "Feature": X.columns,
    "VIF": [variance_inflation_factor(X.values, position) for position in range(X.shape[1])],
})

print(vif_data)

In [None]:
# Define the dependent variables
dependent_vars = {
    'Funding': 'FundingCumulative2023Q2',
    'Valuation': 'LatestPreMoneyValuationGBP'
}

# Define the endogenous variable (independent variable of interest)
endog_var = 'Personalization'

# Define the set of potential instrumental variables
potential_instruments = ['academic_founders', 'expert_founders', 'serial_founders',
                         'prolific_investor', 'large_investor', 'us_investor',
                         'FoundedYear']

# Define other covariates (controls)
controls = ['expert_founders', 'serial_founders', 'prolific_investor', 'large_investor', 'us_investor']

# NOTE(review): two methodological caveats that this cell does not resolve:
#   * The second stage is a plain OLS on fitted values, so the standard errors
#     in its summary are not valid two-stage least squares standard errors.
#   * A low second-stage AIC says nothing about instrument validity
#     (relevance / exclusion); treat the "best instrument" as exploratory.

# Store results
results = []

# Iterate over each dependent variable (Funding and Valuation)
for dep_label, dep_var in dependent_vars.items():
    best_model = None
    best_aic = float('inf')

    # Iterate over all potential instruments to find the best one
    for instrument in potential_instruments:
        # A candidate that is already a regressor cannot act as an instrument.
        if instrument == endog_var or instrument in controls:
            continue

        # Fit both stages on the rows that are complete for this specification;
        # missing values would otherwise produce NaN estimates or an error.
        sample = data.dropna(subset=[dep_var, endog_var, instrument] + controls)
        if sample.empty:
            continue

        # First stage: Regress Personalization on the instrument and controls
        first_stage = sm.OLS(sample[endog_var], sm.add_constant(sample[[instrument] + controls])).fit()
        # Index-aligned assignment: rows outside the sample receive NaN.
        data['Personalization_hat'] = first_stage.fittedvalues

        # Second stage: Regress the dependent variable on the predicted values from the first stage and controls
        second_stage = sm.OLS(
            sample[dep_var],
            sm.add_constant(data.loc[sample.index, ['Personalization_hat'] + controls]),
        ).fit()

        # Record the model with the lowest AIC (a NaN AIC never compares lower)
        if second_stage.aic < best_aic:
            best_aic = second_stage.aic
            best_model = (dep_label, instrument, second_stage)

    # Without this guard a None entry would break the unpacking loop below.
    if best_model is None:
        print(f"No usable instrument found for {dep_label}; skipping.")
        continue

    results.append(best_model)

# Print the best model summary for each dependent variable
for dep_label, instrument, model in results:
    print(f"Best Instrument for {dep_label}: {instrument}")
    print(model.summary())
    print("\n" + "="*80 + "\n")

In [None]:
# Define the treatment and outcome variables
treatment = data['Personalization']
outcome_valuation = data['LatestPreMoneyValuationGBP']
outcome_funding = data['FundingCumulative2023Q2']

# Define the covariates
covariates = data[['expert_founders', 'serial_founders', 'prolific_investor', 'large_investor', 'us_investor']]

# Standardize the covariates
scaler = StandardScaler()
covariates_scaled = scaler.fit_transform(covariates)

# Fit the logistic regression model to estimate propensity scores
logistic = LogisticRegression()
logistic.fit(covariates_scaled, treatment)
# Column 1 of predict_proba is the probability of the second class in
# logistic.classes_ (the treated class when the labels are 0 and 1).
propensity_scores = logistic.predict_proba(covariates_scaled)[:, 1]

# Match the treated and control units using nearest neighbors on the propensity score
# np.where yields row POSITIONS, not index labels.
treated_indices = np.where(treatment == 1)[0]
control_indices = np.where(treatment == 0)[0]

nn = NearestNeighbors(n_neighbors=1, algorithm='ball_tree')
nn.fit(propensity_scores[control_indices].reshape(-1, 1))
distances, indices = nn.kneighbors(propensity_scores[treated_indices].reshape(-1, 1))

# Find matched control indices (a control row can be matched more than once)
matched_control_indices = control_indices[indices.flatten()]

# Calculate the Average Treatment Effect on the Treated (ATT) for valuation and funding
# .iloc selects by position, matching what np.where returned; plain [] would
# look the integers up as index labels and is only right for a default index.
# NOTE(review): each mean skips NaN on its own, so the two sides of a
# difference may be averaged over different matched pairs - confirm intent.
att_valuation = outcome_valuation.iloc[treated_indices].mean() - outcome_valuation.iloc[matched_control_indices].mean()
att_funding = outcome_funding.iloc[treated_indices].mean() - outcome_funding.iloc[matched_control_indices].mean()

print(f"ATT for Valuation: {att_valuation}")
print(f"ATT for Funding: {att_funding}")