Ungrouped

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import skew
from tabulate import tabulate

# Load the startup dataset and substitute 0 for every missing cell in a single
# chained step, so the summary functions below never receive NaN.
# NOTE(review): zero-filling makes missing entries count as real zeros in every
# statistic computed from these columns — confirm that is intended.
startup_data = pd.read_csv('startup_data.csv').fillna(0)

# Pull out the two columns that are summarised below.
funding_amounts, investor_count = (
    startup_data[column]
    for column in ('Funding Amount in $', 'No. of Investors')
)

def _summary_rows(values):
    """Build the ``[label, value]`` rows of descriptive statistics for one column.

    Parameters
    ----------
    values : pandas.Series
        Numeric column to summarise. It must be a Series because the mode is
        taken with ``Series.mode()``.

    Returns
    -------
    list[list]
        Seven ``[statistic name, value]`` pairs, in display order: Mean,
        Median, Mode, Range, Variance, Standard Deviation, Skewness.

    Notes
    -----
    Variance and standard deviation use NumPy's defaults (``ddof=0``, i.e. the
    population formulas). When several values tie for the mode, the first entry
    of ``Series.mode()`` is reported.
    """
    return [
        ['Mean', np.mean(values)],
        ['Median', np.median(values)],
        ['Mode', values.mode()[0]],
        ['Range', np.ptp(values)],
        ['Variance', np.var(values)],
        ['Standard Deviation', np.std(values)],
        ['Skewness', skew(values)],
    ]


# Both columns get exactly the same set of statistics, so they share one helper
# instead of two hand-copied lists that could drift apart.
funding_statistics = _summary_rows(funding_amounts)    # Funding Amount
investor_statistics = _summary_rows(investor_count)    # Number of Investors

# Render both summaries as grid tables; each heading is printed on its own line
# directly above its table.
table_options = dict(headers=['Statistic', 'Value'], tablefmt='grid')

print("Funding Amount Statistics:", tabulate(funding_statistics, **table_options), sep="\n")

print("\nNumber of Investors Statistics:", tabulate(investor_statistics, **table_options), sep="\n")

Funding Amount Statistics:
+--------------------+--------------+
| Statistic          |        Value |
+====================+==============+
| Mean               |  3.14865e+08 |
+--------------------+--------------+
| Median             |  4.10877e+07 |
+--------------------+--------------+
| Mode               |  1.5e+07     |
+--------------------+--------------+
| Range              |  2.47676e+10 |
+--------------------+--------------+
| Variance           |  2.64415e+18 |
+--------------------+--------------+
| Standard Deviation |  1.62608e+09 |
+--------------------+--------------+
| Skewness           | 12.1667      |
+--------------------+--------------+

Number of Investors Statistics:
+--------------------+----------+
| Statistic          |    Value |
+====================+==========+
| Mean               | 10.55    |
+--------------------+----------+
| Median             |  9       |
+--------------------+----------+
| Mode               |  6       |
+--------------------+----------+
| Range              | 45       |
+--------------------+----

Grouped

In [2]:
import pandas as pd
import numpy as np
from tabulate import tabulate

# Load the raw startup dataset.
startup_data = pd.read_csv('startup_data.csv')

# Replace NaN values in the 'No. of Investors' column with a default value (0 in this case).
# The filled column is assigned back rather than calling fillna(inplace=True) on
# the selected column: that chained form operates on an intermediate object,
# emits a FutureWarning in pandas 2.x, and leaves the frame unchanged when
# Copy-on-Write is enabled.
startup_data['No. of Investors'] = startup_data['No. of Investors'].fillna(0)

# Label every row with the 100-wide bucket its investor count falls in,
# formatted as "<lower>-<lower + 100>" (e.g. "0-100").
startup_data['Investor Range'] = startup_data['No. of Investors'].apply(
    lambda count: f"{int(count // 100) * 100}-{int(count // 100) * 100 + 100}"
)

# Frequency table: one row per distinct investor count with the number of
# startups that have it. The two columns are renamed positionally.
investor_frequency = startup_data['No. of Investors'].value_counts().reset_index()
investor_frequency.columns = ['No. of Investors', 'Frequency']

# Attach each row's frequency to the original data. After this merge every
# distinct count appears `Frequency` times in startup_data.
startup_data = startup_data.merge(investor_frequency, on='No. of Investors')

# Show the merged frame beneath its heading (heading and frame on separate lines).
print("Dataset with Frequency:", startup_data, sep="\n")

# Write the merged frame, without the index column, to 'grouped.csv'.
grouped_file_path = '/content/grouped.csv'
startup_data.to_csv(path_or_buf=grouped_file_path, index=False)

# Grouped statistics are computed from the frequency table: one row per distinct
# investor count, ordered by that count. The merged startup_data frame repeats
# every count `Frequency` times, so weighting its rows by `Frequency` again
# would weight each value by its frequency squared and inflate the total.
frequency_table = (
    investor_frequency
    .sort_values('No. of Investors')
    .reset_index(drop=True)
)
total_observations = frequency_table['Frequency'].sum()

# Calculate mean using the formula ∑(investor_count * frequency) / total observations
mean_value = (frequency_table['No. of Investors'] * frequency_table['Frequency']).sum() / total_observations


# Calculate median
def calculate_median(df):
    """Return the median of a discrete frequency distribution.

    Parameters
    ----------
    df : pandas.DataFrame
        Frequency table with a 'No. of Investors' column (the values) and a
        'Frequency' column (how many observations take each value). Rows may
        be in any order; they are sorted by value here.

    Returns
    -------
    scalar
        The value at which the cumulative frequency first reaches half of the
        total. If the cumulative frequency lands exactly on half of the total,
        the two middle observations are different values and their average is
        returned. NaN is returned when the table holds no observations.
    """
    # Rows with no observations cannot be a middle observation.
    observed = df.loc[df['Frequency'] > 0].sort_values('No. of Investors')
    if observed.empty:
        return float('nan')

    values = observed['No. of Investors'].to_numpy()
    cumulative = observed['Frequency'].cumsum().to_numpy()
    half_total = cumulative[-1] / 2

    # First class whose cumulative frequency reaches half of the observations.
    position = int(np.argmax(cumulative >= half_total))
    if cumulative[position] == half_total:
        # Even total with the middle pair straddling two classes; a following
        # class always exists because the final cumulative value is 2 * half_total.
        return (values[position] + values[position + 1]) / 2
    return values[position]


# Calculate mode
def calculate_mode(df):
    """Return the value with the highest frequency.

    Parameters
    ----------
    df : pandas.DataFrame
        Frequency table with 'No. of Investors' and 'Frequency' columns.

    Returns
    -------
    scalar
        The 'No. of Investors' entry of the first row holding the maximum
        'Frequency' (so ties resolve to the earliest row), or NaN when the
        table is empty.
    """
    if df.empty:
        return float('nan')
    return df.loc[df['Frequency'].idxmax(), 'No. of Investors']


# The table is sorted by value, so a tie for the mode resolves to the smallest value.
median_value = calculate_median(frequency_table)
mode_value = calculate_mode(frequency_table)

# Calculate range
range_value = frequency_table['No. of Investors'].max() - frequency_table['No. of Investors'].min()

# Calculate variance: ∑(frequency * (investor_count - mean)^2) / (total observations - 1).
# This is the sample (N - 1) form.
squared_deviation = (frequency_table['No. of Investors'] - mean_value) ** 2
variance_value = (frequency_table['Frequency'] * squared_deviation).sum() / (total_observations - 1)

# Calculate standard deviation
std_dev_value = np.sqrt(variance_value)

# Calculate skewness as (mean - mode) / standard deviation
skewness_value = (mean_value - mode_value) / std_dev_value

# Prepare data for tabulate
data = [
    ['Mean', mean_value],
    ['Median', median_value],
    ['Mode', mode_value],
    ['Range', range_value],
    ['Variance', variance_value],
    ['Standard Deviation', std_dev_value],
    ['Skewness', skewness_value]
]

# Display the results in tabular form
print(tabulate(data, headers=['Statistic', 'Value'], tablefmt='grid'))


Dataset with Frequency:
     Unnamed: 0                               Company       City  \
0             0                         Urban Company    Gurgaon   
1            17                                 Groww  Bengaluru   
2            68                              Pee Safe    Gurgaon   
3           103                           Housing.com    Gurgaon   
4           144                Bombay Shaving Company  New Delhi   
..          ...                                   ...        ...   
295         284                                   TVF     Mumbai   
296         169                                Shuttl    Gurgaon   
297         181                      VerSe Innovation  Bengaluru   
298         177  Digital Class E-Learning Marketplace  Ahmedabad   
299         285            WOW Skin Science India Ltd  Bengaluru   

     Starting Year                                           Founders  \
0             2014  Abhiraj Singh Bhal, Raghav Chandra, Varun Khaitan   
1            