## Group Project - Investment Case Study
#### Group Members:
- Vinayak Bandhu
- Sarathbabu Sankaran
- Puneet Agarwal
- Rakesh Gorai

In [1]:
# Suppress warnings
# NOTE(review): with no category argument this filter ignores every warning for the
# rest of the session, so any deprecation or chained-assignment warning raised by
# later cells will not be shown.

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Import sys,Numpy,pandas,chardet packages

import sys
import numpy as np
import pandas as pd
import chardet

In [3]:
# Load the three input files into DataFrames.
# companies.txt is tab-separated; it and rounds2.csv are decoded with the same
# explicit encoding, while mapping.csv is read with pandas' defaults.

source_encoding = 'ISO-8859-1'

companies = pd.read_csv(
    'companies.txt',
    sep='\t',
    encoding=source_encoding,
)
rounds2 = pd.read_csv(
    'rounds2.csv',
    encoding=source_encoding,
)
mapping = pd.read_csv('mapping.csv')

In [4]:
# Clean the permalink columns of both frames: strip every non-ASCII character
# and lower-case what remains.

def _ascii_lower(values):
    """Return the string Series `values` with non-ASCII characters removed and lower-cased."""
    as_bytes = values.str.encode('utf-8')
    # 'ignore' silently drops any byte sequence that is not plain ASCII
    ascii_only = as_bytes.str.decode('ascii', 'ignore')
    return ascii_only.str.lower()

companies['permalink'] = _ascii_lower(companies.permalink)
rounds2['company_permalink'] = _ascii_lower(rounds2.company_permalink)

### Table 1.1 Analysis

In [5]:
# Number of distinct company permalinks in rounds2, then in companies

rounds2_unique = rounds2['company_permalink'].nunique()
companies_unique = companies['permalink'].nunique()

print(rounds2_unique)
print(companies_unique)

66368
66368


In [6]:
# Find out which column of the companies dataframe can act as a unique key

## Count of missing (null) values in each column
null_counts = companies.isnull().sum()
print(null_counts)

## (rows, columns) of the dataframe
print(companies.shape)

## Count of distinct values in each column; a unique key needs as many
## distinct values as the frame has rows
distinct_counts = companies.nunique()
print(distinct_counts)

permalink            0
name                 1
homepage_url      5058
category_list     3148
status               0
country_code      6958
state_code        8547
region            8030
city              8028
founded_at       15221
dtype: int64
(66368, 10)
permalink        66368
name             66102
homepage_url     61191
category_list    27296
status               4
country_code       137
state_code         311
region            1092
city              5111
founded_at        3978
dtype: int64


In [7]:
# Check for any company in rounds2 which is not present in companies dataframe.
# The membership mask is negated element-wise first and then reduced with .any(),
# giving True when at least one permalink is missing from companies. This avoids
# applying `~` to the scalar returned by .all(): on a plain Python bool `~` is a
# bitwise NOT (~True == -2, which is truthy), so that form is only correct while
# the scalar happens to be a numpy boolean.
(~rounds2['company_permalink'].isin(companies['permalink'])).any()

False

In [43]:
# Build master_frame as an outer join of rounds2 and companies on 'permalink'.
# rounds2 names its key 'company_permalink', so it is renamed to match first.

rounds2_keyed = rounds2.rename(columns={'company_permalink': 'permalink'})
master_frame = rounds2_keyed.merge(companies, how='outer', on='permalink')

In [44]:
# (rows, columns) of the merged frame
master_frame.shape

(114949, 15)

### Cleaning the master_frame Dataset

In [45]:
# Keep an independent snapshot of master_frame so the percentage of rows retained
# after cleaning can be computed later. .copy() is needed here: a plain assignment
# only binds a second name to the same object, so a later in-place change to
# master_frame would alter the "snapshot" as well.
master_frame_temp = master_frame.copy()

# Number of null values in each column, followed by their percentage of all rows

print(master_frame.isnull().sum())
round(100*(master_frame.isnull().sum()/len(master_frame.index)), 2)

permalink                      0
funding_round_permalink        0
funding_round_type             0
funding_round_code         83809
funded_at                      0
raised_amount_usd          19990
name                           1
homepage_url                6134
category_list               3410
status                         0
country_code                8678
state_code                 10946
region                     10167
city                       10164
founded_at                 20521
dtype: int64


permalink                   0.00
funding_round_permalink     0.00
funding_round_type          0.00
funding_round_code         72.91
funded_at                   0.00
raised_amount_usd          17.39
name                        0.00
homepage_url                5.34
category_list               2.97
status                      0.00
country_code                7.55
state_code                  9.52
region                      8.84
city                        8.84
founded_at                 17.85
dtype: float64

In [46]:
# Drop unnecessary columns.
# The result is reassigned instead of dropping in place, so any other name still
# bound to the original frame object (such as master_frame_temp) is left unmodified.

master_frame = master_frame.drop(columns=['funding_round_code','homepage_url','state_code','founded_at'])

# Percentage of null values remaining in each column
round(100*(master_frame.isnull().sum()/len(master_frame.index)), 2)

permalink                   0.00
funding_round_permalink     0.00
funding_round_type          0.00
funded_at                   0.00
raised_amount_usd          17.39
name                        0.00
category_list               2.97
status                      0.00
country_code                7.55
region                      8.84
city                        8.84
dtype: float64

In [47]:
# Remove rows with raised_amount_usd as null.
# .copy() makes the filtered result an independent frame, so assigning to its
# columns afterwards is a plain assignment rather than a write into a slice of the
# unfiltered frame (the situation pandas reports as SettingWithCopyWarning).

master_frame = master_frame.loc[master_frame['raised_amount_usd'].notna()].copy()

# Percentage of null values remaining in each column
round(100*(master_frame.isnull().sum()/len(master_frame.index)), 2)

permalink                  0.00
funding_round_permalink    0.00
funding_round_type         0.00
funded_at                  0.00
raised_amount_usd          0.00
name                       0.00
category_list              1.10
status                     0.00
country_code               6.16
region                     7.42
city                       7.42
dtype: float64

In [50]:
# Percentage of rows kept in master_frame relative to the master_frame_temp snapshot

rows_kept = len(master_frame)
rows_before = len(master_frame_temp)
round((rows_kept/rows_before)*100, 2)

82.61

### Table 2.1 Analysis

In [1]:
# Re-express raised_amount_usd in millions of USD, rounded to two decimals.
# The column is overwritten under the same name, so running this cell a second
# time would divide the values by a million again.

amount_in_millions = master_frame['raised_amount_usd'] / 1000000
master_frame['raised_amount_usd'] = amount_in_millions.round(2)

NameError: name 'master_frame' is not defined

In [55]:
# Average funding amount in venture type.
# numeric_only=True restricts the mean to numeric columns; master_frame still holds
# text columns, and from pandas 2.0 a frame-wide mean() without it raises TypeError.

master_frame.loc[master_frame['funding_round_type']=='venture'].mean(numeric_only=True).round(2)

raised_amount_usd    10.694886
dtype: float64


In [60]:
# Average funding amount in angel type.
# numeric_only=True restricts the mean to numeric columns; master_frame still holds
# text columns, and from pandas 2.0 a frame-wide mean() without it raises TypeError.

master_frame.loc[master_frame['funding_round_type']=='angel'].mean(numeric_only=True).round(2)

raised_amount_usd    0.77
dtype: float64

In [58]:
# Average funding amount in seed type.
# numeric_only=True restricts the mean to numeric columns; master_frame still holds
# text columns, and from pandas 2.0 a frame-wide mean() without it raises TypeError.

master_frame.loc[master_frame['funding_round_type']=='seed'].mean(numeric_only=True).round(2)

raised_amount_usd    0.568031
dtype: float64


In [62]:
# Average funding amount in private_equity type.
# numeric_only=True restricts the mean to numeric columns; master_frame still holds
# text columns, and from pandas 2.0 a frame-wide mean() without it raises TypeError.

master_frame.loc[master_frame['funding_round_type']=='private_equity'].mean(numeric_only=True).round(2)

raised_amount_usd    63.3
dtype: float64