## Group Project - Investment Case Study
   #### Group Members:
    -  Vinayak Bandhu
    -  Sarathbabu Sankaran
    -  Puneet Agarwal
    -  Rakesh Gorai

In [37]:
# Supress Warnings

import warnings
warnings.filterwarnings('ignore')

In [70]:
# Import sys,Numpy,pandas,chardet packages

import sys
import numpy as np
import pandas as pd
import PyPDF2

In [39]:
# Reading data from different data files

companies = pd.read_csv('companies.txt', sep='\t', encoding='ISO-8859-1')
rounds2 = pd.read_csv('rounds2.csv', encoding='ISO-8859-1')
mapping = pd.read_csv('mapping.csv')
englist_countries = pd.

In [40]:
#Cleaning Data in Dataframes (removing special characters)

companies['permalink'] = companies.permalink.str.encode('utf-8').str.decode('ascii','ignore').str.lower()
rounds2['company_permalink'] = rounds2.company_permalink.str.encode('utf-8').str.decode('ascii','ignore').str.lower()

### Checkpoint 1: Data Cleaning 1

In [41]:
# Unique companies present in rounds2 and companies files:

print(rounds2['company_permalink'].nunique())
print(companies['permalink'].nunique())

66368
66368


In [42]:
# Check to see which column can be used as unique key in companies dataframe

## Check for non-null
print(companies.isnull().sum())

## Check for Number of rows and columns:
print(companies.shape)

## Check for Number of unique values in each columns:
print(companies.nunique())

permalink            0
name                 1
homepage_url      5058
category_list     3148
status               0
country_code      6958
state_code        8547
region            8030
city              8028
founded_at       15221
dtype: int64
(66368, 10)
permalink        66368
name             66102
homepage_url     61191
category_list    27296
status               4
country_code       137
state_code         311
region            1092
city              5111
founded_at        3978
dtype: int64


In [43]:
# Check whether any company in rounds2 which is not present in companies dataframe
~rounds2['company_permalink'].isin(companies['permalink']).all()

False

In [44]:
# Create master_frame

master_frame = pd.merge(rounds2.rename(columns={'company_permalink':'permalink'}), companies, how = 'outer', on = 'permalink')

In [45]:
# No of observations in master_frame

master_frame.shape[0]

114949

In [52]:
# Brief look at the master_dataframe

print(master_frame.shape)
print(master_frame.dtypes)
print(master_frame.describe)
print(master_frame.info)

(114949, 15)
permalink                   object
funding_round_permalink     object
funding_round_type          object
funding_round_code          object
funded_at                   object
raised_amount_usd          float64
name                        object
homepage_url                object
category_list               object
status                      object
country_code                object
state_code                  object
region                      object
city                        object
founded_at                  object
dtype: object
<bound method NDFrame.describe of                                                 permalink  \
0                                     /organization/-fame   
1                                  /organization/-qounter   
2                                  /organization/-qounter   
3                     /organization/-the-one-of-them-inc-   
4                                   /organization/0-6-com   
5                          /organization/004-tec

#### Cleaning the master_frame dataset

In [53]:
# Storing the master_frame Dataset into a temp dataframe to check to retained rows percentage after cleaning the data
master_frame_temp=master_frame

# calculating number of null values in columns and their percentage

print(master_frame.isnull().sum())
round(100*(master_frame.isnull().sum()/len(master_frame.index)), 2)

permalink                      0
funding_round_permalink        0
funding_round_type             0
funding_round_code         83809
funded_at                      0
raised_amount_usd          19990
name                           1
homepage_url                6134
category_list               3410
status                         0
country_code                8678
state_code                 10946
region                     10167
city                       10164
founded_at                 20521
dtype: int64


permalink                   0.00
funding_round_permalink     0.00
funding_round_type          0.00
funding_round_code         72.91
funded_at                   0.00
raised_amount_usd          17.39
name                        0.00
homepage_url                5.34
category_list               2.97
status                      0.00
country_code                7.55
state_code                  9.52
region                      8.84
city                        8.84
founded_at                 17.85
dtype: float64

In [54]:
# Drop unnecessary columns

master_frame.drop(['funding_round_code','homepage_url','state_code','founded_at'],axis=1,inplace=True)
round(100*(master_frame.isnull().sum()/len(master_frame.index)), 2)

permalink                   0.00
funding_round_permalink     0.00
funding_round_type          0.00
funded_at                   0.00
raised_amount_usd          17.39
name                        0.00
category_list               2.97
status                      0.00
country_code                7.55
region                      8.84
city                        8.84
dtype: float64

In [55]:
# Remove rows with total number of null values 5 or more

master_frame=master_frame[(master_frame.isnull().sum(axis=1))<4]

# Remove rows with raised_amount_usd as null

master_frame=master_frame[~master_frame.raised_amount_usd.isnull()]
round(100*(master_frame.isnull().sum()/len(master_frame.index)), 2)

permalink                  0.00
funding_round_permalink    0.00
funding_round_type         0.00
funded_at                  0.00
raised_amount_usd          0.00
name                       0.00
category_list              0.61
status                     0.00
country_code               5.70
region                     6.97
city                       6.96
dtype: float64

In [56]:
# Check number of retained rows

round((master_frame.shape[0]/master_frame_temp.shape[0])*100,2)

82.21

### Checkpoint 2: Funding Type Analysis

In [57]:
# Converting the raised_amount_usd from usd to usd million

master_frame[['raised_amount_usd']]=round(master_frame[['raised_amount_usd']].apply(lambda x:x/1000000),2)

In [58]:
# Average funding amount in venture type

round(master_frame.loc[master_frame['funding_round_type']=='venture'].mean(),2)

raised_amount_usd    11.73
dtype: float64

In [59]:
# Average funding amount in angel type

round(master_frame.loc[master_frame['funding_round_type']=='angel'].mean(),2)

raised_amount_usd    0.95
dtype: float64

In [60]:
# Average funding amount in seed type

round(master_frame.loc[master_frame['funding_round_type']=='seed'].mean(),2)

raised_amount_usd    0.72
dtype: float64

In [61]:
# Average funding amount in private_equity type

round(master_frame.loc[master_frame['funding_round_type']=='private_equity'].mean(),2)

raised_amount_usd    73.08
dtype: float64

In [71]:
# Determine the best investment type for 5-15m USD allowable investment constraint

group_by_sector = master_frame.groupby('funding_round_type')
group_by_sector['raised_amount_usd'].mean().sort_values(ascending=False).head(10)

funding_round_type
post_ipo_debt           168.704539
post_ipo_equity          82.300048
secondary_market         79.649333
private_equity           73.083562
undisclosed              19.364551
debt_financing           17.047811
venture                  11.728223
grant                     4.317797
convertible_note          1.460542
product_crowdfunding      1.369818
Name: raised_amount_usd, dtype: float64

### Checkpoint 3: Country Analysis

In [79]:
# Top nine countries(based on the total investment amount each country has received)

#Create temporary dataframe to contain only venture type investments data
venture_investment_frame = master_frame.loc[master_frame['funding_round_type'] == 'venture']

#Create top9 dataframe by grouping the venture_investment_dataframe
group_by_country=venture_investment_frame.groupby('country_code')
top9 = group_by_country['raised_amount_usd'].sum().sort_values(ascending=False).head(9)
print(top9)

country_code
USA    422510.47
CHN     39835.29
GBR     20245.59
IND     14391.91
CAN      9583.32
FRA      7259.50
ISR      6907.50
DEU      6347.04
JPN      3363.70
Name: raised_amount_usd, dtype: float64


### Checkpoint 4: Sector Analysis 1