## Group Project - Investment Case Study
   #### Group Members:
    -  Vinayak Bandhu
    -  Sarathbabu Sankaran
    -  Puneet Agarwal
    -  Rakesh Gorai

In [1]:
# Supress Warnings

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Import sys,Numpy,pandas,chardet packages

import sys
import numpy as np
import pandas as pd
import chardet

In [3]:
# Understanding encoding format in the text files

print(chardet.detect(open('companies.txt','rb').read()))
print(chardet.detect(open('rounds2.csv','rb').read()))

{'encoding': 'Windows-1254', 'confidence': 0.461086087463078, 'language': 'Turkish'}
{'encoding': 'Windows-1254', 'confidence': 0.4186155476629225, 'language': 'Turkish'}


In [5]:
# Reading data from different data files

companies = pd.read_csv('companies.txt', sep='\t', encoding='ISO-8859-1')
rounds2 = pd.read_csv('rounds2.csv', encoding='ISO-8859-1')
mapping = pd.read_csv('mapping.csv')

In [77]:
#Cleaning Data in Dataframes (removing special characters)

companies['permalink'] = companies.permalink.str.encode('utf-8').str.decode('ascii','ignore').str.lower()
rounds2['company_permalink'] = rounds2.company_permalink.str.encode('utf-8').str.decode('ascii','ignore').str.lower()

### Table 1.1 Analysis

In [102]:
# Unique companies present in rounds2 and companies files:

print(rounds2['company_permalink'].nunique())
print(companies['permalink'].nunique())

66368
66368


In [103]:
# Check to see which column can be used as unique key in companies dataframe

## Check for non-null
print(companies.isnull().sum())

## Check for Number of rows and columns:
print(companies.shape)

## Check for Number of unique values in each columns:
print(companies.nunique())

permalink            0
name                 1
homepage_url      5058
category_list     3148
status               0
country_code      6958
state_code        8547
region            8030
city              8028
founded_at       15221
dtype: int64
(66368, 10)
permalink        66368
name             66102
homepage_url     61191
category_list    27296
status               4
country_code       137
state_code         311
region            1092
city              5111
founded_at        3978
dtype: int64


In [25]:
# Check for any company in rounds2 which is not present in companies dataframe
~rounds2['company_permalink'].isin(companies['permalink']).all()

False

In [26]:
# Create master_frame

master_frame = pd.merge(rounds2.rename(columns={'company_permalink':'permalink'}), companies, how = 'outer', on = 'permalink')

In [27]:
master_frame.shape

(114949, 15)

### Cleaning the master_frame

In [29]:
master_frame.isnull().sum()

permalink                      0
funding_round_permalink        0
funding_round_type             0
funding_round_code         83809
funded_at                      0
raised_amount_usd          19990
name                           1
homepage_url                6134
category_list               3410
status                         0
country_code                8678
state_code                 10946
region                     10167
city                       10164
founded_at                 20521
dtype: int64

In [30]:
round(100*(master_frame.isnull().sum()/len(master_frame.index)), 2)

permalink                   0.00
funding_round_permalink     0.00
funding_round_type          0.00
funding_round_code         72.91
funded_at                   0.00
raised_amount_usd          17.39
name                        0.00
homepage_url                5.34
category_list               2.97
status                      0.00
country_code                7.55
state_code                  9.52
region                      8.84
city                        8.84
founded_at                 17.85
dtype: float64

In [45]:
# Drop unnecessary columns

master_frame.drop(['state_code'],axis=1,inplace=True)
round(100*(master_frame.isnull().sum()/len(master_frame.index)), 2)

permalink                  0.00
funding_round_permalink    0.00
funding_round_type         0.00
funded_at                  0.00
raised_amount_usd          0.00
name                       0.00
category_list              1.77
status                     0.00
country_code               6.26
region                     7.57
city                       7.57
dtype: float64

In [46]:
# Drop unecessary rows using columns with high Null percentages
master_frame = master_frame[(master_frame.isnull().sum(axis=1)) <= 5]
round(100*(master_frame.isnull().sum()/len(master_frame.index)), 2)

permalink                  0.00
funding_round_permalink    0.00
funding_round_type         0.00
funded_at                  0.00
raised_amount_usd          0.00
name                       0.00
category_list              1.77
status                     0.00
country_code               6.26
region                     7.57
city                       7.57
dtype: float64

In [72]:
# Clean NAN values

master_frame['raised_amount_usd'].fillna(value=0,inplace=True)

# Convert raised_amount_usd to mil.

master_frame['raised_amount_usd']=round(master_frame[['raised_amount_usd']].apply(lambda x:x/1000000),2)
round(100*(master_frame.isnull().sum()/len(master_frame.index)), 2)

KeyError: 'raised_amount_usd'

### Table 2.1 Analysis

In [55]:
# Average funding amount in venture type

round(master_frame.loc[master_frame['funding_round_type']=='venture'].mean(),2)

raised_amount_usd    10.694886
dtype: float64


In [60]:
# Average funding amount in angel type

round(master_frame.loc[master_frame['funding_round_type']=='angel'].mean(),2)

raised_amount_usd    0.77
dtype: float64

In [58]:
# Average funding amount in seed type

round(master_frame.loc[master_frame['funding_round_type']=='seed'].mean(),2)

raised_amount_usd    0.568031
dtype: float64


In [62]:
# Average funding amount in private_equity type

round(master_frame.loc[master_frame['funding_round_type']=='private_equity'].mean(),2)

raised_amount_usd    63.3
dtype: float64

In [64]:
print(master_frame)

                                                permalink  \
0                                     /organization/-fame   
1                                  /organization/-qounter   
2                                  /organization/-qounter   
3                     /organization/-the-one-of-them-inc-   
4                                   /organization/0-6-com   
5                          /organization/004-technologies   
6                        /organization/01games-technology   
7                     /organization/0ndine-biomedical-inc   
8                     /organization/0ndine-biomedical-inc   
9                                    /organization/0xdata   
10                                   /organization/0xdata   
11                                   /organization/0xdata   
12                                   /organization/0xdata   
13                                        /organization/1   
14                                        /organization/1   
15                      