In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#import chardet

#rawdata = open('D:\\Official\\Machine Learning\\Upgrad PG Program\\Investment Assignment\\companies.csv', 'rb').read()
#result = chardet.detect(rawdata)
#charenc = result['encoding']
#print(charenc)
#print(result)

In [None]:
# Load the companies file. The default UTF-8 codec raises on this file,
# so it is read as ISO-8859-1 instead.
companies = pd.read_csv(
    "/kaggle/input/investment-assignment-eda/companiesList.csv",
    encoding="ISO-8859-1",
)
companies.head()

In [None]:
# Show 10 random rows; the seed is fixed so Restart & Run All displays the same rows.
companies.sample(10, random_state=42)

In [None]:
# List the column labels of the companies frame.
companies.columns

In [None]:
# (rows, columns) of the companies frame.
companies.shape

In [None]:
# Print dtypes and non-null counts for each column of the companies frame.
companies.info()

# **Checkpoint 1: Data Cleaning 1**

In [None]:
# Count the missing values per column in companies
companies.isna().sum()

In [None]:
# Normalise permalink: drop non-ASCII characters (UTF-8 encode, then ASCII decode
# ignoring errors) and lower-case what remains
companies["permalink"] = (
    companies["permalink"]
    .str.encode('utf-8')
    .str.decode('ascii', 'ignore')
    .str.lower()
)

**How many unique companies are present in the companies file?**

In [None]:
# Summary statistics for companies.
# NOTE(review): presumably the `unique` figure for permalink answers the question above — confirm.
companies.describe()

In [None]:
# Load the funding rounds file. The default UTF-8 codec raises on this file,
# so it is read as ISO-8859-1 instead.
rounds2 = pd.read_csv(
    "/kaggle/input/investment-assignment-eda/rounds2.csv",
    encoding="ISO-8859-1",
)

In [None]:
# Preview the first rows of rounds2.
rounds2.head()

In [None]:
# Show 10 random rows; the seed is fixed so Restart & Run All displays the same rows.
rounds2.sample(10, random_state=42)

In [None]:
# Count the missing values per column in rounds2
rounds2.isna().sum()

In [None]:
# Normalise company_permalink: drop non-ASCII characters (UTF-8 encode, then ASCII
# decode ignoring errors) and lower-case what remains
rounds2["company_permalink"] = (
    rounds2["company_permalink"]
    .str.encode('utf-8')
    .str.decode('ascii', 'ignore')
    .str.lower()
)

**How many unique companies are present in rounds2?**

In [None]:
# Summary statistics of the company_permalink column in rounds2.
rounds2["company_permalink"].describe()

**Are there any companies in the rounds2 file which are not present in companies ?**

In [None]:
# Shape of the rounds2 rows whose lower-cased company_permalink has no match among
# the lower-cased permalinks in companies
rounds2.loc[
    ~rounds2['company_permalink'].str.lower().isin(companies['permalink'].str.lower())
].shape

In [None]:
# Build the master frame: inner join of companies (permalink) with rounds2 (company_permalink)
master_frame = companies.merge(
    rounds2,
    how="inner",
    left_on="permalink",
    right_on="company_permalink",
)

In [None]:
# Preview the merged frame.
master_frame.head()

In [None]:
# company_permalink was the right-hand join key of the merge; remove it from the master frame
master_frame = master_frame.drop(columns=['company_permalink'])
master_frame.head()

**How many observations are present in master_frame ?**

In [None]:
# (rows, columns) of the merged frame; the row count is the number of observations.
master_frame.shape

In [None]:
# Percentage of missing values per column, rounded to two decimals
(100 * (master_frame.isnull().sum() / len(master_frame.index))).round(2)

In [None]:
# Remove the columns that are not used in the rest of the analysis
master_frame = master_frame.drop(
    columns=['funding_round_code', 'founded_at', 'state_code', 'region', 'city']
)

In [None]:
# Fill missing raised_amount_usd values with the median of their funding_round_type group.
# The column is selected BEFORE transform(): without the selection, transform() runs the
# lambda over every column of the frame (including the text columns, which have no
# median) and returns a multi-column frame that cannot be assigned to a single column.
master_frame['raised_amount_usd'] = (
    master_frame.groupby("funding_round_type")["raised_amount_usd"]
    .transform(lambda amounts: amounts.fillna(amounts.median()))
)

# Keep only the rows where country_code, homepage_url and category_list are all present.
# `~` is the boolean-inversion operator; unary `-` on a boolean mask is not.
required_columns = ["country_code", "homepage_url", "category_list"]
master_frame = master_frame[~master_frame[required_columns].isnull().any(axis=1)]

# Percentage of missing values per column after cleaning
round(100*(master_frame.isnull().sum()/len(master_frame.index)), 2)

**Now the data is cleaned and has no missing values**

In [None]:
# Percentage of rows lost, relative to a fixed baseline of 114942 rows
# (presumably the size of master_frame right after the merge — TODO confirm)
baseline_rows = 114942
100 - round(100 * len(master_frame.index) / baseline_rows, 2)

**We lost 12.7% of the data in the data cleaning process**

In [None]:
# (rows, columns) of the master frame after cleaning.
master_frame.shape

# **Checkpoint 2: Funding Type Analysis**

In [None]:
# Distribution of raised_amount_usd for every funding type (log-scaled y axis)
fig, ax = plt.subplots(figsize=(30, 15))
sns.boxplot(x='funding_round_type', y='raised_amount_usd', data=master_frame, ax=ax)
ax.set_yscale('log')
ax.set_ylabel("Raised Amount in USD")
ax.set_xlabel("Funding Type")
ax.set_title("Investments in Each Funding Type")
plt.show()

In [None]:
# Number of investments per funding type, most frequent type first
fig, ax = plt.subplots(figsize=(20, 20))
sns.countplot(
    y="funding_round_type",
    data=master_frame,
    order=master_frame['funding_round_type'].value_counts().index,
    ax=ax,
)
ax.set_ylabel("Funding Type")
ax.set_xlabel("Number of Investments")
ax.set_title("Investments in Each Funding Type")
plt.show()

In [None]:
# Spark Funds only considers these four funding types; keep just those rows
master_frame = master_frame.loc[
    master_frame["funding_round_type"].isin(['venture', 'seed', 'angel', 'private_equity'])
]

In [None]:
# Number of rows per remaining funding type.
master_frame.funding_round_type.value_counts()

In [None]:
# Distribution of raised_amount_usd for the four retained funding types (log-scaled y axis)
fig, ax = plt.subplots(figsize=(10, 7))
sns.boxplot(
    x='funding_round_type',
    y='raised_amount_usd',
    palette="inferno",
    data=master_frame,
    ax=ax,
)
ax.set_yscale('log')
ax.set_ylabel("Raised Amount in USD")
ax.set_xlabel("Funding Type")
ax.set_title("Investments in Each Funding Type")
plt.show()

**The box plot analysis shows there are outliers in the data**

In [None]:
# Median raised amount per funding type, expressed in millions of USD
# (6 decimals), largest first
Fund_Type_Analysis = (
    master_frame.groupby('funding_round_type')['raised_amount_usd']
    .median()
    .div(10**6)
    .round(6)
    .sort_values(ascending=False)
)
Fund_Type_Analysis

**This analysis shows that "venture" is the best investment type for Spark Funds, given its target of 5 to 15 million USD per investment round**

# **Checkpoint 3: Country Analysis**

In [None]:
# Keep only the venture funding rounds
master_frame = master_frame.loc[master_frame["funding_round_type"] == 'venture']

In [None]:
# Nine countries with the largest total raised amount (in millions of USD, 6 decimals)
top9 = (
    master_frame.groupby('country_code')['raised_amount_usd']
    .sum()
    .div(10**6)
    .round(6)
    .sort_values(ascending=False)
    .head(9)
    .to_frame()
)
top9.head(n=10)

In [None]:
# Keep only the three English-speaking countries chosen for the analysis
master_frame = master_frame.loc[master_frame["country_code"].isin(['USA', 'GBR', 'IND'])]

# **Checkpoint 4: Sector Analysis 1**

In [None]:
# Load the category-to-sector mapping file
mapping = pd.read_csv(
    "/kaggle/input/investment-assignment-eda/mappingList.csv",
)

In [None]:
# Preview the first rows of the mapping frame.
mapping.head()

**The mapping data frame is a wide dataframe , this needs to be converted to narrow dataframe by mapping the category_list to each sector**

In [None]:
# Every column after the first one is melted as a value column
value_vars = mapping.columns[1:].tolist()

# The columns that are not value columns identify the row
id_vars = [column for column in mapping.columns if column not in value_vars]

print(value_vars, "\n")
print(id_vars)

In [None]:
# Reshape mapping from wide to long: one row per (id_vars, variable) pair with its value
mapping = mapping.melt(id_vars=id_vars, value_vars=value_vars)
mapping

In [None]:
# Keep the rows flagged with value == 1, drop the flag column,
# and call the melted `variable` column main_sector
mapping = (
    mapping.loc[mapping['value'] == 1]
    .drop(columns='value')
    .rename(columns={'variable': 'main_sector'})
)

**In the master dataframe, the category_list column holds several values joined with "|", e.g. "Application Platforms|Real Time|Social Network Media". We need only the first value ("Application Platforms") from it**

In [None]:
# Keep only the text before the first "|" in category_list
master_frame["category_list"] = master_frame["category_list"].map(
    lambda categories: str(categories).partition("|")[0]
)

In [None]:
# Display the long-format mapping (category_list -> main_sector).
mapping

In [None]:
# Attach main_sector to every row via an inner join on category_list
master_frame = master_frame.merge(mapping, how="inner", on="category_list")

In [None]:
# Preview the master frame after adding main_sector.
master_frame.head()

In [None]:
# Keep the investments between 5 and 15 million USD (both bounds included)
master_frame = master_frame[master_frame["raised_amount_usd"].between(5000000, 15000000)]

# **Checkpoint 5: Sector Analysis 2**

**Based on the country analysis, the top 3 English-speaking countries are "USA", "GBR" and "IND"**

In [None]:
# One frame per country for the sector analysis: D1 = USA, D2 = GBR, D3 = IND
D1, D2, D3 = (
    master_frame[master_frame["country_code"] == code]
    for code in ("USA", "GBR", "IND")
)

**1. Total number of investments (count)**

In [None]:
# (rows, columns) of the USA frame; rows = number of investments.
D1.shape

In [None]:
# (rows, columns) of the GBR frame; rows = number of investments.
D2.shape

In [None]:
# (rows, columns) of the IND frame; rows = number of investments.
D3.shape

**2. Total amount of investment (USD)**

In [None]:
# Total raised amount across the USA frame.
D1["raised_amount_usd"].sum()

In [None]:
# Total raised amount across the GBR frame.
D2["raised_amount_usd"].sum()

In [None]:
# Total raised amount across the IND frame.
D3["raised_amount_usd"].sum()

3. **Top sector (based on count of investments)**
4. **Second-best sector (based on count of investments)**
5. **Third-best sector (based on count of investments)**
6. **Number of investments in the top sector (refer to point 3)**
7. **Number of investments in the second-best sector (refer to point 4)**
8. **Number of investments in the third-best sector (refer to point 5)**

In [None]:
# Per-sector number of investments ("count") and total raised amount ("sum") for the USA.
# groupby().agg() with a list of function names is used instead of pivot_table with a
# set as aggfunc: a set is not a documented aggfunc type, while this form yields the
# flat "count"/"sum" columns directly.
D1_Temp = D1.groupby("main_sector")["raised_amount_usd"].agg(["count", "sum"])
D1_Temp["Country"] = "USA"
# Copy the sector out of the index so it survives later operations that drop the index
D1_Temp["main sector"] = D1_Temp.index
D1_Temp.sort_values(by="count",ascending=False)

In [None]:
# Per-sector number of investments ("count") and total raised amount ("sum") for GBR.
# groupby().agg() with a list of function names is used instead of pivot_table with a
# set as aggfunc: a set is not a documented aggfunc type, while this form yields the
# flat "count"/"sum" columns directly.
D2_Temp = D2.groupby("main_sector")["raised_amount_usd"].agg(["count", "sum"])
D2_Temp["Country"] = "GBR"
# Copy the sector out of the index so it survives later operations that drop the index
D2_Temp["main sector"] = D2_Temp.index
D2_Temp.sort_values(by="count",ascending=False)

In [None]:
# Per-sector number of investments ("count") and total raised amount ("sum") for IND.
# groupby().agg() with a list of function names is used instead of pivot_table with a
# set as aggfunc: a set is not a documented aggfunc type, while this form yields the
# flat "count"/"sum" columns directly.
D3_Temp = D3.groupby("main_sector")["raised_amount_usd"].agg(["count", "sum"])
D3_Temp["Country"] = "IND"
# Copy the sector out of the index so it survives later operations that drop the index
D3_Temp["main sector"] = D3_Temp.index
D3_Temp.sort_values(by="count",ascending=False)

9. **For the top sector count-wise (point 3), which company received the highest investment?**
10. **For the second-best sector count-wise (point 4), which company received the highest investment?**

In [None]:
# USA rows in the "Others" sector, then total raised per company name, largest first.
# NOTE(review): the sector name is hard-coded, presumably read off the table above — confirm.
Company1D1 = D1.loc[D1["main_sector"] == "Others"]
Company1D1.groupby(by="name")["raised_amount_usd"].sum().sort_values(ascending=False)

In [None]:
# GBR rows in the "Others" sector, then total raised per company name, largest first.
# NOTE(review): the sector name is hard-coded, presumably read off the table above — confirm.
Company1D2 = D2.loc[D2["main_sector"] == "Others"]
Company1D2.groupby(by="name")["raised_amount_usd"].sum().sort_values(ascending=False)

In [None]:
# IND rows in the "Others" sector, then total raised per company name, largest first.
# NOTE(review): the sector name is hard-coded, presumably read off the table above — confirm.
Company1D3 = D3.loc[D3["main_sector"] == "Others"]
Company1D3.groupby(by="name")["raised_amount_usd"].sum().sort_values(ascending=False)

In [None]:
# USA rows in the "Cleantech / Semiconductors" sector, then total raised per company name, largest first.
# NOTE(review): the sector name is hard-coded, presumably read off the table above — confirm.
Company2D1 = D1.loc[D1["main_sector"] == "Cleantech / Semiconductors"]
Company2D1.groupby(by="name")["raised_amount_usd"].sum().sort_values(ascending=False)

In [None]:
# GBR rows in the "Cleantech / Semiconductors" sector, then total raised per company name, largest first.
# NOTE(review): the sector name is hard-coded, presumably read off the table above — confirm.
Company2D2 = D2.loc[D2["main_sector"] == "Cleantech / Semiconductors"]
Company2D2.groupby(by="name")["raised_amount_usd"].sum().sort_values(ascending=False)

In [None]:
# IND rows in the "News, Search and Messaging" sector, then total raised per company name, largest first.
# NOTE(review): the sector name is hard-coded, presumably read off the table above — confirm.
Company2D3 = D3.loc[D3["main_sector"] == "News, Search and Messaging"]
Company2D3.groupby(by="name")["raised_amount_usd"].sum().sort_values(ascending=False)

# **Checkpoint 6: Plots**

**A plot showing the representative amount of investment in each funding type.**

In [None]:
# Bar chart of the representative (median) investment per funding type.
# Fund_Type_Analysis holds medians already divided by 10**6, so the axis is labelled
# in millions of USD rather than plain USD.
fig, ax = plt.subplots(figsize=(10, 7))
Fund_Type_Analysis.plot.bar(ax=ax)
ax.set_ylabel("Median Raised Amount (Million USD)")
ax.set_xlabel("Funding Type")
ax.set_title("Investments in Each Funding Type")
plt.show()

**A plot showing the top 9 countries against the total amount of investments of funding type FT**

In [None]:
# English-speaking flag for each of the countries expected in top9
Eng_Spk = pd.DataFrame({'country_code': ['USA','CHN','GBR','IND','CAN','FRA','ISR','DEU','JPN'],
                        'English_Speaking': ['Y','N','Y','Y','Y','N','Y','N','N']})

# Rebuild top9 from its two base columns before merging, so the cell can be re-run:
# reset_index() exposes country_code as a column on the first run (when it is still
# the index), and the column selection discards the flag added by a previous run, so
# the merge never produces English_Speaking_x / English_Speaking_y.
top9 = (
    top9.reset_index()[["country_code", "raised_amount_usd"]]
    .merge(Eng_Spk, how="inner", on="country_code")
)

# top9 amounts were divided by 10**6 when the frame was built, hence "Million USD"
fig, ax = plt.subplots(figsize=(10, 7))
sns.barplot(x="country_code", y="raised_amount_usd", hue="English_Speaking",
            data=top9, ci=None, ax=ax)
ax.set_ylabel("Raised Amount (Million USD)")
ax.set_xlabel("Country Code")
ax.set_title("Investments in the Top 9 Countries")
plt.show()

**A plot showing the number of investments in the top 3 sectors of the top 3 countries on one chart**

In [None]:
# Stack the three most-invested sectors (by count) of each country into one frame.
# pd.concat is used because this is a row-wise stack: an outer merge without keys
# joins on every shared column and re-sorts the rows, whereas concat keeps them in
# USA, GBR, IND order with each country's sectors sorted by count.
Top3Sectorplot = pd.concat(
    [
        sector_summary.sort_values(by="count", ascending=False).head(3)
        for sector_summary in (D1_Temp, D2_Temp, D3_Temp)
    ],
    ignore_index=True,  # the sector is already available in the "main sector" column
)
Top3Sectorplot

In [None]:
# Grouped bar chart: investment count of the top 3 sectors in each of the 3 countries
fig, ax = plt.subplots(figsize=(10, 7))
sns.barplot(x='Country', y='count', hue='main sector', data=Top3Sectorplot, ax=ax)
ax.set_ylabel("Total Investment Count")
ax.set_xlabel("Top 3 Countries")
ax.set_title("Investments in the Top 3 Sectors of the Top 3 Countries")
plt.show()