In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency 


### 1) Read data and get quick overview about dataset using pandas

In [None]:

# The date column is deliberately read as plain text: the cells in section 2.1
# scan and repair it with regexes, which only works on strings, and convert it
# to datetime themselves once every entry is well-formed.
df = pd.read_csv(
    '/kaggle/input/indian-startup-funding/startup_funding.csv',
    dtype={'Date dd/mm/yyyy': str},
)
df

In [None]:
### find total nan values in each column

df.isnull().sum()

Observations: there are many NaN values in several columns, so the data needs to be preprocessed and cleaned.

In [None]:
df.dtypes

## 2) Data Preprocessing and Data Cleaning

### 2.1) preprocessing on Date column

Observations: I want to convert the `Date dd/mm/yyyy` column to datetime, but the conversion fails because some entries are malformed,
so we first have to find all the invalid dates and correct them.

In [None]:
invalid_dates = []
## regex for a dd/mm/yyyy date (raw string, so \d reaches the regex engine
## without Python treating it as an invalid string escape)
date_pattern = r'\d{2}/\d{2}/\d{4}'
for i in range(len(df)):
    d = df.loc[i,'Date dd/mm/yyyy']
    if not re.search(date_pattern, d):
        ## keep the row label next to the offending value
        invalid_dates.append([i,d])

## Print after the scan. (The previous `for ... else` ran its else-branch
## unconditionally because the loop never breaks, so a plain loop is clearer.)
for entry in invalid_dates:
    print(entry)

As you can see, all invalid dates and their corresponding index values were printed.

A few of the invalid dates contain `.` or `//`, so let's replace those with `/`.

In [None]:
def findInvalidDates(df, pattern=r'\d{2}/\d{2}/\d{4}'):
    """Repair simple separator mistakes in the date column and report what is left.

    Every entry of ``df['Date dd/mm/yyyy']`` that does not contain a
    ``dd/mm/yyyy`` date has its ``.`` separators turned into ``/`` and any
    ``//`` collapsed to ``/``. The repaired text is written back to ``df``
    (the frame is modified in place). Entries that still do not match after
    the repair, and entries that are not strings at all, are reported.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a text column named ``'Date dd/mm/yyyy'``.
    pattern : str, optional
        Regex that a valid entry must contain; defaults to dd/mm/yyyy.

    Returns
    -------
    str
        ``"remaining invalid dates : [...]"`` where the list holds
        ``[row_label, value]`` pairs for the entries that are still invalid.
    """
    column = 'Date dd/mm/yyyy'
    invalid_dates = []

    for i in df.index:
        d = df.loc[i, column]

        # Missing / non-text cells cannot be repaired by string replacement.
        if not isinstance(d, str):
            invalid_dates.append([i, d])
            continue

        if re.search(pattern, d):
            continue

        # Apply both repairs: '.' first so that a resulting '//' is collapsed too.
        fixed = d.replace(".", "/").replace("//", "/")
        if fixed != d:
            df.loc[i, column] = fixed

        # Re-validate: a replacement alone does not guarantee a well-formed date.
        if not re.search(pattern, fixed):
            invalid_dates.append([i, fixed])

    return f"remaining invalid dates : {invalid_dates}"

In [None]:
findInvalidDates(df)

Observations: only 3 dates are still invalid, so we correct them manually; writing a program just for them would take more time.

In [None]:
# Hand-written corrections for the entries the automatic repair could not fix
# (row label -> corrected dd/mm/yyyy text).
manual_date_fixes = {
    192: "05/07/2018",
    2571: "01/07/2015",
    2606: "01/07/2015",
}
for row_label, corrected in manual_date_fixes.items():
    df.loc[row_label, 'Date dd/mm/yyyy'] = corrected

# Re-run the check; the returned message should now list no invalid dates.
findInvalidDates(df)

Observations: now there are no invalid dates in the Date column, so we can convert it to datetime.

In [None]:
# The column holds dd/mm/yyyy text, so the day comes first. Without
# dayfirst=True pandas reads ambiguous values month-first (05/07/2018 would
# become 7 May instead of 5 July).
df['Date dd/mm/yyyy'] = pd.to_datetime(df['Date dd/mm/yyyy'], dayfirst=True)
df.dtypes

### 2.2) preprocessing on StartupName column

In [None]:
## find startup names that look like a URL instead of a name

# regex for urls/website links: "http", or "www." followed by word characters.
# Raw string + escaped dot, so "www." only matches a literal dot and names that
# merely contain "www" followed by a letter are not flagged.
link_pattern = r"(http)|(www\.\w+)"
invalid_names = []
for i in range(len(df)):
    d = df.loc[i,'Startup Name']

    if re.search(link_pattern, d):
        # keep the row label so the entry can be corrected by hand
        invalid_names.append([i,d])
invalid_names

As we can see, there is only one entry where the startup name is wrong, so we correct it manually.

In [None]:
df.loc[3,'Startup Name'] = "wealthbucket"
df

### 2.3) To fill the NaN values in the columns, we first draw some visuals so we can identify patterns in the data.

In [None]:
## For every value of one categorical column, count the NaNs in each other column
def showNaNValuesForEachItemInCategory(col):
    """Return the per-group NaN counts of the global ``df``.

    The frame is grouped by ``col``; for each group and each remaining column
    the result holds (group size - number of non-null cells), i.e. how many
    cells of that column are NaN inside the group.
    """
    grouped = df.groupby(col)
    rows_per_group = grouped.size()
    non_null_per_group = grouped.count()

    # rsub computes rows_per_group - non_null_per_group, aligned on the group labels
    return non_null_per_group.rsub(rows_per_group, axis=0)
showNaNValuesForEachItemInCategory('InvestmentnType')

In [None]:
## most of nan values are for Private Equity , seed funding

df[df['InvestmentnType']=="Private Equity"]["Industry Vertical"].value_counts()

In [None]:
df[df['Industry Vertical']=='Consumer Internet']["Industry Vertical"].value_counts()

In [None]:
### most of the private equality and seed funding investmenttype have industry vertical is consumer internet
df['Industry Vertical'] = df['Industry Vertical'].fillna("Consumer Internet")
df.isnull().sum()
df['Industry Vertical'].value_counts()

In [None]:
df['City  Location'].value_counts()

Observations: there are some invalid city names in the City Location column, so let's correct those first.

In [None]:
invalid_location = []
for i in range(len(df)):

    # str() turns a missing value into the text 'nan', which passes the check below
    l = str(df.loc[i,'City  Location'])

    # anything other than letters and whitespace marks the entry as suspicious
    if re.search(r'[^a-zA-Z\s]', l):
        invalid_location.append([i,l])

# Print after the scan (the loop never breaks, so a `for ... else` adds nothing).
for entry in invalid_location:
    print(entry)
    

In [None]:
def removeSpecialChars(s):
    """Return ``s`` with every character that is not a letter or whitespace removed.

    The pattern is a raw string so that ``\\s`` is passed to the regex engine
    as written instead of being treated as an (invalid) Python string escape.
    """
    return re.sub(r'[^a-zA-Z\s]', '', s)

In [None]:
## just test function 
removeSpecialChars("del///2/*hi")

In [None]:
invalid_location = []
for i in range(len(df)):

    l = str(df.loc[i,'City  Location'])

    # entries made only of letters and whitespace need no repair
    if not re.search(r'[^a-zA-Z\s]', l):
        continue

    if '/' in l:
        # "CityA / CityB" style entry: keep the first city. strip() removes the
        # blank left before the slash, otherwise "Pune " and "Pune" would be
        # counted as two different cities.
        s = removeSpecialChars(l.split("/")[0]).strip()
        df.loc[i,'City  Location'] = s
    else:
        invalid_location.append([i,l])

# Entries that still need attention
for entry in invalid_location:
    print(entry)

### Many of the city names are now corrected, but a few of them still are not

In [None]:
## correct all city names which contain "xc2" as a substring
## (presumably a leaked, escaped non-breaking-space prefix -- TODO confirm against the raw CSV)

for i in range(len(df)):
    l = str(df.loc[i,'City  Location'])
    if "xc2" in l:
        ## keep only the text after the "xa0" marker. This is the same marker the
        ## amount clean-up splits on, and it is more precise than splitting on
        ## any "0" character.
        df.loc[i,'City  Location'] = l.split("xa0")[-1].strip()
    
        

In [None]:
df['City  Location'].unique()

Now there are no invalid values in the City Location column, so we can perform further analysis.

In [None]:
showNaNValuesForEachItemInCategory("City  Location")

In [None]:
df["Industry Vertical"]

In [None]:
## use stastical test to check city location is dependent on any other column or not 


def checkDependencyBetweenCategories(cat1,cat2,alpha=0.05):
    """Chi-square test of independence between two categorical columns of the global ``df``.

    Builds the contingency table of ``df[cat1]`` against ``df[cat2]``, runs
    ``scipy.stats.chi2_contingency`` on it and prints the p-value followed by
    the verdict.

    Parameters
    ----------
    cat1, cat2 : str
        Names of the two columns to compare.
    alpha : float, optional
        Significance level; H0 (independence) is rejected when p <= alpha.

    Returns
    -------
    None
        The result is only printed.
    """
    data = pd.crosstab(index=df[cat1],columns = df[cat2])
    # only the p-value of the (statistic, p, dof, expected) result is needed
    _, p, _, _ = chi2_contingency(data)

    # interpret p-value
    print("p value is " + str(p))
    if p <= alpha:
        print(cat1 ,' is Dependent  on ',cat2," (reject H0)")
    else:
        print(cat1 , " & ",cat2,'both are Independent (H0 holds true)')

In [None]:
checkDependencyBetweenCategories("City  Location","Industry Vertical")
checkDependencyBetweenCategories("City  Location","InvestmentnType")

Observation: City Location depends on both Industry Vertical and InvestmentnType.

In [None]:
# let's try to identify some patterns
df[pd.isnull(df['City  Location'])]['Industry Vertical'].value_counts()

In [None]:
df[df['Industry Vertical']=="Consumer Internet"]['City  Location'].value_counts()

Observations: most of the Consumer Internet entries are for Bangalore and Mumbai.

In [None]:
def getMaxCountValueBetweenTwoCats(cat1,cat2):
    """Print the most frequent (cat1 value, cat2 value) pair in the global ``df``.

    A cross-tabulation of the two columns is built and the cell with the
    largest count is located; its row label, column label and count are
    printed. Nothing is returned.
    """
    table = pd.crosstab(df[cat1],df[cat2])
    counts = table.values

    # argmax works on the row-major flattened array, so the flat position
    # splits into (row, column) with a divmod by the number of columns
    row_pos, col_pos = divmod(counts.argmax(), counts.shape[1])
    row_label = table.index[row_pos]
    col_label = table.columns[col_pos]

    print(f"For index : {row_label} and column : {col_label} max value is : {counts.max()}")

In [None]:
getMaxCountValueBetweenTwoCats("City  Location","Industry Vertical")

In [None]:
getMaxCountValueBetweenTwoCats("City  Location","InvestmentnType")

In [None]:
getMaxCountValueBetweenTwoCats("Industry Vertical","InvestmentnType")

In [None]:
df['InvestmentnType'].value_counts()

In [None]:

## fill nan values in City location column based on values of industry vertical

# Rows with no city whose vertical is "Consumer Internet" get "Bangalore";
# all other rows are left untouched by this cell.
city_is_missing = df["City  Location"].isnull()
is_consumer_internet = df["Industry Vertical"] == "Consumer Internet"
df.loc[city_is_missing & is_consumer_internet, "City  Location"] = "Bangalore"

In [None]:
# Every city that is still missing falls back to "Mumbai".
df["City  Location"] = df["City  Location"].fillna("Mumbai")

# Remaining NaN count, as the cell's last top-level expression so the notebook
# actually displays it (inside the former `for ... else` it was computed but
# never shown).
df['City  Location'].isnull().sum()

In [None]:
## "Bengaluru" and "Bangalore" are the same city -> keep the single spelling "Bangalore"
df['City  Location'] = df['City  Location'].replace("Bengaluru", "Bangalore")
df['City  Location'].value_counts()
## there are too many cities to read as a table, so let's draw a wordcloud for them

In [None]:
from wordcloud import WordCloud, STOPWORDS

## All city names joined into one space-separated text, the input WordCloud works on
city_text = " ".join(df['City  Location'])

## STOPWORDS (is, a, an, the, he, she, ...) are excluded from the cloud.
## generate() returns an image-like object that plt.imshow() can draw.
cloud_builder = WordCloud(
    width=3000,
    height=2000,
    background_color="black",
    stopwords=STOPWORDS,
)
wordcloud = cloud_builder.generate(city_text)

In [None]:
# Draw the word cloud on an explicit, axis-free canvas
fig, ax = plt.subplots(figsize=(30, 20))
ax.axis('off')
ax.imshow(wordcloud)
plt.show()

In [None]:
df.isnull().sum() 

In [None]:
## missing SubVertical entries all share the single placeholder "others"
df['SubVertical'] = df['SubVertical'].fillna("others")

## missing Investors Name entries each get their own numbered placeholder
## ("Name 1", "Name 2", ...) in row order
counter = 1
rows_without_investor = df.index[df['Investors Name'].isnull()]
for row_label in rows_without_investor:
    df.loc[row_label, 'Investors Name'] = f"Name {counter}"
    counter += 1


df.isnull().sum()
        

In [None]:
# Drop every row whose InvestmentnType is NaN. The result is a new frame that
# replaces `df`; the surviving rows keep their original index labels, so the
# index is no longer contiguous after this cell.
df = df.dropna(subset=['InvestmentnType'])
df

In [None]:
df.isnull().sum()

In [None]:
## Reset the index to 0..len(df)-1 and discard the old labels (drop=True).
## The loops in the following cells address rows with df.loc[i, ...] for
## i in range(len(df)), which requires a gap-free 0-based index.
df = df.reset_index(drop=True)
df

### 2.4) preprocessing on InvestmentnType

In [None]:
# Spelling variants found in the column, grouped by the label they stand for
seed_angel_funding = [
    'Seed/ Angel Funding',
    'Seed / Angel Fundin',
    'Seed/Angel Funding',
    'Angel / Seed Funding',
    'Seed / Angle Funding',
]
seed_funding = [
    'Seed Funding',
    'Seed\\nFunding',
    'Seed',
    'Seed funding',
    'Seed Funding Round',
]
debt_funding = ['Debt Funding', 'Debt-Funding', 'Debt']

# Rewrite every variant to its canonical label
canonical_labels = (
    ("Seed Funding", seed_funding),
    ("Seed / Angel Funding", seed_angel_funding),
    ("Debt Funding", debt_funding),
)
for canonical, variants in canonical_labels:
    is_variant = df['InvestmentnType'].isin(variants)
    df.loc[is_variant, "InvestmentnType"] = canonical

df['InvestmentnType'].value_counts()

### 2.5) preprocessing on Amount in USD column

In [None]:
## first strip the thousands separators so the column can later be cast to float
invalid_vals = []

for j in range(len(df)):
    p = df.loc[j,"Amount in USD"]
    if pd.isnull(p):
        continue

    cleaned = p.replace(",","")
    df.loc[j,"Amount in USD"] = cleaned

    # Validate the CLEANED text. Checking the raw value flags every amount that
    # merely contained commas, because "," is itself a non-word character.
    has_digit = re.search(r"[0-9]", cleaned) is not None
    has_symbol = re.search(r"\W", cleaned) is not None

    # one entry per row, even when both checks fail
    if (not has_digit) or has_symbol:
        invalid_vals.append([j,cleaned])


df

In [None]:
invalid_vals

In [None]:
# Rows whose amount is set to NaN by hand
# (presumably the non-numeric entries listed in invalid_vals -- verify against that output)
for row_label in (20, 33, 57, 86, 88, 109, 136):
    df.loc[row_label, "Amount in USD"] = np.nan

# Rows whose amount is replaced by a hand-typed number
df.loc[80,"Amount in USD"] = 15109500
df.loc[106,"Amount in USD"] = 14342000

# Entries containing the text "xa0": either "N/A" (-> NaN) or a number that
# follows the marker (-> keep only the part after it)
for i in range(len(df)):
    p = str(df.loc[i,"Amount in USD"])
    if "xa0" not in p:
        continue

    if "N/A" in p:
        df.loc[i,"Amount in USD"] = np.nan
        print("N/A settal")
    else:
        df.loc[i,"Amount in USD"] = p.split("xa0")[-1]
        print("Amount settaled")

In [None]:
df['Amount in USD'] = df['Amount in USD'].astype("float")

Let's fill the NaN values in the amount column.

In [None]:
df['Amount in USD'].describe()

In [None]:
## for now we fill nan values with median value 
df['Amount in USD'].mean() , df['Amount in USD'].median()

In [None]:
df['Amount in USD'] = df['Amount in USD'].fillna(df['Amount in USD'].median())
df.isnull().sum()

In [None]:
## remarks columns is not much useful for data analysis and modeling so drop it
df = df.drop("Remarks",axis=1)
df.dtypes

## Let's start with data visualization 

In [None]:
def countplotForCategory(col1, top_n=10):
    """Plot how often the most frequent values of a column occur in the global ``df``.

    Parameters
    ----------
    col1 : str
        Name of the categorical column to plot.
    top_n : int, optional
        Number of most frequent values to show (default 10, the previously
        hard-coded value).

    Draws a horizontal seaborn count plot, ordered from most to least
    frequent, and shows it. Returns nothing.
    """
    # value_counts() is sorted by frequency, so head(top_n) yields the top categories
    top_values = df[col1].value_counts().head(top_n).index
    sns.countplot(y=col1,order=top_values,data=df)
    plt.title(f"Countplot for {col1}")
    plt.show()

countplotForCategory(col1 = "Industry Vertical")

In [None]:
countplotForCategory("City  Location")

In [None]:
countplotForCategory("Investors Name")

In [None]:
## Above we can see both "Undisclosed Investors" and "Undisclosed investors"; they mean the same thing and differ only in spelling, so let's merge them into one category

In [None]:
# Spellings that all stand for an undisclosed investor
disclosed_invet_names = [
    'Undisclosed Investors',
    'Undisclosed Investor',
    'Undisclosed investor',
    'undisclosed investors',
    'undisclosed investor',
    'Undisclosed investors',
    'Undisclosed',
    'Undisclosed',
]

# Collapse every variant into the single label "Undisclosed"
is_undisclosed = df['Investors Name'].isin(disclosed_invet_names)
df.loc[is_undisclosed, "Investors Name"] = "Undisclosed"


In [None]:
countplotForCategory("Investors Name")

In [None]:
countplotForCategory("InvestmentnType")

In [None]:
countplotForCategory("InvestmentnType")

## Hope you learned something from this. Thanks for reading!
## Please give an upvote if you have learnt something :) Happy Learning