In [1]:
import pandas as pd
import numpy as np
import time

In [None]:
df = pd.read_csv("startup_funding.csv", encoding = 'utf-8')
df.head()

## Filtering out startups from New Delhi and Bangalore

In [6]:
#handling wrong city names and foreign city names
df['CityLocation'].dropna(inplace=True) 
df['CityLocation']=df['CityLocation'].apply(lambda x: str(x).split('/')[0].strip()) 
df['CityLocation'].replace("Delhi","New Delhi",inplace=True) 
df['CityLocation'].replace("bangalore","Bangalore",inplace=True) 

#converting AmountInUSD to int after removing commas
df.AmountInUSD.fillna("0", inplace = True)
df.AmountInUSD = df.AmountInUSD.apply(lambda amount: int(str(amount).replace(",", "")))

#city columns and funding column
df2 = df.loc[:,["CityLocation", "AmountInUSD"]]
#Bangalore and New Delhi only
bn = df2[(df2.CityLocation == "Bangalore") | (df2.CityLocation == "New Delhi")]
bn.reset_index(drop = True, inplace = True)
bn.head()

Unnamed: 0,CityLocation,AmountInUSD
0,Bangalore,1300000
1,New Delhi,0
2,Bangalore,1000000
3,Bangalore,8500000
4,Bangalore,0


In [None]:
#population average
pop = bn.AmountInUSD.mean()
print(bn.AmountInUSD.mean())

## Sampling without replacement

In [59]:
#sample mean
sample = bn.sample(50, random_state = 2)
sam = sample.AmountInUSD.mean()
print(sam)

3417720.0


In [62]:
#sampling error
print("Sampling Error =", pop-sam)

Sampling Error = 7560035.4765625


## Sampling with replacement

In [63]:
#sample mean
sample = bn.sample(50, replace = True, random_state = 2)
samr = sample.AmountInUSD.mean()
print(samr)

8238800.0


In [64]:
#sampling error
print("Sampling Error =", pop-samr)

Sampling Error = 2738955.4765625


In [12]:
pop = pd.DataFrame([1,2,3,4,5,6])
pop.sample(5, replace = True, random_state = 2)

Unnamed: 0,0
0,1
5,6
0,1
3,4
2,3


## groupby?

In [12]:
bn.groupby("CityLocation")['AmountInUSD'].sum()

CityLocation
Bangalore    8422974108
New Delhi    2818247500
Name: AmountInUSD, dtype: int64

## Stratified Sampling - using counts

In [17]:
bn.CityLocation.value_counts()

Bangalore    635
New Delhi    389
Name: CityLocation, dtype: int64

In [18]:
bn.CityLocation.value_counts().sum()

1024

In [21]:
635/1024

0.6201171875

In [22]:
389/1024

0.3798828125

In [24]:
b = bn[bn.CityLocation == "Bangalore"]
n = bn[bn.CityLocation == "New Delhi"]
b.head(), n.head()

(  CityLocation  AmountInUSD
 0    Bangalore      1300000
 2    Bangalore      1000000
 3    Bangalore      8500000
 4    Bangalore            0
 5    Bangalore      1000000,
    CityLocation  AmountInUSD
 1     New Delhi            0
 17    New Delhi            0
 18    New Delhi            0
 22    New Delhi            0
 23    New Delhi     25000000)

### Basketball dataset

In [39]:
bb = pd.DataFrame({'team': ['A', 'A', 'B', 'B', 'B', 'B', 'B', 'B'],
                   'position': ['G', 'G', 'F', 'G', 'F', 'F', 'C','C'],
                   'assists': [5, 7, 7, 8, 5, 7, 6, 9],
                   'rebounds': [11, 8, 10, 6, 6, 9, 6, 10]})
bb.head()

Unnamed: 0,team,position,assists,rebounds
0,A,G,5,11
1,A,G,7,8
2,B,F,7,10
3,B,G,8,6
4,B,F,5,6


In [34]:
bb.groupby('team', group_keys = False).apply(lambda x: x.sample(2))

Unnamed: 0,team,position,assists,rebounds
1,A,G,7,8
0,A,G,5,11
6,B,C,6,6
3,B,G,8,6


In [40]:
N = 4
bb.groupby('team',group_keys=False).apply(lambda x: x.sample(int(np.rint(N*(len(x)/len(df)))))).sample(frac=1).reset_index(drop=True)

Unnamed: 0,team,position,assists,rebounds


In [41]:
N = 4
bb.groupby('team',group_keys=False).apply(lambda x: x.sample(int(np.rint(N*(len(x)/len(df))))))

Unnamed: 0,team,position,assists,rebounds


# Activity - Stratified Sampling

### Filtering out startups from select cities

In [78]:
df = pd.read_csv("startup_funding.csv", encoding = 'utf-8')

#handling wrong city names and foreign city names
df['CityLocation'].dropna(inplace=True) 
df['CityLocation']=df['CityLocation'].apply(lambda x: str(x).split('/')[0].strip()) 
df['CityLocation'] = df['CityLocation'].str.title()
df['CityLocation'].replace(["Delhi", "Gurgaon", "Gurugram", "Noida", "New Delhi"],"NCR",inplace=True)


#converting AmountInUSD to int after removing commas
df.AmountInUSD.fillna("0", inplace = True)
df.AmountInUSD = df.AmountInUSD.apply(lambda amount: int(str(amount).replace(",", "")))

In [79]:
#city columns and funding column
df2 = df.loc[:,["StartupName", "CityLocation", "AmountInUSD"]]
#Major Cities only
mc = df2[df2.CityLocation.isin(["Bangalore", "NCR", "Mumbai", "Pune", "Hyderabad"])]
mc.reset_index(drop = True, inplace = True)
mc.head()

Unnamed: 0,StartupName,CityLocation,AmountInUSD
0,TouchKin,Bangalore,1300000
1,Ethinos,Mumbai,0
2,Leverage Edu,NCR,0
3,Zepo,Mumbai,500000
4,Click2Clinic,Hyderabad,850000


In [80]:
mc.CityLocation.value_counts()

NCR          709
Bangalore    635
Mumbai       449
Pune          91
Hyderabad     77
Name: CityLocation, dtype: int64

#### Statified Sampling  - sampling 20 startups from each city/region

In [88]:
sample = mc.groupby('CityLocation', group_keys = False).apply(lambda x: x.sample(20)).reset_index(drop = True)
sample

Unnamed: 0,StartupName,CityLocation,AmountInUSD
0,Baby Berry,Bangalore,1000000
1,PitStop,Bangalore,0
2,Zenify.in,Bangalore,640000
3,Zapyle,Bangalore,0
4,LoanCircle,Bangalore,0
...,...,...,...
95,ElastiRun,Pune,7000000
96,Yellowdig,Pune,650000
97,SparesHub,Pune,460000
98,SERV’D,Pune,100000


In [89]:
sample.CityLocation.value_counts()

Bangalore    20
Hyderabad    20
Mumbai       20
NCR          20
Pune         20
Name: CityLocation, dtype: int64

# Activity - Proportional Stratified Sampling

In [2]:
df = pd.read_csv("startup_funding.csv", encoding = 'utf-8')

#handling wrong city names and foreign city names
df['CityLocation'].dropna(inplace=True) 
df['CityLocation']=df['CityLocation'].apply(lambda x: str(x).split('/')[0].strip()) 
df['CityLocation'] = df['CityLocation'].str.title()
df['CityLocation'].replace(["Delhi", "Gurgaon", "Gurugram", "Noida", "New Delhi"],"NCR",inplace=True)


#converting AmountInUSD to int after removing commas
df.AmountInUSD.fillna("0", inplace = True)
df.AmountInUSD = df.AmountInUSD.apply(lambda amount: int(str(amount).replace(",", "")))

In [3]:
#city columns and funding column
df2 = df.loc[:,["StartupName", "CityLocation", "AmountInUSD"]]
#Major Cities only
mc = df2[df2.CityLocation.isin(["Bangalore", "NCR", "Mumbai", "Pune", "Hyderabad"])]
mc.reset_index(drop = True, inplace = True)
mc.head()

Unnamed: 0,StartupName,CityLocation,AmountInUSD
0,TouchKin,Bangalore,1300000
1,Ethinos,Mumbai,0
2,Leverage Edu,NCR,0
3,Zepo,Mumbai,500000
4,Click2Clinic,Hyderabad,850000


In [4]:
mc.CityLocation.value_counts()

NCR          709
Bangalore    635
Mumbai       449
Pune          91
Hyderabad     77
Name: CityLocation, dtype: int64

In [6]:
#total number of samples
N = 100
prop_sample_1 = mc.groupby('CityLocation', group_keys=False).apply(lambda x: x.sample(int(np.rint(N*len(x)/len(mc))))).sample(frac=1).reset_index(drop=True)
prop_sample_1

Unnamed: 0,StartupName,CityLocation,AmountInUSD
0,underDOGS,Mumbai,0
1,SeeDoc,NCR,1000000
2,Snapbizz,Bangalore,0
3,Raw Pressery,Mumbai,0
4,Hopping Chef,Mumbai,470000
...,...,...,...
95,Perfios,Bangalore,6100000
96,Medzin,NCR,45000
97,Lucideus,NCR,0
98,Log 9 Materials,Bangalore,0


In [8]:
#proportion of sample size:total population
f = 100/len(mc)
prop_sample_2 = mc.groupby("CityLocation", group_keys = False).apply(lambda x: x.sample(frac = f)).reset_index(drop = True)
prop_sample_2

Unnamed: 0,StartupName,CityLocation,AmountInUSD
0,Flipkart,Bangalore,1400000000
1,TouchKin,Bangalore,1300000
2,Runnr,Bangalore,7000000
3,AppBrowzer,Bangalore,500000
4,RentMojo,Bangalore,5000000
...,...,...,...
95,Fabogo,Pune,2250000
96,Anchanto,Pune,0
97,Uniken,Pune,2000000
98,MindTickle,Pune,12500000


In [9]:
prop_sample_2.CityLocation.value_counts()

NCR          36
Bangalore    32
Mumbai       23
Pune          5
Hyderabad     4
Name: CityLocation, dtype: int64

In [101]:
np.arange(1,11)

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [102]:
np.repeat(np.arange(1,11), 20)

array([ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
        2,  2,  2,  2,  2,  2,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
        3,  3,  3,  3,  3,  3,  3,  3,  3,  4,  4,  4,  4,  4,  4,  4,  4,
        4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  5,  5,  5,  5,  5,
        5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  6,  6,
        6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
        6,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
        7,  7,  7,  7,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
        8,  8,  8,  8,  8,  8,  8,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
        9,  9,  9,  9,  9,  9,  9,  9,  9,  9, 10, 10, 10, 10, 10, 10, 10,
       10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10])