In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [2]:
# Load the crowdfunding dataset from its hosted CSV and preview the first rows
df = pd.read_csv(
    filepath_or_buffer="https://static.bc-edx.com/ai/ail-v-1-0/m14/datasets/crowdfunding-data.csv"
)
df.head()

Unnamed: 0,goal,pledged,backers_count,country,staff_pick,spotlight,category,days_active,outcome
0,100,0,0,3,0,0,0,17,0
1,1400,14560,158,0,0,1,1,27,1
2,108400,142523,1425,4,0,0,2,20,1
3,4200,2477,24,0,0,0,1,40,0
4,7600,5265,53,0,0,0,3,4,0


In [5]:
# Derive "pledged_per_backer": the amount pledged divided by the number of
# backers, computed element-wise for every campaign
df['pledged_per_backer'] = df['pledged'].div(df['backers_count'])
df

Unnamed: 0,goal,pledged,backers_count,country,staff_pick,spotlight,category,days_active,outcome,pledged_per_backer
0,100,0,0,3,0,0,0,17,0,
1,1400,14560,158,0,0,1,1,27,1,92.151899
2,108400,142523,1425,4,0,0,2,20,1,100.016140
3,4200,2477,24,0,0,0,1,40,0,103.208333
4,7600,5265,53,0,0,0,3,4,0,99.339623
...,...,...,...,...,...,...,...,...,...,...
1124,17130,15894,847,2,0,0,5,6,0,18.765053
1125,97329,80937,862,6,0,0,3,29,0,93.894432
1126,53597,40388,58,0,0,0,9,46,0,696.344828
1127,71588,18102,274,0,0,0,2,43,0,66.065693


In [6]:
# Fill the missing values with zeros.
# Dividing by a zero backers_count gives NaN for 0/0 but +/-inf for a non-zero
# pledged amount, and fillna() leaves inf untouched. Map the infinities to NaN
# first so that every undefined ratio ends up as 0.
df['pledged_per_backer'] = (
    df['pledged_per_backer']
    .replace([float('inf'), float('-inf')], float('nan'))
    .fillna(0)
)
df.head()

Unnamed: 0,goal,pledged,backers_count,country,staff_pick,spotlight,category,days_active,outcome,pledged_per_backer
0,100,0,0,3,0,0,0,17,0,0.0
1,1400,14560,158,0,0,1,1,27,1,92.151899
2,108400,142523,1425,4,0,0,2,20,1,100.01614
3,4200,2477,24,0,0,0,1,40,0,103.208333
4,7600,5265,53,0,0,0,3,4,0,99.339623


In [8]:
# Derive "backers_per_day": the number of backers divided by the number of
# days the campaign was active, computed element-wise
df['backers_per_day'] = df['backers_count'].div(df['days_active'])
df

Unnamed: 0,goal,pledged,backers_count,country,staff_pick,spotlight,category,days_active,outcome,pledged_per_backer,backers_per_day
0,100,0,0,3,0,0,0,17,0,0.000000,0.000000
1,1400,14560,158,0,0,1,1,27,1,92.151899,5.851852
2,108400,142523,1425,4,0,0,2,20,1,100.016140,71.250000
3,4200,2477,24,0,0,0,1,40,0,103.208333,0.600000
4,7600,5265,53,0,0,0,3,4,0,99.339623,13.250000
...,...,...,...,...,...,...,...,...,...,...,...
1124,17130,15894,847,2,0,0,5,6,0,18.765053,141.166667
1125,97329,80937,862,6,0,0,3,29,0,93.894432,29.724138
1126,53597,40388,58,0,0,0,9,46,0,696.344828,1.260870
1127,71588,18102,274,0,0,0,2,43,0,66.065693,6.372093


In [11]:
# Scratch check: math.inf is already a float, so the cell displays `inf`
import math
math.inf

inf

In [14]:
# Create a days_to_goal column

# Create a function to apply
def days_to_goal(row, unreachable_days=100000):
    """Estimate how many more days a campaign needs to reach its goal.

    The estimate is the amount still missing (goal - pledged) divided by the
    daily pledge rate, taken as pledged_per_backer * backers_per_day.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by column name (for example a DataFrame row passed
        by ``df.apply(..., axis=1)``). It must provide 'goal', 'pledged',
        'pledged_per_backer' and 'backers_per_day'.
    unreachable_days : number, default 100000
        Sentinel returned when there is no usable daily pledge rate.

    Returns
    -------
    number
        0 when the goal is already met (pledged >= goal),
        ``unreachable_days`` when the daily rate is zero, negative or NaN,
        otherwise the remaining amount divided by the daily rate.
    """
    amount_remaining = row['goal'] - row['pledged']

    # Goal already met. Using <= (not <) so an exactly funded campaign is
    # reported as done even when its daily rate is zero.
    if amount_remaining <= 0:
        return 0

    pledged_per_day = row['pledged_per_backer'] * row['backers_per_day']

    # Note that we can't divide by zero. `not (x > 0)` is also True for NaN
    # (e.g. inf * 0 or an unfilled missing value) and for negative rates, so
    # all unusable rates map to the sentinel instead of leaking NaN.
    if not pledged_per_day > 0:
        return unreachable_days

    return amount_remaining / pledged_per_day

# Evaluate days_to_goal once per row and store the result as a new column
df['days_to_goal'] = df.apply(days_to_goal, axis='columns')

In [15]:
# Display the frame to inspect the derived columns, including days_to_goal
df

Unnamed: 0,goal,pledged,backers_count,country,staff_pick,spotlight,category,days_active,outcome,pledged_per_backer,backers_per_day,days_to_goal
0,100,0,0,3,0,0,0,17,0,0.000000,0.000000,100000.000000
1,1400,14560,158,0,0,1,1,27,1,92.151899,5.851852,0.000000
2,108400,142523,1425,4,0,0,2,20,1,100.016140,71.250000,0.000000
3,4200,2477,24,0,0,0,1,40,0,103.208333,0.600000,27.823981
4,7600,5265,53,0,0,0,3,4,0,99.339623,13.250000,1.773979
...,...,...,...,...,...,...,...,...,...,...,...,...
1124,17130,15894,847,2,0,0,5,6,0,18.765053,141.166667,0.466591
1125,97329,80937,862,6,0,0,3,29,0,93.894432,29.724138,5.873309
1126,53597,40388,58,0,0,0,9,46,0,696.344828,1.260870,15.044419
1127,71588,18102,274,0,0,0,2,43,0,66.065693,6.372093,127.052149
