In [1]:
import lux
import pandas as pd
from datetime import datetime
from currency_converter import CurrencyConverter

In [2]:
# Exchange-rate helper; the preprocessing cell uses it to convert goals to USD.
cc = CurrencyConverter()

# Timestamp layout expected in the CSV's date columns.
DATE_FORMAT = '%Y-%m-%d %H:%M:%S'


def dateparse(x):
    """Parse one timestamp string laid out as DATE_FORMAT into a ``datetime``."""
    return datetime.strptime(x, DATE_FORMAT)


# Load the projects table, parsing both date columns with `dateparse`.
# NOTE(review): `date_parser` is deprecated in recent pandas releases in favour
# of `date_format` -- confirm which pandas version this notebook targets.
df = pd.read_csv(
    "./data/projects.csv",
    encoding='Windows-1252',
    na_values=[None],
    parse_dates=['start_date', 'end_date'],
    date_parser=dateparse,
)

In [3]:
# Numeric columns; the preprocessing cell fills their gaps with the rounded median.
numeric_features = [
    "age",
    "goal",
    "elapsed_days",
]

# Categorical columns; the preprocessing cell fills their gaps with the mode.
categorical_features = [
    "category",
    "subcategory",
    "country",
    "sex",
]

In [4]:

# Preprocessing pipeline for the projects frame:
#   1. drop unusable outcome states and merge "canceled" into "failed"
#   2. impute missing countries from the currency code
#   3. downsample so both outcome classes have the same number of rows
#   4. derive the campaign duration in days and drop campaigns over a year
#   5. convert goals to USD, drop identifier/date/currency columns
#   6. impute remaining gaps (median for numeric, mode for categorical)
# NOTE: this cell rebinds `df` and drops columns it reads (e.g. `currency`),
# so re-run the loading cell before executing it a second time.

# Fixed seed so the downsampled subset is identical on every run.
RANDOM_SEED = 42

# Remove useless states and merge others
excluded_states = ["LIVE", "SUSPENDED", "UNDEFINED"]
is_excluded = df["state"].str.upper().isin(excluded_states)
# `.copy()` gives an independent frame so the column assignments below do not
# write into a slice of the original.
df = df.loc[~is_excluded].copy()
df["state"] = df["state"].replace("canceled", "failed")

# Impute missing country values with the currencies: the first two letters of
# the currency code stand in for the country (e.g. "GBP" -> "GB"). EUR is left
# missing because it does not identify a single country. `.str[:2]` keeps
# missing currencies as missing instead of raising.
country_from_currency = df["currency"].str[:2].where(df["currency"] != "EUR")
# Assign back rather than calling fillna(inplace=True) on `df.country`, which
# may operate on a temporary and leave `df` unchanged.
df["country"] = df["country"].fillna(country_from_currency)

# Downsampling the data: keep the same number of rows for each outcome class.
# The class size is the smaller of the two kept classes (not of every state
# that might still be present in the frame).
target_states = ["failed", "successful"]
min_occurences = int(
    df.loc[df["state"].isin(target_states), "state"].value_counts().min()
)
df = pd.concat([
    df[df["state"] == label].sample(n=min_occurences, random_state=RANDOM_SEED)
    for label in target_states
])

# Get elapsed time in days (vectorised datetime subtraction)
df["elapsed_days"] = (df["end_date"] - df["start_date"]).dt.days

# Remove elements with more than 365 days (rows with an unknown duration fail
# the comparison and are removed as well)
df = df.loc[df["elapsed_days"] <= 365].copy()

# Normalize Currency: look up one USD rate per distinct currency instead of
# calling the converter once per row, then scale the goals.
usd_rates = {code: cc.convert(1, code, "USD") for code in df["currency"].unique()}
df["goal"] = df["goal"].astype(float) * df["currency"].map(usd_rates)

# Columns no longer needed once the derived features exist
df = df.drop(columns=["currency", "id", "name", "start_date", "end_date"])

# Fill missing numerical values. The median is computed on the numeric feature
# columns only; the frame still contains string columns, for which a median is
# undefined.
numeric_medians = df[numeric_features].median().round(1)
df[numeric_features] = df[numeric_features].fillna(numeric_medians)

# Fill missing categorical values with the most frequent value of the column
for column in categorical_features:
    modes = df[column].mode()
    # A column with no observed values has no mode; leave it untouched.
    if not modes.empty:
        df[column] = df[column].fillna(modes.iloc[0])



In [6]:
# Optional (left disabled): presumably exports the Lux view to an HTML file.
# df.save_as_html('crowdfunding.html')
# Show the preprocessed frame as the cell output.
df

Button(description='Toggle Pandas/Lux', layout=Layout(top='5px', width='140px'), style=ButtonStyle())

Output()