In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import json

def preprocess(df):
    """Turn the raw project table into scaled model inputs.

    Rows are split on the boolean ``evaluation_set`` column.  Per-category
    and per-country success rates are computed from the training rows only
    and attached to both splits as features.  The min-max scaler is fitted
    on training rows whose USD goal lies strictly between the outlier
    bounds, then applied to every training and evaluation row (so scaled
    values outside ``[0, 1]`` are possible).

    Args:
        df: DataFrame holding at least the columns ``goal``, ``country``,
            ``launched_at``, ``static_usd_rate``, ``creator``, ``category``,
            ``state`` and ``evaluation_set``.  ``creator`` and ``category``
            hold JSON object strings; ``state`` uses 1.0 for success and
            0.0 for failure.  ``launched_at`` is passed to the scaler
            unchanged, so it is presumably numeric -- verify against the
            data source.

    Returns:
        ``(X, y, X_eval)``: scaled training features, training labels and
        scaled evaluation features.  All three keep the row labels of
        ``df``, so ``X`` and ``y`` are aligned by index as well as by
        position.

    Raises:
        KeyError: if a required column is missing.
    """
    GOAL_LOWER_BOUND = 2.0e2
    GOAL_UPPER_BOUND = 3.0e4

    def train_eval_split(df):
        """Return ``(train_rows, eval_rows)`` without the split marker."""
        is_eval = df['evaluation_set']
        # Drop the marker by name instead of assuming it is the last column.
        rows = df.drop(columns='evaluation_set')
        return rows.loc[~is_eval].copy(), rows.loc[is_eval].copy()

    def category_name(raw):
        """Extract the ``name`` field from a JSON-encoded category."""
        return json.loads(raw)['name']

    def success_rate(outcomes):
        """Return successes / (successes + failures), or None if undecided."""
        successes = int((outcomes == 1.0).sum())
        failures = int((outcomes == 0.0).sum())
        decided = successes + failures
        return successes / decided if decided else None

    def get_success_rates(df, column, options=None, success_column='state'):
        """Map each value of ``column`` to its observed success rate.

        Options without any 0/1 outcome are left out of the result rather
        than triggering a division by zero.
        """
        if options is None:
            # NaN never compares equal to itself, so it cannot be matched.
            options = df[column].dropna().unique()

        rates = {}
        for option in options:
            rate = success_rate(df.loc[df[column] == option, success_column])
            if rate is not None:
                rates[option] = rate
        return rates

    def insert_success_rates(df, column, success_rates, drop_column=False,
                             default=None):
        """Add ``<column>_success_rate``; unknown values get ``default``."""
        out = df.copy()
        rate_column = "{}_success_rate".format(column)
        out[rate_column] = out[column].map(success_rates).astype(float)
        if default is not None:
            out[rate_column] = out[rate_column].fillna(default)
        if drop_column:
            out = out.drop(columns=column)
        return out

    def feature_engineer(df, category_rates, country_rates, default_rate):
        """Build the unscaled feature frame and the label series."""
        columns_to_keep = [
            'goal',
            'country',
            'launched_at',
            'static_usd_rate',
            'creator',
            'category',
            'state',
        ]
        # Explicit copy so the assignments below never hit a view.
        out = df[columns_to_keep].copy()

        out['goal_usd'] = out['goal'] * out['static_usd_rate']
        out = out.drop(columns=['static_usd_rate', 'goal'])

        out['has_slug'] = out['creator'].apply(
            lambda creator: float('slug' in json.loads(creator))
        )
        out = out.drop(columns='creator')

        out['category'] = out['category'].apply(category_name)
        out = insert_success_rates(
            out, 'category', category_rates,
            drop_column=True, default=default_rate,
        )
        out = insert_success_rates(
            out, 'country', country_rates,
            drop_column=True, default=default_rate,
        )

        y = out.pop('state')
        return out, y

    X_train, X_eval = train_eval_split(df)

    # Success rates come from the training rows only.
    rate_source = X_train[['country', 'category', 'state']].copy()
    rate_source['category'] = rate_source['category'].apply(category_name)
    country_success_rates = get_success_rates(rate_source, 'country')
    category_success_rates = get_success_rates(rate_source, 'category')
    # Fallback for categories/countries that only occur in the evaluation
    # rows; without it they would become NaN features.
    overall_success_rate = success_rate(rate_source['state'])

    train_features, y = feature_engineer(
        X_train, category_success_rates, country_success_rates,
        overall_success_rate,
    )
    eval_features, _ = feature_engineer(
        X_eval, category_success_rates, country_success_rates,
        overall_success_rate,
    )

    # Fit the scaler without goal outliers so they do not stretch the range.
    goal_usd = train_features['goal_usd']
    in_range = (goal_usd > GOAL_LOWER_BOUND) & (goal_usd < GOAL_UPPER_BOUND)
    scaler = MinMaxScaler().fit(train_features.loc[in_range])

    def scale(features):
        """Apply the fitted scaler, keeping column names and row labels."""
        return pd.DataFrame(
            scaler.transform(features),
            columns=features.columns,
            index=features.index,
        )

    return scale(train_features), y, scale(eval_features)

def train(X, y):
    """Fit a logistic regression with default settings and return it.

    Args:
        X: training features.
        y: training labels, one per row of ``X``.

    Returns:
        The fitted ``LogisticRegression`` estimator.
    """
    # ``fit`` returns the estimator itself, so the call can be chained.
    return LogisticRegression().fit(X, y)

def predict(model, X_eval):
    """Return the model's predictions for the evaluation features.

    Args:
        model: any object exposing a ``predict(X)`` method.
        X_eval: features to predict on.

    Returns:
        Whatever ``model.predict`` returns for ``X_eval``.
    """
    return model.predict(X_eval)


In [6]:
import numpy as np

# Load the raw data and build the model inputs.
df = pd.read_csv("data/kickstarter.csv")
X, y, X_eval = preprocess(df)

# Fit on the training rows and report accuracy on those same rows; this is
# a training score, not a held-out estimate.
model = train(X, y)
train_accuracy = model.score(X, y)
print(train_accuracy)

# Predict the evaluation rows and show how many fall into each class.
y_pred = predict(model, X_eval)
class_counts = np.unique(y_pred, return_counts=True)
print(class_counts)

0.71148
(array([0., 1.]), array([5664, 4336]))
