Start by importing the libraries used throughout the notebook, listing the available input files, and fixing the random seed.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python

import os

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# LightGBM is the model used in this fork
import lightgbm as gbm

# used to carve a hold-out set out of the training data
from sklearn.model_selection import train_test_split

# joblib is used to save the trained model.  scikit-learn no longer ships the
# vendored copy (sklearn.externals.joblib) from version 0.23 on, so prefer the
# standalone package and only fall back to the vendored one on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Input data files are available in the "../input/" directory; list them so the
# file names used by the later cells can be checked.
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

# set seed for reproducibility
np.random.seed(0)

Read in the training data and the resources data.  We don't actually need the test set until later, so it is read in the prediction step.

In [None]:
# Load the two CSV files the training pipeline is built from: the labelled
# projects and the resources table that is merged onto them by id.
train_path = "../input/train.csv"
resources_path = "../input/resources.csv"

train_data = pd.read_csv(train_path)
resources_data = pd.read_csv(resources_path)

Let us expand train_data a bit.  In particular, we want to get the month from the project_submitted_datetime field

In [None]:
# Parse the submission timestamps once, then keep only the calendar month
# (1-12) as a new column on the training frame.
submitted_at = pd.DatetimeIndex(train_data['project_submitted_datetime'])
train_data['month'] = submitted_at.month

# Show five randomly chosen values as a quick sanity check.
train_data['month'].sample(5)

Now let us combine the train and resources data sets into one

In [None]:
# Attach the resources to the training projects.  A project id can match
# several resources rows, so the merged frame can hold several rows per id.
full_combined = train_data.merge(resources_data, on='id', how='inner', sort=False)

# Total price and quantity over all resources rows of each project, indexed by
# project id so the totals can be looked up per row below.
resource_totals = full_combined.groupby('id')[['price', 'quantity']].sum()

# We cannot have multiple rows per project: keep the first row of every id and
# replace its price/quantity with the per-project totals.
# The totals are mapped through the id column on purpose: the groupby result is
# not indexed like `combined`, so a plain column assignment would line the
# values up by row label and attach them to the wrong projects, and
# `combined.loc['price'] = ...` would address a *row* labelled 'price' rather
# than the price column.
# TODO: also combine the resource descriptions into a single string column.
combined = (
    full_combined
    .drop_duplicates(subset='id', keep='first')
    .dropna(subset=['id'])
)
combined['price'] = combined['id'].map(resource_totals['price'])
combined['quantity'] = combined['id'].map(resource_totals['quantity'])

Take a look at our resources data

In [None]:
# Preview the first five rows of the resources table.
resources_data.head(n=5)

Take a look at our combined data to make sure we did it right

In [None]:
# Preview the first five rows of the merged, one-row-per-project frame.
combined.head(n=5)

Are there any missing data points in our data set?

In [None]:
# Count the missing entries in every column of the combined frame.
missing_values_count = combined.isna().sum()

# List all columns, the ones with the most missing entries first.
missing_values_count.sort_values(ascending=False)

In [None]:
# --- Fill in missing values ---------------------------------------------------
# Fill through the frame and rebind it: calling fillna(inplace=True) on a
# selected column can act on a temporary object and leave `combined` unchanged.
combined = combined.fillna({
    'teacher_number_of_previously_posted_projects': 0.0,
    'school_state': 'UNK',
    'price': 0.0,
    'quantity': 0.0,
})

# --- Scale the continuous columns ---------------------------------------------
# The min/max values stay in module-level names because the prediction cell
# re-uses them to scale the test data with the training statistics.
# NOTE(review): the denominator is (min + max), not the (max - min) of textbook
# min-max scaling; the two only agree when min is 0.  It is kept as is so the
# training and prediction cells scale identically -- change both together.
# NOTE: the columns of `combined` are overwritten, so re-running this cell
# without re-running the merge cell scales already-scaled values.

# scale previous posted projects
min_pp = combined['teacher_number_of_previously_posted_projects'].min()
max_pp = combined['teacher_number_of_previously_posted_projects'].max()
combined['teacher_number_of_previously_posted_projects'] = (combined['teacher_number_of_previously_posted_projects'] - min_pp) / (min_pp + max_pp)

# scale price
min_price = combined['price'].min()
max_price = combined['price'].max()
combined['price'] = (combined['price'] - min_price) / (min_price + max_price)

# scale quantity
min_quantity = combined['quantity'].min()
max_quantity = combined['quantity'].max()
combined['quantity'] = (combined['quantity'] - min_quantity) / (min_quantity + max_quantity)

# --- One-hot encode the categorical columns -----------------------------------
# school state and month are categorical, the others are continuous.
state_dummies = pd.get_dummies(combined['school_state'])
month_dummies = pd.get_dummies(combined['month'])
train_data_exp = combined.join(state_dummies)
train_data_expanded = train_data_exp.join(month_dummies)

print(train_data_expanded.columns[:])

# --- Build the feature matrix and the target ----------------------------------
# Select the dummy columns by name rather than by position, so the feature set
# does not silently shift when the input files gain, lose or reorder columns.
# price and quantity are prepared above but are not used as features yet.
feature_columns = (
    list(state_dummies.columns)
    + list(month_dummies.columns)
    + ['teacher_number_of_previously_posted_projects']
)
X = train_data_expanded[feature_columns]
y = train_data_expanded['project_is_approved']

# --- LightGBM -----------------------------------------------------------------
params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'max_depth': 14,
        'num_leaves': 31,
        'learning_rate': 0.025,
        'feature_fraction': 0.85,
        'bagging_fraction': 0.85,
        'bagging_freq': 5,
        'verbose': 0,
        'num_threads': 4,
        'lambda_l2': 1.0,
        'min_gain_to_split': 0
}

# Hold out 20% of the training rows; they are only used as the validation set
# that drives early stopping (the competition test set is handled separately).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

train_set = gbm.Dataset(X_train, y_train)
valid_set = gbm.Dataset(X_test, y_test)

# Early stopping is passed as a callback: the `early_stopping_rounds` keyword
# of lightgbm.train was removed in LightGBM 4, the callback works before and after.
lgb = gbm.train(params,
                train_set,
                num_boost_round=10000,
                valid_sets=[valid_set],
                callbacks=[gbm.early_stopping(stopping_rounds=100)])

# save the model for later use
joblib.dump(lgb, 'gbm_model_1.pkl')

And now let us do the prediction since training really does not take that long.

In [1]:
# read in test and submission data
test_data = pd.read_csv("../input/test.csv")
subm = pd.read_csv('../input/sample_submission.csv')

# extract month
test_data['month'] = pd.DatetimeIndex(test_data['project_submitted_datetime']).month

# merge in supplemental information from resources data file
# (a project id can match several resources rows)
full_combined_test = test_data.merge(resources_data, on='id', how='inner', sort=False)

# Total price and quantity over all resources rows of each project, indexed by id.
resource_totals_test = full_combined_test.groupby('id')[['price', 'quantity']].sum()

# We cannot have multiple rows per project: keep the first row of every id and
# replace its price/quantity with the per-project totals.  The totals are mapped
# through the id column because the groupby result is not indexed like
# `combined_test`; assigning it directly would align by row label and attach the
# totals to the wrong projects.
# TODO: also combine the resource descriptions into a single string column.
combined_test = (
    full_combined_test
    .drop_duplicates(subset='id', keep='first')
    .dropna(subset=['id'])
)
combined_test['price'] = combined_test['id'].map(resource_totals_test['price'])
combined_test['quantity'] = combined_test['id'].map(resource_totals_test['quantity'])

# fill in NAs
# (through the frame: fillna(inplace=True) on a selected column can act on a
# temporary object and leave the frame unchanged)
combined_test = combined_test.fillna({
    'teacher_number_of_previously_posted_projects': 0.0,
    'school_state': 'UNK',
    'price': 0.0,
    'quantity': 0.0,
})

# Scale with the statistics computed on the training data (min_*/max_* come from
# the training cell) so both sets go through the same transformation.
# NOTE(review): the denominator is (min + max), not (max - min); it mirrors the
# training cell on purpose -- change both together.
combined_test['teacher_number_of_previously_posted_projects'] = (combined_test['teacher_number_of_previously_posted_projects'] - min_pp) / (min_pp + max_pp)
combined_test['price'] = (combined_test['price'] - min_price) / (min_price + max_price)
combined_test['quantity'] = (combined_test['quantity'] - min_quantity) / (min_quantity + max_quantity)

# school state and month are categorical, the others are continuous.
# The dummies must be built from `combined_test` itself: `test_data` carries a
# different row index, so joining its dummies would pair rows with the wrong
# project's state.
test_data_exp = combined_test.join(pd.get_dummies(combined_test['school_state']))
test_data_expanded = test_data_exp.join(pd.get_dummies(test_data_exp['month']))

# now construct the feature matrix for the LightGBM model
# Line the columns up with the features the trained booster reports, by name and
# in the same order, instead of slicing by position.  Column labels are turned
# into strings first (the month dummies are integers) to compare them with the
# booster's feature names; a category missing from the test data becomes an
# all-zero column.
test_features = test_data_expanded.rename(columns=str)
X_t = test_features.reindex(columns=lgb.feature_name(), fill_value=0)

pd.set_option('display.max_columns', 500)
X_t.head(18)

# Prediction and submission are still disabled.
# NOTE(review): the inner merge above drops any test project without a
# resources row; confirm every id in `subm` is covered before submitting.
#y_t = lgb.predict(X_t, num_iteration=lgb.best_iteration)
#submission = pd.DataFrame({'id': combined_test['id'].values, 'project_is_approved': y_t})
#submission.to_csv('submission.csv', index=False)