Code based on: https://www.kaggle.com/competitions/kdd-cup-2014-predicting-excitement-at-donors-choose/discussion/9347

In [None]:
import pandas as pd
import numpy as np

from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

#Helper functions
def diff(a, b):
    """Return the items of `a` that are not present in `b`.

    The order of `a` is preserved and repeated items of `a` are kept.
    `b` is converted to a set for fast membership tests, so its items
    must be hashable.
    """
    excluded = set(b)
    kept = []
    for item in a:
        if item not in excluded:
            kept.append(item)
    return kept

INPUT = "input"

# Load the CSV files this cell needs.
projects = pd.read_csv(f'{INPUT}/projects.csv.zip')
outcomes = pd.read_csv(f'{INPUT}/outcomes.csv.zip')
sample = pd.read_csv(f'{INPUT}/sampleSubmission.csv.zip')

print('Read data files.')

# Sort every table by project ID so that rows of different tables can be
# paired up by position further down.
projects = projects.sort_values('projectid')
sample = sample.sort_values('projectid')
outcomes = outcomes.sort_values('projectid')

# Row positions of the training / test projects. The comparison is a plain
# string comparison, so it assumes date_posted holds ISO 'YYYY-MM-DD' strings.
dates = np.array(projects.date_posted)
train_idx = np.where(dates < '2014-01-01')[0]
test_idx = np.where(dates >= '2014-01-01')[0]

# The labels are matched to the training rows purely by position. Fail loudly
# if the two tables do not describe the same projects in the same order,
# instead of silently training on misaligned labels.
if not np.array_equal(projects['projectid'].to_numpy()[train_idx],
                      outcomes['projectid'].to_numpy()):
    raise ValueError('outcomes rows do not line up with the training rows of projects '
                     '(project IDs differ); labels cannot be matched by position.')

# Fill missing values by carrying the previous row's value forward.
# This is a naive imputation; `.ffill()` replaces the deprecated
# `fillna(method='pad')` spelling and does the same thing.
projects = projects.ffill()

# Target labels as read from the file ('t' / 'f' strings); the model cells
# compare them against 't'.
labels = np.array(outcomes.is_exciting)

# Column groups: everything that is neither an ID, a numeric feature nor the
# posting date is treated as categorical.
projects_numeric_columns = ['school_latitude', 'school_longitude',
                            'fulfillment_labor_materials',
                            'total_price_excluding_optional_support',
                            'total_price_including_optional_support']

projects_id_columns = ['projectid', 'teacher_acctid', 'schoolid', 'school_ncesid']
projects_categorial_columns = diff(
    diff(diff(list(projects.columns), projects_id_columns), projects_numeric_columns),
    ['date_posted'])

projects_categorial_values = np.array(projects[projects_categorial_columns])

# Integer-encode each categorical column with its own LabelEncoder, then stack
# the encoded columns once (re-stacking inside the loop copies the whole matrix
# on every iteration).
encoded_columns = [
    LabelEncoder().fit_transform(projects_categorial_values[:, i])
    for i in range(projects_categorial_values.shape[1])
]
projects_data = np.column_stack(encoded_columns).astype(float)

# One-hot encode the integer codes.
enc = OneHotEncoder()
projects_data = enc.fit_transform(projects_data)

# Numeric copy of the target: 'f' -> 0, anything else -> 1.
outcomes["is_exciting"] = np.where(outcomes["is_exciting"] == "f", 0, 1)

# Feature matrices for the model cells.
train = projects_data[train_idx]
test = projects_data[test_idx]
y = outcomes["is_exciting"]

In [None]:
# Fit each classifier on the one-hot project features and write its test-set
# probabilities to its own submission file (logistic regression first, then
# LightGBM, so `clf` / `preds` end up holding the LightGBM results).
for clf, out_path in (
    (LogisticRegression(), 'hack_lr_predictions.csv'),
    (LGBMClassifier(max_depth=85, num_leaves=500), 'hack_lgbm_predictions.csv'),
):
    clf.fit(train, labels == 't')
    # The target is boolean, so column 1 is the probability of the True class.
    preds = clf.predict_proba(test)[:, 1]

    # Save prediction into a file
    sample['is_exciting'] = preds
    sample.to_csv(out_path, index=False)

# Blend the two saved submissions: 75% LightGBM, 25% logistic regression.
lr = pd.read_csv("hack_lr_predictions.csv")
lgbm = pd.read_csv("hack_lgbm_predictions.csv")
ensemble = lgbm.assign(
    is_exciting=lgbm["is_exciting"] * 0.75 + lr["is_exciting"] * 0.25
)
ensemble.to_csv("hack_ensemble(0.75, 0.25).csv", index=False)

Code based on: https://www.kaggle.com/competitions/kdd-cup-2014-predicting-excitement-at-donors-choose/discussion/8228

In [None]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer


FOLDER = "input"

# Load the tables this model needs.
projects = pd.read_csv(f'{FOLDER}/projects.csv.zip')
outcomes = pd.read_csv(f'{FOLDER}/outcomes.csv.zip')
sample = pd.read_csv(f'{FOLDER}/sampleSubmission.csv.zip')
essays = pd.read_csv(f'{FOLDER}/essays.csv.zip')

# Sort everything by project ID so rows of different tables can be paired up
# by position.
essays = essays.sort_values('projectid')
projects = projects.sort_values('projectid')
sample = sample.sort_values('projectid')
outcomes = outcomes.sort_values('projectid')

# One row per project with its essay text and its project metadata.
ess_proj = pd.merge(essays, projects, on='projectid')
ess_proj['essay'] = ess_proj['essay'].fillna("no_text")

# Target labels as read from the file ('t' / 'f' strings).
labels = outcomes['is_exciting'].to_numpy()

# Split on the posting date using the named column instead of converting the
# whole merged frame to an object array and relying on column positions.
# The comparison is a plain string comparison, so it assumes date_posted holds
# ISO 'YYYY-MM-DD' strings.
date_posted = ess_proj['date_posted']
train_idx = np.flatnonzero((date_posted < '2014-01-01').to_numpy())
test_idx = np.flatnonzero((date_posted >= '2014-01-01').to_numpy())

traindata = ess_proj.iloc[train_idx]
testdata = ess_proj.iloc[test_idx]

# The labels are matched to the training rows purely by position. Fail loudly
# if the two tables do not describe the same projects in the same order,
# instead of silently training on misaligned labels.
if not np.array_equal(traindata['projectid'].to_numpy(),
                      outcomes['projectid'].to_numpy()):
    raise ValueError('outcomes rows do not line up with the training rows of the '
                     'essay/project table (project IDs differ); labels cannot be '
                     'matched by position.')

# TF-IDF features of the essay text; the vocabulary is learned from the
# training essays only.
tfidf = TfidfVectorizer(min_df=3, max_features=1000)
tr = tfidf.fit_transform(traindata['essay'])
ts = tfidf.transform(testdata['essay'])

# LightGBM on the TF-IDF features.
model = LGBMClassifier()
model.fit(tr, labels == 't')
# The target is boolean, so column 1 is the probability of the True class.
preds = model.predict_proba(ts)[:, 1]

sample['is_exciting'] = preds
sample.to_csv('abhi_lgbm_predictions.csv', index=False)

# Logistic regression on the same features. Referenced through the
# `linear_model` module imported by this cell, so the cell does not depend on
# an import made in an earlier cell.
model = linear_model.LogisticRegression()
model.fit(tr, labels == 't')
preds = model.predict_proba(ts)[:, 1]

sample['is_exciting'] = preds
sample.to_csv('abhi_lr_predictions.csv', index=False)

# Blend the two saved submissions with equal weights.
lgbm = pd.read_csv("abhi_lgbm_predictions.csv")
lr = pd.read_csv("abhi_lr_predictions.csv")

ensemble_abhi = lgbm.copy()
ensemble_abhi["is_exciting"] = lgbm["is_exciting"] * 0.5 + lr["is_exciting"] * 0.5
ensemble_abhi.to_csv('ensemble_abhi(0.5, 0.5).csv', index=False)


# Final ensemble

In [None]:
# Read the two intermediate blends written by the earlier cells.
ensemble_hack = pd.read_csv("hack_ensemble(0.75, 0.25).csv")
ensemble_abhi = pd.read_csv("ensemble_abhi(0.5, 0.5).csv")

# Final prediction: 75% project-feature blend + 25% essay-text blend.
blended_score = ensemble_hack["is_exciting"] * 0.75 + ensemble_abhi["is_exciting"] * 0.25
ensemble = ensemble_hack.assign(is_exciting=blended_score)
ensemble.to_csv("ensemble(0.75, 0.25).csv", index=False)