In [24]:
import pandas as pd 
import re
import pylab as pl
import gc
import xgboost as xgb
import lightgbm as lgb
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import roc_auc_score as auc
from sklearn.model_selection import StratifiedKFold
from collections import defaultdict, Counter
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from scipy.stats import pearsonr
from scipy.sparse import hstack
from multiprocessing import Pool


In [52]:
# Root folder of the competition files. A raw string is used because the
# original literals mixed "\\" with a bare "\D", which is an invalid escape
# sequence (a warning today, slated to become an error). The resulting paths
# are byte-for-byte the same as before.
# NOTE(review): this is still a machine-specific absolute path; point DATA_DIR
# at your own copy of the data.
DATA_DIR = r"C:\Users\sreek\Documents\vg_donors_choose"

train_raw_data = pd.read_csv(DATA_DIR + r"\train\train.csv")
# low_memory=False makes pandas read the test file in one pass instead of chunks.
test_raw_data = pd.read_csv(DATA_DIR + r"\test\test.csv", low_memory=False)
res_raw_data = pd.read_csv(DATA_DIR + r"\resources\resources.csv")


In [None]:
# Where 'project_essay_4' is NaN, swap 'project_essay_2' and 'project_essay_4' so that the second essay's text ends up in 'project_essay_4'.
# Then combine essays 1 & 2 and essays 3 & 4, so that every row has the same two-essay layout.

In [53]:
# Mark where each row came from so the stacked frame can be split again later:
# tr == 1 for training rows, ts == 1 for test rows.
train_raw_data['tr'] = 1
test_raw_data['tr'] = 0
train_raw_data['ts'] = 0
test_raw_data['ts'] = 1

# Stack train on top of test; the original row labels of both frames are kept.
combined_data = pd.concat((train_raw_data, test_raw_data))

# Rows without a fourth essay: swap essay 2 and essay 4, so essay 4 receives
# the old essay 2 text and essay 2 becomes NaN. The mask is computed once and
# reused on both sides of the assignment.
essay_4_missing = combined_data.project_essay_4.isnull()
combined_data.loc[essay_4_missing, ['project_essay_4', 'project_essay_2']] = \
    combined_data.loc[essay_4_missing, ['project_essay_2', 'project_essay_4']].values

# Missing essay 2 / essay 3 text becomes the empty string before joining.
combined_data[['project_essay_2', 'project_essay_3']] = \
    combined_data[['project_essay_2', 'project_essay_3']].fillna('')

# Pair the essays up: 1 + 2 -> project_essay_1, 3 + 4 -> project_essay_2.
# The columns are zipped directly instead of using DataFrame.apply(axis=1),
# which builds a Series for every row; the produced strings are identical
# (str() of each value, joined by one space). Assigning a list is positional,
# so the repeated row labels left by concat are not an issue.
combined_data['project_essay_1'] = [
    ' '.join((str(first), str(second)))
    for first, second in zip(combined_data['project_essay_1'],
                             combined_data['project_essay_2'])
]
combined_data['project_essay_2'] = [
    ' '.join((str(third), str(fourth)))
    for third, fourth in zip(combined_data['project_essay_3'],
                             combined_data['project_essay_4'])
]

combined_data = combined_data.drop(['project_essay_3', 'project_essay_4'], axis=1)

In [57]:
# Cost of each resource line, then one shared groupby handle over project id.
res_raw_data['Total'] = res_raw_data['quantity'] * res_raw_data['price']
by_project = res_raw_data.groupby('id')

# Basket-level sums per project.
# NOTE(review): 'items' is the count of non-null descriptions, so resource
# lines with a missing description are not counted -- confirm that is intended.
basket_sums = by_project.agg({'description': 'count',
                             'quantity': 'sum',
                             'price': 'sum',
                             'Total': 'sum'})
res_data = basket_sums.rename(columns={'description': 'items'})
res_data['avgPrice'] = res_data.Total / res_data.quantity
numFeatures = ['items', 'quantity', 'price', 'Total', 'avgPrice']

# min / max / mean of quantity, price and Total per project, stored as
# e.g. minQuantity, maxPrice, meanTotal (missing aggregates become 0).
line_columns = [('quantity', 'Quantity'), ('price', 'Price'), ('Total', 'Total')]
for stat in ('min', 'max', 'mean'):
    per_stat = by_project[[source for source, _ in line_columns]].agg(stat)
    stat_names = [stat + suffix for _, suffix in line_columns]
    per_stat.columns = stat_names
    res_data = res_data.join(per_stat.fillna(0))
    numFeatures.extend(stat_names)

# All descriptions of a project concatenated into one space-separated string.
joined_descriptions = by_project['description'].agg(
    lambda x: ' '.join(x.values.astype(str)))
res_data = res_data.join(joined_descriptions.to_frame('resource_description'))

# Attach the per-project resource features to every project row.
combined_data = combined_data.join(res_data, on='id')

# Bucket the total cost using the bin edges below.
price_bins = [0, 50, 100, 250, 500, 1000, pl.inf]
combined_data['price_category'] = pl.digitize(combined_data.Total, price_bins)
numFeatures.append('price_category')

# Spread (max - min) of quantity, price and Total within a project.
for suffix in ('Quantity', 'Price', 'Total'):
    spread_name = 'max{0}_min{0}'.format(suffix)
    combined_data[spread_name] = combined_data['max' + suffix] - combined_data['min' + suffix]
    numFeatures.append(spread_name)

# Drop the raw frames and every intermediate that still references them
# (the groupby handle keeps res_raw_data alive), then force a collection.
del basket_sums, per_stat, joined_descriptions, by_project
del res_data, train_raw_data, res_raw_data, test_raw_data
gc.collect()

3847

In [62]:
####### Statistical features #############
# Replace teacher ids by integer codes and flag every prefix that is not one
# of Ms. / Mrs. / Mr. (a missing prefix is flagged as well).
combined_data['teacher_id'] = LabelEncoder().fit_transform(combined_data['teacher_id'])
gendered_prefixes = ['Ms.', 'Mrs.', 'Mr.']
combined_data['teacher_gender_unknown'] = \
    (~combined_data.teacher_prefix.isin(gendered_prefixes)).astype('int64')
numFeatures.extend(['teacher_number_of_previously_posted_projects',
                    'teacher_id',
                    'teacher_gender_unknown'])

# Frequency encoding: for each column below, add '<column>_stat' holding the
# share of all rows (train and test together) that have the same value.
frequency_columns = ['school_state', 'teacher_id', 'teacher_prefix',
                     'teacher_gender_unknown', 'project_grade_category',
                     'project_subject_categories', 'project_subject_subcategories',
                     'teacher_number_of_previously_posted_projects']
statFeatures = []
for col in frequency_columns:
    stat_name = col + '_stat'
    rows_per_value = combined_data.groupby(col)['id'].count()
    share = (rows_per_value / rows_per_value.sum()).rename(stat_name)
    combined_data = combined_data.join(share, on=col)
    statFeatures.append(stat_name)

In [None]:
dateCol = 'project_submitted_datetime'
def getTimeFeatures(combined_data):
    """Add calendar features derived from the ``dateCol`` column.

    The frame is modified in place and also returned. Added columns:
    ``year``, ``month``, ``day``, ``dow`` (day of week, Monday = 0), ``hour``
    and ``days`` (whole days elapsed since the earliest timestamp in the
    column).

    The column is parsed with ``pd.to_datetime`` for the computation only, so
    it may hold datetimes or parseable strings; the column itself is left
    untouched.

    Parameters
    ----------
    combined_data : pandas.DataFrame
        Frame containing the ``dateCol`` column.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with the six columns added.
    """
    # Vectorised .dt accessors replace the per-element apply(lambda ...) calls.
    stamps = pd.to_datetime(combined_data[dateCol])
    combined_data['year'] = stamps.dt.year
    combined_data['month'] = stamps.dt.month
    combined_data['day'] = stamps.dt.day
    combined_data['dow'] = stamps.dt.dayofweek
    combined_data['hour'] = stamps.dt.hour
    # Timedelta.days truncates to whole days, same as the original lambda.
    combined_data['days'] = (stamps - stamps.min()).dt.days
    return combined_data

# Parse the submission timestamp, then derive the calendar columns from it.
combined_data[dateCol] = pd.to_datetime(combined_data[dateCol])
combined_data = getTimeFeatures(combined_data)

# `target` was used here without being defined in any earlier cell, so a
# fresh-kernel run stopped with a NameError.
# NOTE(review): 'project_is_approved' is presumed to be the label column of
# the training file -- confirm against train.csv.
target = 'project_is_approved'

# Mean of the label over the training rows only (tr == 1).
P_tar = combined_data.loc[combined_data.tr == 1, target].mean()

# Frequency-encode every calendar feature: '<feature>_stat' is the share of
# all rows that have the same value.
timeFeatures = ['year', 'month', 'day', 'dow', 'hour', 'days']
for col in timeFeatures:
    Stat = combined_data[['id', col]].groupby(col).agg('count').rename(columns={'id': col + '_stat'})
    Stat /= Stat.sum()
    combined_data = combined_data.join(Stat, on=col)
    statFeatures.append(col + '_stat')

# Register the calendar columns and all frequency columns as numeric features.
numFeatures += timeFeatures
numFeatures += statFeatures

In [58]:

def getCatFeatures(data, Col):
    """Binary bag-of-categories encoding of a comma-separated text column.

    Missing values are replaced by the empty string. Each cell is then split
    on commas and the pieces are stripped of surrounding whitespace. A
    ``CountVectorizer`` with ``binary=True`` and unigrams only is fitted on
    the result.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column to encode.
    Col : str
        Name of the column.

    Returns
    -------
    The document-term matrix returned by ``CountVectorizer.fit_transform``.
    """
    def split_on_commas(cell):
        # "A, B" -> ["A", "B"]
        return [piece.strip() for piece in cell.split(',')]

    encoder = CountVectorizer(tokenizer=split_on_commas,
                              ngram_range=(1, 1),
                              binary=True)
    filled = data[Col].fillna('')
    return encoder.fit_transform(filled)

# Encode each categorical column. The order of this list is the column-block
# order of the stacked matrix built below.
categorical_columns = ['teacher_prefix',
                       'school_state',
                       'project_grade_category',
                       'project_subject_categories',
                       'project_subject_subcategories']
teach_pref, teach_ss, teach_pgc, teach_psc, teach_pssc = (
    getCatFeatures(combined_data, column) for column in categorical_columns
)

# Side-by-side concatenation of the five sparse blocks.
cat_data = hstack((teach_pref, teach_ss, teach_pgc, teach_psc, teach_pssc))

In [59]:

# Shared Porter stemmer instance; stemmed_words() below calls porter.stem on
# every word it extracts.
porter = PorterStemmer()
# Analyzer callable of a default-configured CountVectorizer. In the visible
# part of this notebook it is only referenced by the commented-out
# stemmed_words variant further down.
analyzer = CountVectorizer().build_analyzer()

def getTxtFeatures(data, Col, max_features=10000, ngrams=(1,2)):
    """Binary n-gram bag-of-words encoding of a free-text column.

    Every document is first run through ``stemmed_words`` (used as the
    vectorizer's preprocessor), then vectorized with ``stop_words='english'``,
    ``binary=True``, the given n-gram range and vocabulary size cap.

    NOTE(review): the stop-word list is applied to the tokens produced after
    preprocessing, so stemmed forms may no longer match the English stop
    list -- confirm this is intended.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the text column.
    Col : str
        Name of the text column.
    max_features : int
        Passed to ``CountVectorizer(max_features=...)``.
    ngrams : tuple
        Passed to ``CountVectorizer(ngram_range=...)``.

    Returns
    -------
    The document-term matrix returned by ``CountVectorizer.fit_transform``.
    """
    vectorizer_options = dict(
        stop_words='english',
        preprocessor=stemmed_words,
        max_features=max_features,
        binary=True,
        ngram_range=ngrams,
    )
    return CountVectorizer(**vectorizer_options).fit_transform(data[Col])

def stemmed_words(sentence):
    """Return ``sentence`` as a space-separated string of stemmed words.

    The text is split on every single non-word character, empty pieces are
    dropped, and each remaining piece is lower-cased and passed through the
    module-level ``porter`` stemmer. The stems are joined with one space.

    Parameters
    ----------
    sentence : str
        Raw text of one document.

    Returns
    -------
    str
        The stemmed words joined by single spaces ('' if there are none).
    """
    # Raw string: '\W' inside a normal literal is an invalid escape sequence
    # (a warning today, slated to become an error). The pattern is unchanged.
    pieces = re.split(r'\W', sentence)
    return ' '.join(porter.stem(piece.lower()) for piece in pieces if piece)

# def stemmed_words(doc):
    
#     return (porter.stem(w.lower()) for w in analyzer(doc))

# Vocabulary size cap for each text column.
params = {
    "essy1_param": 3000,
    "essy2_param": 8000,
    "rsummary_param": 2000,
    "rdescript_param": 3000,
    "ptitle_param": 1000,
}

# (column, key into params, n-gram range), listed in the order in which the
# resulting blocks are stacked. Only the resource descriptions use trigrams.
text_specs = [
    ('project_essay_1', "essy1_param", (1, 2)),
    ('project_essay_2', "essy2_param", (1, 2)),
    ('project_resource_summary', "rsummary_param", (1, 2)),
    ('resource_description', "rdescript_param", (1, 3)),
    ('project_title', "ptitle_param", (1, 2)),
]
text_blocks = [
    getTxtFeatures(combined_data, column,
                   max_features=params.get(param_key),
                   ngrams=ngram_range)
    for column, param_key, ngram_range in text_specs
]

# Side-by-side concatenation of the five text blocks; the individual blocks
# are released afterwards.
process_txt = hstack(text_blocks)
del text_specs, text_blocks