## To-Do:

* Explore dataset
    * Outlier Analysis
* Feature engineering
    * column indicating whether the animal has been given a name
    * column with the period of day of the outcome (0-8;9-18;19-24)
        * verify what the working hours are in the United States
    * identify the weekends and holidays in the United States
    * season (winter, summer, ...)
    * divide the sexuponoutcome into two columns
        * sex (0, 1) for Male and Female;
        * operation (0, 1) for representing Neutered (Spayed and Neutered) and Intact;
    * transform the age to years
    
* Ideas/ToDo: 
    * check whether, for the algorithm evaluations, we should separate the approaches for cats and dogs
        * dogs tend to be returned to their owner more often than cats;
        * cats are transferred more often than dogs
    * get the feature importances and create a model with only the most important features
    * Evaluate with log loss rather than accuracy
 

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from datetime import datetime

## Load Train and Test datasets

In [2]:
def loadTrainAndTestDatasets():
    """Read the training and test CSV files from the ``data`` directory.

    Returns:
        A ``(train_df, test_df)`` tuple of DataFrames loaded from
        ``data/train.csv`` and ``data/test.csv`` respectively.
    """
    train_path, test_path = "data/train.csv", "data/test.csv"
    return pd.read_csv(train_path), pd.read_csv(test_path)

In [3]:
#pd.crosstab(train_df["OutcomeType"], train_df["OutcomeSubtype"], margins = True)

In [4]:
# Note: most of the code uses lambdas... I would rather use functions so that I can
# play with the parameters.
# These functions are not being used yet.
# Check which is better (converting the age to days or using a category).

def has_name(name):
    """Return 1 when the animal has a name and 0 when the name is missing.

    Args:
        name: the raw name value; ``None`` or any NaN counts as missing.

    Returns:
        ``0`` for a missing name, ``1`` for anything else (an empty string
        still counts as a name).
    """
    # pd.isnull recognises every NaN value and None; an identity check
    # against np.nan only matches that one singleton object.
    if pd.isnull(name):
        return 0
    return 1

def calc_age_in_years(x):
    """Convert an age description such as ``"6 months"`` into years.

    Args:
        x: age text whose first token is an integer count and which mentions
            ``year``, ``month``, ``week`` or ``day``. Any other value is
            stringified first, so a float NaN becomes ``'nan'``.

    Returns:
        The integer count unchanged for years, the count divided by 12, 52 or
        365 for months, weeks or days, and ``0`` for NaN or an unknown unit.
    """
    text = str(x)
    if text == 'nan':
        return 0

    count = int(text.split()[0])
    if 'year' in text:
        return count

    # Remaining units are tried in this fixed order; the first match wins.
    for unit, units_per_year in (('month', 12.), ('week', 52.), ('day', 365.)):
        if unit in text:
            return count / units_per_year
    return 0
    
def calc_age_category(x):
    """Bucket a numeric age into one of four ordinal categories.

    Args:
        x: numeric age (presumably in years, matching ``calc_age_in_years``).

    Returns:
        ``0`` when ``x < 3``, ``1`` when ``x < 5``, ``2`` when ``x < 10`` and
        ``3`` otherwise.
    """
    for category, upper_bound in enumerate((3, 5, 10)):
        if x < upper_bound:
            return category
    return 3

In [5]:
def part_of_day(x):
    """Label the hour of a timestamp as one of three periods of the day.

    Args:
        x: timestamp text in ``'%Y-%m-%d %H:%M:%S'`` format.

    Returns:
        ``'0-8'`` for hours below 9, ``'9-18'`` for hours 9 to 18 and
        ``'19-24'`` for hours 19 and later.
    """
    hour = datetime.strptime(x, '%Y-%m-%d %H:%M:%S').hour
    if hour >= 19:
        return '19-24'
    return '0-8' if hour < 9 else '9-18'

In [6]:
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
cal = calendar()
# US federal holidays between 2013-01-01 and 2016-02-28, one midnight
# timestamp per holiday.
holidays = cal.holidays(start='2013-01-01 00:00:00', end='2016-02-28 00:00:00')

def is_holiday(x):
    """Return 1 when a timestamp falls on a US federal holiday, else 0.

    Args:
        x: timestamp text in ``'%Y-%m-%d %H:%M:%S'`` format.

    Returns:
        ``1`` if the calendar day of ``x`` is in ``holidays``, ``0``
        otherwise (including any date outside the window ``holidays`` covers).
    """
    dt = datetime.strptime(x, '%Y-%m-%d %H:%M:%S')
    # `holidays` only contains midnight timestamps, so the time of day must be
    # dropped first; otherwise only events at exactly 00:00:00 would match.
    midnight = dt.replace(hour=0, minute=0, second=0, microsecond=0)
    if midnight in holidays:
        return 1
    return 0

In [7]:
def get_season(x):
    """Map a timestamp to the season of its calendar month.

    Args:
        x: timestamp text in ``'%Y-%m-%d %H:%M:%S'`` format.

    Returns:
        ``'spring'`` for March-May, ``'summer'`` for June-August, ``'fall'``
        for September-November and ``'winter'`` for December-February.
    """
    month = datetime.strptime(x, '%Y-%m-%d %H:%M:%S').month

    # Seasons start on the first day of a month, so the month alone decides.
    if 3 <= month <= 5:
        return 'spring'
    if 6 <= month <= 8:
        return 'summer'
    if 9 <= month <= 11:
        return 'fall'
    return 'winter'

In [8]:
def is_weekend(x):
    """Return 1 when a timestamp falls on a Saturday or Sunday, else 0.

    Args:
        x: timestamp text in ``'%Y-%m-%d %H:%M:%S'`` format.
    """
    # weekday() numbers the days from Monday (0) to Sunday (6).
    day_index = datetime.strptime(x, '%Y-%m-%d %H:%M:%S').weekday()
    return 1 if day_index >= 5 else 0

In [9]:
def normalizeColumn(data):
    """Rescale ``data`` with a default-configured scikit-learn ``MinMaxScaler``.

    Args:
        data: the values to scale; passed straight to ``fit_transform``.

    Returns:
        The scaled values as the ``.values`` array of a DataFrame built from
        the scaler output.
    """
    from sklearn.preprocessing import MinMaxScaler

    scaled = MinMaxScaler().fit_transform(data)
    # Round-trip through a DataFrame and hand back its underlying array.
    return pd.DataFrame(scaled).values

In [10]:
def agetodays(x):
    """Convert an age description such as ``"2 years"`` into a number of days.

    Args:
        x: text of the form ``"<count> <unit>"`` where the unit contains
            ``year``, ``month``, ``week`` or ``day``. Non-string values
            (for example a float NaN) are accepted.

    Returns:
        The age in days as a float, or ``None`` when ``x`` is not a string,
        does not contain both a count and a unit, or names an unknown unit.

    Raises:
        ValueError: if the unit is recognised but the count is not numeric.
    """
    try:
        parts = x.split()
    except AttributeError:
        # Not a string, so there is no age to parse.
        return None

    # Guard against empty or single-token text instead of raising IndexError.
    if len(parts) < 2:
        return None

    unit = parts[1]
    # 365 / 12.0 keeps the fractional days even where `/` on two integers
    # truncates (Python 2).
    days_per_unit = (('year', 365), ('month', 365 / 12.0), ('week', 7), ('day', 1))
    for name, days in days_per_unit:
        if name in unit:
            return float(parts[0]) * days
    return None

In [11]:
#train_df = pd.read_csv("data/train.csv")
#test_df = pd.read_csv("data/test.csv")
#print(train_df['DateTime'][:10], train_df['DateTime'][:10].apply(part_of_day))


In [12]:
def prepareDatasets(train_df, test_df):
    """Turn the raw train/test frames into numeric feature matrices.

    Both frames are stacked so that feature engineering and one-hot encoding
    produce the same columns for each, then split apart again by row position.
    Progress messages are printed along the way.

    Args:
        train_df: raw training frame. The code reads the columns ``Name``,
            ``DateTime``, ``AgeuponOutcome``, ``OutcomeType``,
            ``OutcomeSubtype``, ``SexuponOutcome``, ``AnimalType``, ``Breed``,
            ``Color``, ``AnimalID`` and ``ID`` from the stacked frame, so each
            of them must exist in at least one of the two inputs.
        test_df: raw test frame, stacked below ``train_df``.

    Returns:
        A ``(train_outcome, train, test)`` tuple: the ``OutcomeType`` values
        of the training rows, the encoded training features and the encoded
        test features.
    """
    print('-- Transformation step has begun --- ')
    ## JOIN train and test datasets
    all_data = pd.concat([train_df, test_df])

    # The first train_length rows of all_data are the training rows.
    train_length = train_df.shape[0]

    # NAME
    ## New feature has_name
    ## Drop Name column
    all_data['has_name'] = all_data.Name.apply(has_name)
    all_data.drop(['Name'], axis=1, inplace=True)

    print('  * NAME is completed! ')

    # DATETIME
    ## Split date to year, month, day
    ## New features: part_of_day, is_holiday, is_weekend, season
    ## Drop Datetime column
    # Parse the timestamps once and reuse the result for all three date parts.
    outcome_dates = pd.DatetimeIndex(all_data['DateTime'])
    all_data['Year'] = outcome_dates.year
    all_data['Month'] = outcome_dates.month
    all_data['Day'] = outcome_dates.day
    all_data['part_of_day'] = all_data.DateTime.apply(part_of_day)
    print('       # PART OF DAY is completed! ')
    all_data['is_holiday'] = all_data.DateTime.apply(is_holiday)
    print('       # IS HOLIDAY is completed! ')
    all_data['is_weekend'] = all_data.DateTime.apply(is_weekend)
    print('       # IS WEEKEND is completed! ')
    all_data['season'] = all_data.DateTime.apply(get_season)
    print('       # SEASON is completed! ')
    all_data.drop(["DateTime"], axis=1, inplace=True)

    print('  * DATETIME is completed! ')

    # TODO: test whether an age-in-years feature (calc_age_in_years) gives
    # better results than the age-in-days feature built below.

    # AGEUPONOUTCOME
    ## Convert age to days
    ## Fill NaN values with the median
    ## Drop AgeuponOutcome column
    all_data['AgeUponOutcomeInDays'] = all_data['AgeuponOutcome'].map(agetodays)
    missing_age = all_data['AgeUponOutcomeInDays'].isnull()
    all_data.loc[missing_age, 'AgeUponOutcomeInDays'] = all_data['AgeUponOutcomeInDays'].median()
    all_data.drop(["AgeuponOutcome"], axis=1, inplace=True)

    print('  * AGEUPONOUTCOME is completed! ')

    # Normalize columns
    # Mean normalisation: values are centred on the column mean and divided by
    # the column range, so they are not confined to the interval 0..1.
    cols_to_norm = ['Year', 'Month', 'Day', 'AgeUponOutcomeInDays']
    all_data[cols_to_norm] = all_data[cols_to_norm].apply(
        lambda x: (x - x.mean()) / (x.max() - x.min()))

    print('  * NORMALIZATION is completed! ')

    # OUTCOMETYPE
    ## Separating target variable
    train_outcome = all_data["OutcomeType"][:train_length]
    all_data.drop(["OutcomeType"], axis=1, inplace=True)

    print('  * OUTCOMETYPE is completed! ')

    # OUTCOMESUBTYPE
    ## OutcomeSubtype are deleted
    all_data.drop(["OutcomeSubtype"], axis=1, inplace=True)

    print('  * OUTCOMESUBTYPE is completed! ')

    # SEXUPONOUTCOME
    ## Fill NaN with the mode
    ## New feature is_spayed_or_neutered
    most_common_sex = all_data['SexuponOutcome'].mode().iloc[0]
    # Assign the filled column back: fillna(inplace=True) on a column
    # selection is not guaranteed to write through to all_data.
    all_data['SexuponOutcome'] = all_data['SexuponOutcome'].fillna(most_common_sex)
    # Compare case-insensitively. A lowercase-only test never matches labels
    # that are capitalised (presumably "Spayed ..."/"Neutered ..." here), and
    # would leave this feature at 0 for every row.
    all_data["is_spayed_or_neutered"] = all_data['SexuponOutcome'].apply(
        lambda e: 1 if 'spayed' in e.lower() or 'neutered' in e.lower() else 0)

    print('  * SEXUPONOUTCOME is completed! ')

    # ANIMALTYPE
    ## Encoded below together with the other categorical columns

    # BREED
    ## New feature: is_mix
    all_data["is_mix"] = all_data['Breed'].apply(lambda e: 1 if "Mix" in e else 0)

    print('  * BREED is completed! ')

    # COLOR
    ## New feature: has_multiple_colors (colour names are separated by '/')
    all_data["has_multiple_colors"] = all_data['Color'].apply(
        lambda e: 1 if len(str(e).split('/')) > 1 else 0)

    print('  * COLOR is completed! ')

    # Deleting IDs
    all_data.drop(["AnimalID", "ID"], axis=1, inplace=True)

    print('  * IDs is completed! ')

    # http://pandas.pydata.org/pandas-docs/stable/generated/pandas.get_dummies.html
    # Encode the categorical data of all_data, column by column due to memory
    # restrictions. Each input is a single Series, for which get_dummies has
    # no use for a `columns` argument, so none is passed.
    AnimalType_encoded = pd.get_dummies(all_data['AnimalType'])
    SexuponOutcome_encoded = pd.get_dummies(all_data['SexuponOutcome'])
    Breed_encoded = pd.get_dummies(all_data['Breed'])
    Color_encoded = pd.get_dummies(all_data['Color'])
    part_of_day_encoded = pd.get_dummies(all_data['part_of_day'])
    season_encoded = pd.get_dummies(all_data['season'])

    print('  * GET DUMMIES is completed! ')

    # http://pandas.pydata.org/pandas-docs/stable/merging.html
    # The raw categorical columns are replaced by their encoded versions.
    all_data.drop(["AnimalType", "SexuponOutcome", "Breed", "Color", "part_of_day", "season"],
                  axis=1, inplace=True)

    print('  * DROP GET DUMMIES is completed! ')

    all_data_encoded = pd.concat([all_data, part_of_day_encoded, season_encoded, AnimalType_encoded,
                                  SexuponOutcome_encoded, Breed_encoded, Color_encoded],
                                 axis=1)

    # Split again for train and test
    train = all_data_encoded[:train_length]
    test = all_data_encoded[train_length:]

    print('-- Transformation step has finished --- ')

    return train_outcome, train, test

## Cross-validation

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report, accuracy_score, log_loss

def calculateCVMetrics(train, train_outcome, model):
    """Print the hold-out log loss of ``model`` on a random 70/30 split.

    An unfitted copy of ``model`` is trained on the 70% part and scored on the
    remaining 30%. The estimator passed in is left untouched, so a model that
    was already fitted on the full training set keeps that fit.

    Args:
        train: training feature matrix.
        train_outcome: target labels aligned with ``train``.
        model: scikit-learn compatible classifier providing ``fit`` and
            ``predict_proba``; only its parameters are used.

    Returns:
        None. The metric is printed.
    """
    # clone() builds a new, unfitted estimator with the same parameters.
    from sklearn.base import clone

    X_train, X_val, y_train, y_val = train_test_split(train, train_outcome, test_size=0.3)

    # Fitting `model` itself would overwrite whatever it was trained on with a
    # fit on only 70% of the data, which would then be used for predictions.
    candidate = clone(model)
    candidate.fit(X_train, y_train)

    clf_probs = candidate.predict_proba(X_val)
    print('Log Loss metric')
    print(log_loss(y_val, clf_probs))


## Calculate final model (with best parameters or not)

In [14]:
from sklearn.grid_search import GridSearchCV

def CreateFinalModel(train, train_outcome, bestParams=False):
    """Fit and return the random-forest model used for the final predictions.

    Args:
        train: training feature matrix.
        train_outcome: target labels aligned with ``train``.
        bestParams: when true, run a 10-fold grid search over ``max_features``
            with 500 trees; otherwise fit a single 250-tree forest.

    Returns:
        The fitted ``GridSearchCV`` object when ``bestParams`` is true,
        otherwise the fitted ``RandomForestClassifier``.
    """
    if not bestParams:
        estimator = RandomForestClassifier(n_estimators=250, n_jobs=5)
    else:
        search_space = {
            'n_estimators': [500],
            'max_features': ['auto', 'sqrt', 'log2'],
        }
        estimator = GridSearchCV(estimator=RandomForestClassifier(),
                                 param_grid=search_space, cv=10, n_jobs=4)

    estimator.fit(train, train_outcome)
    return estimator

## Create predictions and submission file

In [15]:
def makePredictions(model, test):
    """Return the class probabilities ``model`` predicts for ``test``.

    Args:
        model: fitted estimator exposing ``predict_proba``.
        test: feature matrix to score.
    """
    class_probabilities = model.predict_proba(test)
    return class_probabilities

In [16]:
def createCSVSubmissionFile(predictions, fileName):
    """Write class probabilities into a copy of the sample submission file.

    Args:
        predictions: 2-D array with one row per row of
            ``submissions/sample_submission.csv`` and the five class
            probabilities in columns 0 to 4, in the order Adoption, Died,
            Euthanasia, Return_to_owner, Transfer.
        fileName: name of the CSV file to create inside ``submissions/``.

    Returns:
        None. The file ``submissions/<fileName>`` is written without an index.
    """
    results = pd.read_csv("submissions/sample_submission.csv")

    # One assignment per column. A tuple assignment split across lines without
    # continuation only ever assigns the last target.
    outcome_columns = ['Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer']
    for position, column in enumerate(outcome_columns):
        results[column] = predictions[:, position]

    results.to_csv("submissions/" + fileName, index=False)

# RUN ALL STEPS

In [17]:
# Load the raw train and test CSV files into DataFrames.
train_df, test_df = loadTrainAndTestDatasets()

In [18]:
# Build the training target and the encoded train/test feature matrices.
train_outcome, train, test = prepareDatasets(train_df, test_df)

-- Transformation step has begun --- 
  * NAME is completed! 
       # PART OF DAY is completed! 
       # IS HOLIDAY is completed! 
       # IS WEEKEND is completed! 
       # SEASON is completed! 
  * DATETIME is completed! 
  * AGEUPONOUTCOME is completed! 
  * NORMALIZATION is completed! 
  * OUTCOMETYPE is completed! 
  * OUTCOMESUBTYPE is completed! 
  * SEXUPONOUTCOME is completed! 
  * BREED is completed! 
  * COLOR is completed! 
  * IDs is completed! 
  * GET DUMMIES is completed! 
  * DROP GET DUMMIES is completed! 
-- Transformation step has finished --- 


In [None]:
# Preview the first rows of a hand-picked subset of the engineered columns.
train[['is_holiday', 'Year','Month', 'Day', 'AgeUponOutcomeInDays', '0-8', '9-18', '19-24', 'winter', 'summer',
      'has_multiple_colors', 'is_mix', 'is_spayed_or_neutered']].head()

In [None]:
# Preview the first rows of the encoded test features.
test.head()

In [None]:
# Preview the first training target values.
train_outcome.head()

In [None]:
# Flag passed to CreateFinalModel as bestParams: False fits the
# fixed-parameter model, True runs the grid search.
findBestParams = False
finalModel = CreateFinalModel(train, train_outcome, findBestParams)

In [None]:
# Report the hold-out log loss. After a grid search, print the winning
# parameters and evaluate the best estimator rather than the search object.
if findBestParams:
    print(finalModel.best_params_)
    calculateCVMetrics(train, train_outcome, finalModel.best_estimator_)
else:
    calculateCVMetrics(train, train_outcome, finalModel)

In [None]:
# Class probabilities for every test row, taken from finalModel.
predictions = makePredictions(finalModel, test)

In [None]:
# Name the submission after the current timestamp (YYYYMMDD-HHMMSS) and write it.
from datetime import datetime
now = datetime.now().strftime("%Y%m%d-%H%M%S")
fileName = "submission_" + now + ".csv"
createCSVSubmissionFile(predictions, fileName)

# TESTING FEATURE SELECTION

In [None]:
# Show the per-feature importances of the fitted model.
# NOTE(review): if findBestParams is True, finalModel is the GridSearchCV
# object; the importances would then presumably have to be read from
# finalModel.best_estimator_ — confirm.
finalModel.feature_importances_

In [None]:
# Wrap the already-fitted model (prefit=True) in a feature selector and
# transform the training matrix with it.
from sklearn.feature_selection import SelectFromModel
model = SelectFromModel(finalModel, prefit=True)
X_new = model.transform(train)

In [None]:
# Fit a second model on the feature-selected training matrix.
newModel = CreateFinalModel(X_new, train_outcome, findBestParams)

In [None]:
# Same evaluation as for finalModel, on the feature-selected matrix.
if findBestParams:
    print(newModel.best_params_)
    calculateCVMetrics(X_new, train_outcome, newModel.best_estimator_)
else:
    calculateCVMetrics(X_new, train_outcome, newModel)

In [None]:
# Apply the same feature selection to the test features, then predict with
# the model trained on the selected features. This overwrites `predictions`.
Y_new = model.transform(test)
predictions = makePredictions(newModel, Y_new)

## XGBoost
ToDo:
- tuning parameters
- split dataset of cats and dogs and make different models for each one
- ensemble??
- add more features
- clean/organize notebook structure

In [None]:

# Baseline XGBoost classifier with default hyper-parameters; the tuned
# configuration below is kept commented out for later experiments.
from xgboost.sklearn import XGBClassifier
#modelXGB = XGBClassifier(n_estimators=1000, learning_rate = 0.03, max_depth=6, subsample=0.7, 
#        colsample_bytree = 0.7, # gamma = 0.7, # max_delta_step=0.1, 
#        reg_lambda = 4, # min_child_weight=50, 
        #seed = seed, 
#                        ) 
modelXGB = XGBClassifier() 
    
# Fit on the full training set, then report the hold-out log loss.
modelXGB.fit(train, train_outcome, eval_metric='mlogloss',)
calculateCVMetrics(train, train_outcome, modelXGB)

In [None]:
# Class probabilities for every test row, taken from the XGBoost model.
predictionsXGBoost = makePredictions(modelXGB, test)

In [None]:
# Write the XGBoost predictions to a timestamped submission file.
from datetime import datetime
now = datetime.now().strftime("%Y%m%d-%H%M%S")
fileName = "submission_" + now + ".csv"
createCSVSubmissionFile(predictionsXGBoost, fileName)