In [6]:
import pandas as pd
import numpy as np
import random

pd.options.mode.chained_assignment = None  # default='warn'

In [7]:
def rmsle(y, y_):
    log1 = np.nan_to_num(np.array([np.log(v + 1) for v in y]))
    log2 = np.nan_to_num(np.array([np.log(v + 1) for v in y_]))
    calc = (log1 - log2) ** 2
    
    return np.sqrt(np.mean(calc))

def gini(list_of_values):
    sorted_list = sorted(list(list_of_values))
    height, area = 0, 0
  
    for value in sorted_list:
        height += value
        area += height - value / 2.
    
    fair_area = height * len(list_of_values) / 2
    return (fair_area - area) / fair_area


def normalized_gini(y_pred, y):
    normalized_gini = gini(y_pred)/gini(y)
    return normalized_gini
    

predicted_y = np.random.randint(100, size = 1000)
desired_y = np.random.randint(100, size = 1000)

print (normalized_gini(predicted_y, desired_y))

0.993591937707


In [8]:
# Load Training Data
df_train = pd.read_csv('data/train.csv', index_col = 'id')
print(df_train.shape)
df_train.head()

(595212, 58)


Unnamed: 0_level_0,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7,0,2,2,5,1,0,0,1,0,0,...,9,1,5,8,0,1,1,0,0,1
9,0,1,1,7,0,0,0,0,1,0,...,3,1,1,9,0,1,1,0,1,0
13,0,5,4,9,1,0,0,0,1,0,...,4,2,7,7,0,1,1,0,1,0
16,0,0,1,2,0,0,1,0,0,0,...,2,2,4,9,0,0,0,0,0,0
17,0,0,2,0,1,0,1,0,0,0,...,3,1,1,3,0,0,0,1,1,0


In [9]:
# Load Test Data
df_test = pd.read_csv('data/test.csv', index_col = 'id')
print(df_test.shape)
df_test.head()

(892816, 57)


Unnamed: 0_level_0,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,1,8,1,0,0,1,0,0,0,...,1,1,1,12,0,1,1,0,0,1
1,4,2,5,1,0,0,0,0,1,0,...,2,0,3,10,0,0,1,1,0,1
2,5,1,3,0,0,0,0,0,1,0,...,4,0,2,4,0,0,0,0,0,0
3,0,1,6,0,0,1,0,0,0,0,...,5,1,0,5,1,0,1,0,0,0
4,5,1,7,0,0,0,0,0,1,0,...,4,0,0,4,0,1,1,0,0,1


In [12]:
# Split the Train DataSet into X and y
X = df_train.drop('target', axis=1)
y = df_train.target

# Shuffle and Split the data
# This is running a StratifiedShuffleSplit in sklearn
import sklearn.model_selection as skms
X_train, X_validation, y_train, y_validation = skms.train_test_split(X, y,
                                                                     test_size=0.2, train_size=0.8,
                                                                     random_state=42)

In [24]:
# Perform a LInear Regression
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train, y_train)



LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [25]:
# Generate Metrics on Validation Set
from sklearn.metrics import mean_squared_error

y_prediction = model.predict(X_validation)
rmsle_val = rmsle(y_validation, y_prediction)
rmse_val = mean_squared_error(y_validation, y_prediction)**0.5
normalized_gini_val = normalized_gini(y_prediction, y_validation)

print('Validation Metrics')
print('Normalized gini:', normalized_gini_val)
print('Root Mean Squared Logarithmic Error:', rmsle_val)
print('Root Mean Squared Error:', rmse_val)

Validation Metrics
Normalized gini: 0.256293303236
Root Mean Squared Logarithmic Error: 0.130478185147
Root Mean Squared Error: 0.187586198309


In [15]:
# Train Linear Regression model
from xgboost import XGBRegressor
model_xgb = XGBRegressor()
model_xgb.fit(X_train, y_train)



XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [16]:
# Generate Metrics on Validation Set
from sklearn.metrics import mean_squared_error

y_prediction = model_xgb.predict(X_validation)
rmsle_val = rmsle(y_validation, y_prediction)
rmse_val = mean_squared_error(y_validation, y_prediction)**0.5
normalized_gini_val = normalized_gini(y_prediction, y_validation)

print('Validation Metrics')
print('Normalized gini:', normalized_gini_val)
print('Root Mean Squared Logarithmic Error:', rmsle_val)
print('Root Mean Squared Error:', rmse_val)

Validation Metrics
Normalized gini: 0.254443871121
Root Mean Squared Logarithmic Error: 0.130254730791
Root Mean Squared Error: 0.187328529255


In [21]:
# Predict on the Test Dataset
X_test = df_test
y_test = model_xgb.predict(X_test)

In [22]:
# Build the Submission Dataset
predictions = pd.DataFrame()
predictions['id'] = X_test.index
predictions['target'] = y_test.tolist()

print(predictions.shape)
print(predictions.head())

(892816, 2)
   id    target
0   0  0.026370
1   1  0.026142
2   2  0.031541
3   3  0.013401
4   4  0.037014


In [23]:
# Save Output
import time
submission_path = 'data/submission_' + str(time.time()) + '.csv'
predictions.to_csv(submission_path, index=False)