In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import sklearn
from datetime import date, timedelta
from sklearn.model_selection import train_test_split
import lightgbm as lgb

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# Any results you write to the current directory are saved as output.

In [None]:
# Change your location
# Load the three competition CSV files from the Kaggle input directory.
# Training data (includes the Sales target).
train = pd.read_csv('/kaggle/input/womenintheloop-data-science-hackathon/train.csv')
# Test data to predict on.
test = pd.read_csv('/kaggle/input/womenintheloop-data-science-hackathon/test_QkPvNLx.csv')
# Sample submission file; later cells attach model predictions to it as new columns.
sample = pd.read_csv('/kaggle/input/womenintheloop-data-science-hackathon/sample_submission_pn2DrMq.csv')

In [None]:
# Columns of the data set
print(list(train.columns))
print(list(test.columns))

In [None]:
print(train.shape)
print(test.shape)

In [None]:
# Note that the User_Traffic column is present in the train set but missing from the test set.
# Let's analyse each column in turn.

In [None]:
train.ID.nunique()

In [None]:
# Number of distinct values per column (a missing value counts as one distinct value).
train.nunique(dropna=False)

In [None]:
# ID is unique for every data point, so it carries no predictive information and can be removed.

In [None]:
del train['ID']
del test['ID']

In [None]:
# Sales is the target variable and its a regression problem, lets further analyse sales

In [None]:
train.Sales.describe() # 

In [None]:
# From the description of sales, we find that minimum sales is 0 which is not possible

In [None]:
train[['Sales']].boxplot()

In [None]:
# Distribution of the target variable.
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11 (histplot/displot are the
# replacements) — confirm the installed seaborn version before upgrading.
sns.distplot(train['Sales'])

In [None]:
# From the distribution we can see that:
# - Sales does not follow a normal distribution
# - Sales is positively skewed

In [None]:
print("Skewness= ", train['Sales'].skew())
print("Kurtosis= ", train['Sales'].kurt())

In [None]:
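# Since kurtosis is > 3, we can conclude that the distribution is heavy-tailed and has many outliers
# (note: pandas .kurt() reports excess kurtosis, for which a normal distribution scores 0).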

In [None]:
# Analysis of Day_Number

In [None]:
# Plot on a random sample because the full dataset is too large to render quickly.
# A fixed random_state keeps the figure reproducible across kernel restarts, and
# capping n at len(train) avoids a ValueError when fewer than 1000 rows are available.
sampletrain = train.sample(n=min(1000, len(train)), random_state=42)
sns.regplot(x='Day_No', y='Sales', data=sampletrain)

In [None]:
# As the graph above shows, Day_No on its own is not a good predictor of Sales.

In [None]:
# Box plots of the Course_ID and Sales columns, drawn together on one set of axes.
train[['Course_ID','Sales']].boxplot()

In [None]:
# Box plot over the Course_Domain and Sales columns.
# NOTE(review): Course_Domain is one-hot encoded later in the notebook, so it looks
# categorical; if it is non-numeric, presumably only Sales is drawn here — verify.
train[['Course_Domain','Sales']].boxplot()

In [None]:
# Box plots of the Short_Promotion and Sales columns, drawn together on one set of axes.
train[['Short_Promotion','Sales']].boxplot()

In [None]:
# From the graphs above we can conclude that the data should be normalised before fitting it to a model.

In [None]:
# Correlation heatmap of the numeric features.
# Select numeric/bool columns explicitly: pandas >= 2.0 no longer drops non-numeric
# columns silently inside DataFrame.corr() and raises on string columns instead.
numeric_corr = train.select_dtypes(include=['number', 'bool']).corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(numeric_corr, annot=True, ax=ax)

In [None]:
# Pairwise correlations as one sorted Series (lowest to highest), with repeated
# values collapsed. Numeric/bool columns are selected explicitly because
# DataFrame.corr() raises on string columns in pandas >= 2.0.
train.select_dtypes(include=['number', 'bool']).corr().unstack().sort_values().drop_duplicates()

In [None]:
# Number of rows per Course_Domain. The column is passed by keyword because newer
# seaborn releases treat the first positional argument as `data`, not `x`.
sns.countplot(x='Course_Domain', data=train)

In [None]:
# Number of rows per Long_Promotion value. The column is passed by keyword because
# newer seaborn releases treat the first positional argument as `data`, not `x`.
sns.countplot(x='Long_Promotion', data=train)

In [None]:
# Number of rows per Short_Promotion value. The column is passed by keyword because
# newer seaborn releases treat the first positional argument as `data`, not `x`.
sns.countplot(x='Short_Promotion', data=train)

In [None]:
# Sales cannot be zero, so keep only rows with positive Sales.
sample_train = train.loc[train['Sales'] > 0]
# Sales over Day_No for a single course (Course_ID 132).
course_132 = sample_train.loc[sample_train['Course_ID'] == 132, ['Day_No', 'Sales']]
course_132.plot(x='Day_No', y='Sales', figsize=(16, 4))

In [None]:
# Mean Sales for every combination of course attributes and promotion/holiday flags.
group_keys = ['Course_ID','Course_Domain','Course_Type','Short_Promotion','Public_Holiday','Long_Promotion']
avg_sales = train.groupby(group_keys)['Sales'].mean().reset_index()

Adding a new feature by using Day_No 

In [None]:
def day_to_date(dataset, start=date(2018, 12, 31)):
    """Add a ``Date`` column derived from ``Day_No``, modifying ``dataset`` in place.

    Args:
        dataset: DataFrame with a ``Day_No`` column holding day offsets.
        start: Calendar date corresponding to ``Day_No == 0``. Defaults to
            2018-12-31, so ``Day_No == 1`` maps to 2019-01-01.

    Returns:
        None. The new ``Date`` column holds ``datetime.date`` objects.
    """
    dataset['Date'] = dataset['Day_No'].apply(lambda day_no: start + timedelta(days=day_no))

def day_month_year(dataset):
    """Split the ``Date`` column into ``Day``, ``Month`` and ``Year`` columns, in place.

    Args:
        dataset: DataFrame whose ``Date`` column holds objects exposing
            ``.day``, ``.month`` and ``.year`` attributes.

    Returns:
        None. The three columns are added to ``dataset`` in that order.
    """
    dates = dataset['Date']
    for column, attribute in (('Day', 'day'), ('Month', 'month'), ('Year', 'year')):
        # Bind the attribute name as a default so each lambda reads its own field.
        dataset[column] = dates.apply(lambda value, name=attribute: getattr(value, name))

In [None]:
# Derive Date and then Day/Month/Year on both frames (train first, then test).
for frame in (train, test):
    day_to_date(frame)
    day_month_year(frame)

Since the test set doesn't contain User_Traffic, we remove it from the train set.

In [None]:
# Remove the User_Traffic column from train in place.
train.drop(columns='User_Traffic', inplace=True)

In [None]:
print(train.columns)
print(test.columns)

In [None]:
# merge train and test
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
# Row labels are kept as-is (not renumbered), matching what append produced.
df = pd.concat([train, test], sort=False)

In [None]:
df.isna().sum()

In [None]:
# Fill missing Competition_Metric values with the column median.
# The result is assigned back instead of calling fillna(inplace=True) on the selected
# column: that chained in-place form is deprecated and leaves df unchanged under
# pandas copy-on-write.
df['Competition_Metric'] = df['Competition_Metric'].fillna(df['Competition_Metric'].median())

In [None]:
df1=pd.get_dummies(df,columns=['Course_Domain','Course_Type'],drop_first=True)

In [None]:
df1.columns


In [None]:
# Day_No and Date are now represented by Day/Month/Year, so remove them in place.
df1.drop(columns=['Day_No', 'Date'], inplace=True)

In [None]:
#splitting train and test from df
# Rows with a Sales value came from train; rows where Sales is missing came from test.
has_sales = df1['Sales'].notnull()
train1 = df1[has_sales]
test1 = df1[~has_sales].drop(columns=['Sales'])
print(train1.shape)
print(test1.shape)

In [None]:
print(train1.columns)
print(test1.columns)

In [None]:
# Feature matrix and target for training, plus the feature matrix to predict on.
X_train = train1.drop(columns='Sales')
# Take the target from train1, the same frame X_train is built from, so features and
# labels always have matching rows (and the cell does not depend on the `train`
# name, which a later cell rebinds).
Y_train = train1['Sales']
X_test = test1

In [None]:
print(X_train.columns)
print(Y_train)
print(X_test.columns)


In [None]:
X_tr, X_val, y_tr, y_val = train_test_split(X_train, Y_train, test_size = 0.2, random_state = 890)

In [None]:
print(X_train.columns)
print(X_test.columns)

In [None]:
# LightGBM datasets for the train/validation split.
lgb_train = lgb.Dataset(X_tr, y_tr)
# A validation Dataset should reference the training Dataset so that both share
# the same feature binning.
lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)

In [None]:
from hyperopt import STATUS_OK 
from hyperopt import hp 
from hyperopt import tpe 
from hyperopt import fmin 
from hyperopt import Trials

N_FOLDS = 5

In [None]:
 from sklearn.metrics import mean_squared_log_error

In [None]:
def rmsle(preds, lgb_train):
    """Custom LightGBM evaluation metric: RMSLE scaled by 1000.

    Args:
        preds: Predicted values handed over by LightGBM.
        lgb_train: Dataset being evaluated; its labels are read via ``get_label()``.

    Returns:
        Tuple ``('rmsle', value, False)``; the trailing ``False`` is LightGBM's
        "is higher better" flag, i.e. lower values are better.
    """
    labels = lgb_train.get_label()
    msle = mean_squared_log_error(preds, labels)
    return ('rmsle', np.sqrt(msle) * 1000, False)

In [None]:
def objective(params, n_folds = N_FOLDS):
    """Hyperopt objective: best cross-validated RMSLE for one LightGBM parameter set.

    Args:
        params: LightGBM parameter dict sampled from the search space.
        n_folds: Number of cross-validation folds. Defaults to ``N_FOLDS``.

    Returns:
        Dict with the minimum mean RMSLE across boosting rounds as ``loss``,
        the evaluated ``params`` and ``status`` set to ``STATUS_OK``.
    """
    # Pass n_folds through to lgb.cv; it was previously ignored in favour of a literal 5.
    cv_results = lgb.cv(params, lgb_train, num_boost_round = 1000, nfold = n_folds, feval = rmsle, early_stopping_rounds = 10, seed = 50)
    best_score = min(cv_results['rmsle-mean'])
    return {'loss': best_score, 'params': params, 'status': STATUS_OK}

In [None]:
# Hyperopt search space for the LightGBM parameters.
# NOTE(review): LightGBM documents `subsample` as an alias of `bagging_fraction`;
# both are sampled here — confirm which one is meant to take effect.
space = {
    # Fixed settings (single-option choices).
    'task': hp.choice('task', ['train']),
    'objective': hp.choice('objective', ['gamma']),
    'metric' : hp.choice('metric', ['None']),
    'boosting': hp.choice('boosting', ['gbdt']),
    # Tuned settings.
    'learning_rate': hp.loguniform('learning_rate',np.log(0.003), np.log(0.5)),
    'num_leaves': hp.choice('num_leaves', range(2, 100, 5)),
    'max_depth': hp.choice('max_depth', range(1, 30, 5)),
    'bagging_fraction': hp.uniform('bagging_fraction', 0.1, 1.0),
    'bagging_freq': hp.choice('bagging_freq', range(1, 10, 1)),
    'feature_fraction': hp.uniform('feature_fraction', 0.1, 1.0),
    'max_bin': hp.choice('max_bin', range(200, 256, 5)),
    'min_data_in_leaf': hp.choice('min_data_in_leaf', range(10, 1000, 1)),
    'subsample': hp.uniform('subsample', 0.1, 1.0),
    'bagging_seed': hp.choice('bagging_seed', range(1, 10, 1)),
    'feature_fraction_seed': hp.choice('feature_fraction_seed', range(1, 10, 1)),
}

In [None]:
# Filled by lgb.train with the per-iteration validation metric.
evals_result = {}

# Number of boosting rounds, passed explicitly to both lgb.cv and lgb.train.
# It used to be given a second time as 'n_estimators' inside `params`, an alias of
# the same setting that makes LightGBM warn about the conflicting argument.
NUM_BOOST_ROUND = 1000

params = {
    'task': 'train',
    'objective': 'gamma',
    'metric' : 'None',  # built-in metrics off; the custom `rmsle` feval is used instead
    'boosting': 'gbdt',
    'learning_rate': 0.03,
    'num_leaves': 100,
    'bagging_fraction': 0.85,
    'bagging_freq': 1,
    'bagging_seed': 1,
    'feature_fraction': 0.9,
    'feature_fraction_seed': 1,
    'max_bin': 256,
}

# `rmsle` is defined once in its own cell earlier in the notebook; the identical
# second definition that used to live in this cell has been removed.

# NOTE(review): the early_stopping_rounds / verbose_eval / evals_result keyword
# arguments were removed in LightGBM 4.0 in favour of callbacks — confirm the
# LightGBM version this notebook targets.
cv_results = lgb.cv(params, lgb_train, num_boost_round = NUM_BOOST_ROUND, nfold = 5, feval = rmsle, early_stopping_rounds = 10, verbose_eval = 100, seed = 50)

lgbm_model = lgb.train(params, train_set = lgb_train, num_boost_round = NUM_BOOST_ROUND, valid_sets = [lgb_val], feval = rmsle, evals_result = evals_result, verbose_eval = 100)

In [None]:
predictions = lgbm_model.predict(X_test)


In [None]:
len(predictions)

In [None]:
sample['Sales'] = predictions

In [None]:
sample # Submission 1

In [None]:
from sklearn.ensemble import RandomForestRegressor

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

In [None]:
def rmsle1000(y_true, y_pred):
    """Root mean squared logarithmic error, scaled by 1000.

    Args:
        y_true: Array-like of actual target values.
        y_pred: Array-like of predicted values, same length as ``y_true``.

    Returns:
        ``1000 * sqrt(mean((log(1 + y_true) - log(1 + y_pred)) ** 2))`` as a float.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # log1p(y) already computes log(1 + y). The previous log1p(y + 1) evaluated
    # log(y + 2), which is not RMSLE and disagreed with the sklearn-based `rmsle`.
    log_diff = np.log1p(y_true) - np.log1p(y_pred)
    return np.sqrt(np.mean(np.square(log_diff))) * 1000

In [None]:
from sklearn.model_selection import GridSearchCV,StratifiedKFold

In [None]:
#xgb
# Sales is a regression target, so use plain KFold: StratifiedKFold stratifies on
# class labels and is intended for classification targets.
from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=True, random_state=123)

X_train = train1.drop(columns='Sales')
# Take the target from the same frame as the features so rows always match.
Y_train = train1['Sales']
X_test = test1

cv_score = []
for fold_no, (train_index, test_index) in enumerate(kf.split(X_train), start=1):
    print('Fold no. = ', fold_no)

    # kf.split yields positional indices, so select rows with .iloc rather than .loc.
    x_train, x_test = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train, y_test = Y_train.iloc[train_index], Y_train.iloc[test_index]

    #model
    xgb = XGBRegressor(n_estimators= 500)
    xgb.fit(x_train, y_train)
    y_pred = xgb.predict(x_test)
    score = rmsle1000(y_test, y_pred)
    print('RMSLE score:', score)
    cv_score.append(score)

In [None]:
xgb.feature_importances_

In [None]:
xgb = XGBRegressor(n_estimators= 500)
xgb.fit(X_train,Y_train)
xgb_preds = xgb.predict(X_test)

In [None]:
# for submission
lgbm = LGBMRegressor(n_estimators= 500)
lgbm.fit(X_train, Y_train)
lgbm_preds = lgbm.predict(X_test)

In [None]:
print(xgb_preds)

In [None]:
print(lgbm_preds)

In [None]:
print(train1.columns)
print(test1.columns)

In [None]:
import h2o
from h2o.automl import H2OAutoML
# Initialise the H2O session before any frames are created.
h2o.init()
# Convert the prepared pandas frames to H2OFrames.
# NOTE(review): these assignments rebind `train` and `test`, which until this cell held
# the pandas DataFrames loaded from CSV; earlier cells that read `train`/`test` will
# receive H2OFrames if re-run after this one. Distinct names would avoid that — the
# next cell reads `train` and `test`, so both cells would need to change together.
train = h2o.H2OFrame(train1)
test = h2o.H2OFrame(test1)


In [None]:
# Target column and feature columns for AutoML.
y = "Sales"
x = list(X_train.columns)
# Cap the AutoML run at 30 models or 300 seconds, with a fixed seed.
aml = H2OAutoML(max_models = 30, max_runtime_secs=300, seed = 1)
aml.train(x = x, y = y, training_frame = train)
lb = aml.leaderboard
# NOTE(review): neither head() call is the cell's last expression, so their results
# are presumably not displayed by the notebook — confirm whether they are needed.
lb.head()
lb.head(rows=lb.nrows)
# Predict on the test frame and convert the H2O predictions via h2o.as_list.
preds = aml.predict(test)
water_preds=h2o.as_list(preds) 

In [None]:
water_preds

In [None]:
sample.rename(columns = {'Sales':'Sales_by_lgbm'},inplace = True)

In [None]:
sample['XG Boost'] = xgb_preds

In [None]:
sample['LightGBM'] = lgbm_preds

In [None]:
sample['H20'] = water_preds

In [None]:
# Predictions of the various models
sample