#### A basic, beginner-friendly kernel demonstrating the use of XGB and LGBM regressors for solving a regression problem. It includes data exploration, preprocessing steps, basic feature engineering and modelling steps with the stratified K-fold validation technique.

* Problem Name: LearnX Sales Forecasting
* Problem type: Regression
* Evaluation Metric: (RMSLE *1000)


* Models Used: XGB and LGBM Regressor (without hyperparameter tuning)
* Validation Technique Used: StratifiedKFold

In [None]:
#importing lib
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from datetime import date, timedelta

import matplotlib.pyplot as plt 
import seaborn as sns

from scipy import stats
from scipy.stats import norm, skew #for some statistics

from sklearn.model_selection import GridSearchCV,StratifiedKFold

In [None]:
# Load the competition's train, test and sample-submission CSV files
train = pd.read_csv('/kaggle/input/womenintheloop-data-science-hackathon/train.csv')
test = pd.read_csv('/kaggle/input/womenintheloop-data-science-hackathon/test_QkPvNLx.csv')
sample = pd.read_csv('/kaggle/input/womenintheloop-data-science-hackathon/sample_submission_pn2DrMq.csv')

# Report the (rows, columns) shape of each frame, in load order
for frame in (train, test, sample):
    print(frame.shape)

#### Data Exploration

In [None]:
# Column index of train, a separator line, then the column index of test
print(train.columns, "------------", test.columns, sep="\n")

In [None]:
# Preview the first rows of the training frame
train.head()

In [None]:
# Dtype of every training column (object dtype marks the categorical ones)
train.dtypes

Two categorical variables- Course_Domain and Course_Type

In [None]:
# Partition the training columns by dtype: "object" columns are treated as
# categorical, every other column as numerical.
is_object_col = train.dtypes == "object"
numerical_feats = train.dtypes[~is_object_col].index
categorical_feats = train.dtypes[is_object_col].index

print("Number of Numerical features: ", len(numerical_feats))
print("Number of Categorical features: ", len(categorical_feats))

In [None]:
# Numerical column names, a 100-character rule, then the categorical column names
numerical_columns = train[numerical_feats].columns
categorical_columns = train[categorical_feats].columns
print(numerical_columns, "*"*100, categorical_columns, sep="\n")

In [None]:
# One report line per numerical column: padded name, skewness, kurtosis
for col in numerical_feats:
    name_part = '{:15}'.format(col)
    skew_part = 'Skewness: {:05.2f}'.format(train[col].skew())
    kurt_part = 'Kurtosis: {:06.2f}'.format(train[col].kurt())
    print(name_part, skew_part, '   ', kurt_part)

In [None]:
# Data prep: stack train on top of test so both go through the same preprocessing.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is
# its replacement. ignore_index=True gives the combined frame a unique 0..n-1
# index instead of repeating each source frame's own row labels, while the
# train rows (stacked first) keep labels 0..len(train)-1.
df = pd.concat([train, test], ignore_index=True)
df.shape

#### Data Preprocessing Steps 

In [None]:
# Null check: count the missing values in every column of the combined frame
df.isnull().sum()

* `Competition_Metric` has some null values.
* They are not imputed, because tree-based models (such as XGB) can handle null values on their own.

In [None]:
# Median imputation is intentionally left disabled (nulls are not imputed in this notebook); kept for reference:
#df['Competition_Metric'].fillna(df['Competition_Metric'].median(), inplace = True)

In [None]:
# Feature engineering: bucket Competition_Metric into five labelled bins
cm_labels = ['very_low', 'low', 'medium', 'high', 'very_high']
df['CM_binned'] = pd.cut(df['Competition_Metric'], bins=len(cm_labels), labels=cm_labels)
df['CM_binned']

In [None]:
# Number of rows that fall into each Competition_Metric bin
df.CM_binned.value_counts()

In [None]:
# Converting Day_No to corresponding date and then date to day, month and year
# Adapted from https://www.kaggle.com/yacotaco/learnx-sales-forecasting

def day_to_date(dataset):
    """Add a 'Date' column computed from 'Day_No'; mutates `dataset` in place.

    Each Day_No value is taken as a number of days after 2016-12-31, so
    Day_No == 1 becomes 2017-01-01. Nothing is returned.
    """
    origin = date(2016,12,31)

    def offset_from_origin(day_no):
        # Shift the fixed origin forward by `day_no` days
        return origin + timedelta(days=day_no)

    dataset['Date'] = dataset['Day_No'].apply(offset_from_origin)

def day_month_year(dataset):
    """Add 'Day', 'Month' and 'Year' columns read from 'Date'; mutates `dataset` in place.

    Each new column holds the matching attribute (.day, .month, .year) of the
    value in 'Date'. Nothing is returned.
    """
    dates = dataset['Date']
    for column, attribute in (('Day', 'day'), ('Month', 'month'), ('Year', 'year')):
        # Bind `attribute` as a default so each lambda reads its own field
        dataset[column] = dates.apply(lambda d, attribute=attribute: getattr(d, attribute))

In [None]:
# Derive Date from Day_No, then Day/Month/Year from Date (both helpers add columns to df in place)
day_to_date(df)
day_month_year(df)
df.head()

In [None]:
# Number of distinct ID values in the training frame
train.ID.nunique()

* ID is unique for each entry, so it can be dropped safely.

In [None]:
# Row count per Course_Domain value in the combined train + test frame
df.Course_Domain.value_counts()

In [None]:
# Row count per Course_Type value in the combined train + test frame
df.Course_Type.value_counts()

In [None]:
# Dtypes of the combined frame, including the engineered columns
df.dtypes

In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each column from the generated indicator columns.
cat_cols = ['Course_Domain', 'Course_Type', 'CM_binned']
df = pd.get_dummies(data=df, columns=cat_cols, drop_first=True)

In [None]:
# Inspect the frame after one-hot encoding
df.head()

In [None]:
# Correlation heatmap of the numeric and boolean columns.
# df also carries the object-dtype 'Date' column. pandas < 2.0 silently dropped
# such columns inside DataFrame.corr(); pandas >= 2.0 raises on them instead,
# so the numeric/bool columns are selected explicitly to behave the same on both.
fig, ax = plt.subplots(figsize=(16,16))
corr_matrix = df.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(corr_matrix, annot=True, square= True, ax=ax)

In [None]:
# List every available column before choosing the model features
df.columns

In [None]:
# Model input columns, assembled group by group. The resulting order matters:
# it is the column order fed to the models and the index of the importance plots.
raw_cols = ['Competition_Metric', 'Course_ID', 'Long_Promotion',
            'Public_Holiday', 'Short_Promotion']
calendar_cols = ['Day', 'Month', 'Year']
course_domain_cols = ['Course_Domain_Development',
                      'Course_Domain_Finance & Accounting',
                      'Course_Domain_Software Marketing']
course_type_cols = ['Course_Type_Degree', 'Course_Type_Program']
cm_bin_cols = ['CM_binned_low', 'CM_binned_medium',
               'CM_binned_high', 'CM_binned_very_high']

features = raw_cols + calendar_cols + course_domain_cols + course_type_cols + cm_bin_cols

In [None]:
# Undo the stacking: rows that have a Sales value are train, rows without one
# are test (whose empty Sales column is dropped).
has_sales = df['Sales'].notnull()
train = df[has_sales]
test = df[~has_sales].drop(columns=['Sales'])
print(train.shape, test.shape, sep='\n')

In [None]:
# Sanity check: how many columns are fed to the models
print('len of features= ',len(features))

#### Modelling

In [None]:
from sklearn.ensemble import RandomForestRegressor

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

In [None]:
#custom metric for hackathon
def rmsle1000(y_true, y_pred):
    """Return the root mean squared logarithmic error, scaled by 1000.

    RMSLE = sqrt(mean((log(1 + y_true) - log(1 + y_pred)) ** 2))

    np.log1p(x) already evaluates log(1 + x), so the values are passed to it
    unchanged; adding another 1 first would compute log(2 + x) and understate
    the error.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, same length as `y_true`.

    Returns
    -------
    float
        RMSLE multiplied by 1000. The result is NaN/inf if any value is
        <= -1, where log(1 + x) is undefined.
    """
    # Work on plain float arrays so lists, Series and ndarrays behave alike
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    log_diff = np.log1p(y_true) - np.log1p(y_pred)
    return np.sqrt(np.mean(np.square(log_diff))) * 1000

In [None]:
#xgb: 5-fold cross-validation, collecting one RMSLE*1000 score per fold
# NOTE(review): StratifiedKFold stratifies on y as if it were class labels;
# confirm this is intended for the Sales regression target.
kf = StratifiedKFold(n_splits=5,shuffle=True,random_state=123)

X= train[features]
y= train.Sales
cv_score =[]
for i, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    print('Fold no. = ', i)
    
    # kf.split yields positional row numbers, so rows are picked with .iloc;
    # label-based .loc is only correct when the index happens to be 0..n-1.
    x_train, x_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    #model: a fresh regressor per fold, scored on the held-out rows
    xgb = XGBRegressor(n_estimators= 500)
    xgb.fit(x_train, y_train)
    y_pred= xgb.predict(x_test)
    score = rmsle1000(y_test, y_pred)
    print('RMSLE score:',score)
    cv_score.append(score)

In [None]:
# Mean of the per-fold scores collected in cv_score by the XGB cross-validation loop
np.mean(cv_score)

In [None]:
# XGB feature importances as a horizontal bar chart.
# When the notebook runs top to bottom, `xgb` here is the model from the last CV fold.
feat_importances = pd.Series(data=xgb.feature_importances_, index=features)
feat_importances.plot.barh()

In [None]:
#lgbm: same cross-validation as for XGB, reusing kf, X and y defined there
cv_score =[]
for i, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    print('Fold no. = ', i)
    
    # kf.split yields positional row numbers, so rows are picked with .iloc;
    # label-based .loc is only correct when the index happens to be 0..n-1.
    x_train, x_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    #model: a fresh regressor per fold, scored on the held-out rows
    lgbm = LGBMRegressor(n_estimators= 500 )
    lgbm.fit(x_train, y_train)
    y_pred= lgbm.predict(x_test)
    score = rmsle1000(y_test, y_pred)
    print('RMSLE score:',score)
    cv_score.append(score)

In [None]:
# Mean of the per-fold scores collected in cv_score by the LGBM cross-validation loop
np.mean(cv_score)

In [None]:
# LGBM feature importances as a horizontal bar chart.
# When the notebook runs top to bottom, `lgbm` here is the model from the last CV fold.
feat_importances = pd.Series(data=lgbm.feature_importances_, index=features)
feat_importances.plot.barh()

In [None]:
# Submission model (XGB): refit on the full training set, then predict the test set
X_full, y_full = train[features], train.Sales
xgb = XGBRegressor(n_estimators= 500)
xgb.fit(X_full, y_full)
xgb_preds = xgb.predict(test[features])

In [None]:
# Submission model (LGBM): refit on the full training set, then predict the test set
X_full, y_full = train[features], train.Sales
lgbm = LGBMRegressor(n_estimators= 500)
lgbm.fit(X_full, y_full)
lgbm_preds = lgbm.predict(test[features])

In [None]:
# Pair each test ID with its XGB prediction and write the submission file
submission = test[['ID']].assign(Sales=xgb_preds)
submission.to_csv('xgb.csv', index=False)
submission.head()

In [None]:
# Pair each test ID with its LGBM prediction and write the submission file
submission2 = test[['ID']].assign(Sales=lgbm_preds)
submission2.to_csv('lgbm.csv', index=False)
submission2.head()