In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('seaborn-whitegrid')

import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import KFold

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression, ElasticNet

from lightgbm import LGBMRegressor
#!pip install catboost
#from catboost import CatBoostRegressor
from xgboost import XGBRegressor

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import warnings
warnings.simplefilter('ignore')

In [None]:
train = pd.read_csv('./drive/My Drive/TGIH/Train.csv')
test = pd.read_csv('./drive/My Drive/TGIH/Test.csv')
sam_sub = pd.read_csv('./drive/My Drive/TGIH/Sample Submission.csv')

In [None]:
train.shape,test.shape

((284780, 8), (122049, 7))

In [None]:
train.isnull().sum()

InvoiceNo      0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
UnitPrice      0
CustomerID     0
Country        0
dtype: int64

In [None]:
test.isnull().sum()

InvoiceNo      0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
CustomerID     0
Country        0
dtype: int64

In [None]:
train.Quantity.nunique()

373

In [None]:
cat_cols =[]
num_cols=[]
for col in train.columns:
  if train[col].nunique() < 5:
    cat_cols.append(col)
  else:
    num_cols.append(col)

In [None]:
cat_cols

[]

In [None]:
num_cols

['InvoiceNo',
 'StockCode',
 'Description',
 'Quantity',
 'InvoiceDate',
 'UnitPrice',
 'CustomerID',
 'Country']

In [None]:
train.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,6141,1583,144,3,2011-05-06 16:54:00,3.75,14056.0,35
1,6349,1300,3682,6,2011-05-11 07:35:00,1.95,13098.0,35
2,16783,2178,1939,4,2011-11-20 13:20:00,5.95,15044.0,35
3,16971,2115,2983,1,2011-11-22 12:07:00,0.83,15525.0,35
4,6080,1210,2886,12,2011-05-06 09:00:00,1.65,13952.0,35


In [None]:
df = train.append(test)
df.shape

(406829, 8)

In [None]:
df.drop(columns=['CustomerID','InvoiceNo'],axis=1,inplace=True)

In [None]:
df.shape
df['InvoiceDate']= pd.to_datetime(df['InvoiceDate'])
df.info()

(406829, 6)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 406829 entries, 0 to 122048
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   StockCode    406829 non-null  int64         
 1   Description  406829 non-null  int64         
 2   Quantity     406829 non-null  int64         
 3   InvoiceDate  406829 non-null  datetime64[ns]
 4   UnitPrice    284780 non-null  float64       
 5   Country      406829 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(4)
memory usage: 21.7 MB


In [None]:
df.head()

Unnamed: 0,StockCode,Description,Quantity,InvoiceDate,UnitPrice,Country
0,1583,144,3,2011-05-06 16:54:00,3.75,35
1,1300,3682,6,2011-05-11 07:35:00,1.95,35
2,2178,1939,4,2011-11-20 13:20:00,5.95,35
3,2115,2983,1,2011-11-22 12:07:00,0.83,35
4,1210,2886,12,2011-05-06 09:00:00,1.65,35


In [None]:
df['hour'] = df['InvoiceDate'].apply(lambda x: x.hour)
df['minute'] = df['InvoiceDate'].apply(lambda x: x.minute)

df['day'] = df['InvoiceDate'].apply(lambda x: x.day)

df['month'] = df['InvoiceDate'].apply(lambda x: x.month)

df['year'] = df['InvoiceDate'].apply(lambda x: x.year)

df.drop('InvoiceDate',axis=1,inplace=True)
df.head()

Unnamed: 0,StockCode,Description,Quantity,UnitPrice,Country,hour,minute,day,month,year
0,1583,144,3,3.75,35,16,54,6,5,2011
1,1300,3682,6,1.95,35,7,35,11,5,2011
2,2178,1939,4,5.95,35,13,20,20,11,2011
3,2115,2983,1,0.83,35,12,7,22,11,2011
4,1210,2886,12,1.65,35,9,0,6,5,2011


In [None]:
for col in ['StockCode','Description','Quantity','Country','hour','minute','day','month','year']:
  max = df[col].max()
  df[col] = df[col].apply(lambda x : x/max)

In [None]:
df.head()

Unnamed: 0,StockCode,Description,Quantity,UnitPrice,Country,hour,minute,day,month,year
0,0.429813,0.03697,3.7e-05,3.75,0.972222,0.8,0.915254,0.193548,0.416667,1.0
1,0.352973,0.945315,7.4e-05,1.95,0.972222,0.35,0.59322,0.354839,0.416667,1.0
2,0.591366,0.497818,4.9e-05,5.95,0.972222,0.65,0.338983,0.645161,0.916667,1.0
3,0.57426,0.765854,1.2e-05,0.83,0.972222,0.6,0.118644,0.709677,0.916667,1.0
4,0.328537,0.74095,0.000148,1.65,0.972222,0.45,0.0,0.193548,0.416667,1.0


In [None]:
features = [col for col in df.columns if col!='UnitPrice']
len(features)

9

In [None]:
train_set = df.head(len(train))
test_set = df.tail(len(test))

train_set.shape,test_set.shape

((284780, 10), (122049, 10))

In [None]:
test_set.drop('UnitPrice',axis=1,inplace=True)

In [None]:
train_set.shape,test_set.shape

((284780, 10), (122049, 9))

In [None]:
X = train_set.drop('UnitPrice',axis=1)
y = pd.DataFrame(train_set['UnitPrice'])

In [None]:
from sklearn.model_selection import train_test_split

X_train,X_val,y_train,y_val = train_test_split(X,y,test_size=0.1,shuffle=True,random_state=42)


In [None]:
len(X_train),len(y_train)

(256302, 256302)

In [None]:
from sklearn.metrics import mean_squared_error, mean_squared_log_error

def rmsle(y_test, y_pred):
  return np.sqrt(mean_squared_log_error(y_test, y_pred))

def av_metrices(y_test, y_pred):
  return np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
from sklearn.linear_model import LinearRegression

clf = LinearRegression()
_ = clf.fit(X_train, y_train)

pred_val = clf.predict(X_val)

print(f'av_score {av_metrices(y_val, pred_val)}')

av_score 236.92848710153606


In [None]:
final_preds = clf.predict(test_set)
final_preds

array([[3.77767914],
       [1.57874921],
       [1.65341505],
       ...,
       [2.9460287 ],
       [3.19281797],
       [3.68219369]])

In [None]:
sam_sub.head()

Unnamed: 0,UnitPrice
0,100
1,100
2,100
3,100
4,100


In [None]:
sam_sub['UnitPrice'] = final_preds
sam_sub.head()

Unnamed: 0,UnitPrice
0,3.777679
1,1.578749
2,1.653415
3,2.985732
4,5.371108


In [None]:
sam_sub.to_csv('linear_reg.csv',index=False)

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
clf = DecisionTreeRegressor(random_state=42)
_ = clf.fit(X_train, y_train)

pred_val = clf.predict(X_val)

print(f'av_score {av_metrices(y_val, pred_val)}')

av_score 233.3925915341553


In [None]:
test_preds = clf.predict(test_set)
sam_sub['UnitPrice'] = test_preds
sam_sub.to_csv('DCT_Base.csv',index=False)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

hyperparm_com = {
    'max_depth': [4, 8, 10, 12, 16, 20,24,28,32],
    'min_samples_split': [2, 10, 20, 30, 40,50,60,70],
    'max_features': [0.2, 0.4, 0.6, 0.8, 1],
    'max_leaf_nodes': [8, 16, 32, 64, 128,256]
}

clf = RandomizedSearchCV(DecisionTreeRegressor(),
                         hyperparm_com,
                         scoring='neg_root_mean_squared_error',
                         random_state=42,
                         n_iter=50)

search = clf.fit(X_train, y_train)
search.best_params_

KeyboardInterrupt: ignored

In [None]:
best_params = {'max_depth': 32,
 'max_features': 0.4,
 'max_leaf_nodes': 256,
 'min_samples_split': 30}


clf = DecisionTreeRegressor(**best_params)
_ = clf.fit(X_train, y_train)

pred_val = clf.predict(X_val)

print(f'av_score {np.sqrt(av_metrices(y_val, pred_val))}')


av_score 236.35986283208658


In [None]:
test_preds = clf.predict(test_set)
sam_sub['UnitPrice'] = test_preds
sam_sub.to_csv('DCT_Tuned.csv',index=False)

In [None]:
TARGET_COL = 'UnitPrice'
features

['StockCode',
 'Description',
 'Quantity',
 'Country',
 'hour',
 'minute',
 'day',
 'month',
 'year']

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
from sklearn.model_selection import StratifiedKFold

def clf_run(clf, train, test, features):
  N_SPLITS = 5

  oofs = np.zeros(len(train))
  preds = np.zeros(len(test))

  target = train[TARGET_COL]

  folds = StratifiedKFold(n_splits=N_SPLITS)
  stratified_target = pd.qcut(train[TARGET_COL], 10, labels=False, duplicates='drop')

  for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, stratified_target)):
    print(f'\n------------- Fold {fold_ + 1} -------------')

    ############# Get train, validation and test sets along with targets ################
  
    ### Training Set
    X_trn, y_trn = train[features].iloc[trn_idx], target.iloc[trn_idx]

    ### Validation Set
    X_val, y_val = train[features].iloc[val_idx], target.iloc[val_idx]

    ### Test Set
    X_test = test[features]

    scaler = StandardScaler()
    _ = scaler.fit(X_trn)

    X_trn = scaler.transform(X_trn)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)

    

    _ = clf.fit(X_trn, y_trn)

    ### Instead of directly predicting the classes we will obtain the probability of positive class.
    preds_val = clf.predict(X_val)
    preds_test = clf.predict(X_test)

    fold_score = av_metrices(y_val, preds_val)
    print(f'\nAV metric score for validation set is {fold_score}')

    oofs[val_idx] = preds_val
    preds += preds_test / N_SPLITS


  oofs_score = av_metrices(target, oofs)
  print(f'\n\nAV metric for oofs is {oofs_score}')

  return oofs, preds

In [None]:
dt_params = best_params = {'max_depth': 32,
 'max_features': 0.4,
 'max_leaf_nodes': 256,
 'min_samples_split': 30}

clf = DecisionTreeRegressor(**dt_params)

dt_oofs, dt_preds = clf_run(clf, train_set, test_set, features)


------------- Fold 1 -------------

AV metric score for validation set is 36.71977658785585

------------- Fold 2 -------------

AV metric score for validation set is 24.821450868841623

------------- Fold 3 -------------

AV metric score for validation set is 164.1633839698759

------------- Fold 4 -------------

AV metric score for validation set is 41.8811497041692

------------- Fold 5 -------------

AV metric score for validation set is 42.470699642351846


AV metric for oofs is 80.58765920535373


In [None]:
dt_preds

array([2.89570142, 3.64278074, 1.74684386, ..., 2.65031577, 4.58577915,
       3.86138711])

In [None]:
sam_sub['UnitPrice'] = dt_preds
sam_sub.to_csv('DCT_GSCV.csv',index=False)

In [None]:
clf = LGBMRegressor()
lgb_oofs, lgb_preds = clf_run(clf, train_set, test_set, features)


------------- Fold 1 -------------

AV metric score for validation set is 30.106851648730526

------------- Fold 2 -------------

AV metric score for validation set is 28.675940191193277

------------- Fold 3 -------------

AV metric score for validation set is 162.08179959702375

------------- Fold 4 -------------

AV metric score for validation set is 32.37678024676782

------------- Fold 5 -------------

AV metric score for validation set is 42.21894699981584


AV metric for oofs is 78.5238017515429


In [None]:
clf = LGBMRegressor()
_ = clf.fit(X_train, y_train)

pred_val = clf.predict(X_val)

print(f'av_score {av_metrices(y_val, pred_val)}')

av_score 234.97966817878032


In [None]:
test_preds = clf.predict(test_set)
sam_sub['UnitPrice'] = test_preds
sam_sub.to_csv('LGM_Base.csv',index=False)

In [None]:
!pip install catboost
from catboost import CatBoostRegressor

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/52/39/128fff65072c8327371e3c594f3c826d29c85b21cb6485980353b168e0e4/catboost-0.24.2-cp36-none-manylinux1_x86_64.whl (66.1MB)
[K     |████████████████████████████████| 66.2MB 48kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.24.2


In [None]:
model_cat = CatBoostRegressor(od_type='Iter', iterations=25000, task_type='GPU')
model_cat.fit(X_train, y_train.astype(int),
              eval_set=(X_val, y_val.astype(int)),
              early_stopping_rounds=100,
              )

Learning rate set to 0.022123
0:	learn: 24.2622342	test: 236.9204394	best: 236.9204394 (0)	total: 10.9ms	remaining: 4m 31s
1:	learn: 24.2123455	test: 236.8896090	best: 236.8896090 (1)	total: 20.4ms	remaining: 4m 14s
2:	learn: 24.1715200	test: 236.8776931	best: 236.8776931 (2)	total: 29.4ms	remaining: 4m 5s
3:	learn: 24.1322888	test: 236.8660518	best: 236.8660518 (3)	total: 38.6ms	remaining: 4m 1s
4:	learn: 24.0542133	test: 236.8332596	best: 236.8332596 (4)	total: 47.6ms	remaining: 3m 57s
5:	learn: 24.0163956	test: 236.8012410	best: 236.8012410 (5)	total: 56.6ms	remaining: 3m 55s
6:	learn: 23.9751167	test: 236.7752452	best: 236.7752452 (6)	total: 66ms	remaining: 3m 55s
7:	learn: 23.9403034	test: 236.7474430	best: 236.7474430 (7)	total: 75.2ms	remaining: 3m 54s
8:	learn: 23.9061092	test: 236.7173591	best: 236.7173591 (8)	total: 84.4ms	remaining: 3m 54s
9:	learn: 23.8853598	test: 236.7069726	best: 236.7069726 (9)	total: 93.5ms	remaining: 3m 53s
10:	learn: 23.8518380	test: 236.6970225	best

<catboost.core.CatBoostRegressor at 0x7fdb867e2e10>

In [None]:
importance = pd.DataFrame(data=model_cat.feature_importances_, index=X_train.columns, columns=['imp']).sort_values(by='imp',ascending=False)
imp_feat = importance[importance['imp'] > 0.05].index
imp_feat

Index(['StockCode', 'Country', 'Quantity', 'minute', 'day', 'Description',
       'month', 'hour', 'year'],
      dtype='object')

In [None]:
model_cat = CatBoostRegressor(od_type='Iter', iterations=25000, task_type='GPU')
model_cat.fit(X_train[imp_feat], y_train.astype(int),
              eval_set=(X_val[imp_feat], y_val.astype(int)),
              early_stopping_rounds=1000,
              )

Learning rate set to 0.022123
0:	learn: 24.2622342	test: 236.9204394	best: 236.9204394 (0)	total: 10.3ms	remaining: 4m 16s
1:	learn: 24.2158839	test: 236.8921040	best: 236.8921040 (1)	total: 19.4ms	remaining: 4m 2s
2:	learn: 24.1747739	test: 236.8800175	best: 236.8800175 (2)	total: 28.6ms	remaining: 3m 58s
3:	learn: 24.1339948	test: 236.8628070	best: 236.8628070 (3)	total: 37.6ms	remaining: 3m 55s
4:	learn: 24.0951792	test: 236.8513737	best: 236.8513737 (4)	total: 46.8ms	remaining: 3m 53s
5:	learn: 24.0573806	test: 236.8191013	best: 236.8191013 (5)	total: 56ms	remaining: 3m 53s
6:	learn: 24.0061690	test: 236.8015826	best: 236.8015826 (6)	total: 65.2ms	remaining: 3m 52s
7:	learn: 23.9629591	test: 236.7785862	best: 236.7785862 (7)	total: 74.2ms	remaining: 3m 51s
8:	learn: 23.9432745	test: 236.7718378	best: 236.7718378 (8)	total: 83.3ms	remaining: 3m 51s
9:	learn: 23.9089945	test: 236.7615106	best: 236.7615106 (9)	total: 92.3ms	remaining: 3m 50s
10:	learn: 23.8906099	test: 236.7550844	bes

<catboost.core.CatBoostRegressor at 0x7fdb732f54e0>

In [None]:
prediction = model_cat.predict(test_set)

In [None]:
prediction

array([2.3907287 , 2.92558009, 2.14348898, ..., 2.5537197 , 2.61082718,
       2.50859096])

In [None]:
sam_sub['UnitPrice'] =prediction
sam_sub.to_csv('Catboost_base.csv',index=False)

In [None]:
clf = XGBRegressor(n_estimators = 1000,
                    max_depth = 6,
                    learning_rate = 0.05,
                    colsample_bytree = 0.5,
                    random_state=42)

_ = clf.fit(X_train, y_train)

pred_val = clf.predict(X_val)

print(f'av_score {av_metrices(y_val, pred_val)}')
test_preds = clf.predict(test_set)
sam_sub['UnitPrice'] = test_preds
sam_sub.to_csv('XGB_base.csv',index=False)

av_score 229.02328443582076


In [None]:
clf = RandomForestRegressor(random_state=42)
_ = clf.fit(X_train, y_train)

pred_val = clf.predict(X_val)

print(f'av_score {av_metrices(y_val, pred_val)}')
test_preds = clf.predict(test_set)
sam_sub['UnitPrice'] = test_preds
sam_sub.to_csv('RFR_base.csv',index=False)

av_score 233.77634446712833


In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
from sklearn.model_selection import RandomizedSearchCV

hyperparm_com = {
    'max_depth': [4, 8, 10, 12, 16, 20],
    'min_samples_split': [2, 10, 20, 30,50,60,70],
    'max_features': [0.2, 0.4, 0.6, 0.8, 1],
    'max_leaf_nodes': [8, 16, 32, 64, 128]
}

clf = RandomizedSearchCV(RandomForestRegressor(),
                         hyperparm_com,
                         scoring='neg_root_mean_squared_error',
                         random_state=42,
                         n_iter=20)

search = clf.fit(X_train, y_train)
search.best_params_

{'max_depth': 10,
 'max_features': 0.8,
 'max_leaf_nodes': 128,
 'min_samples_split': 2}

In [None]:
best_params = {'max_depth': 10,
 'max_features': 0.8,
 'max_leaf_nodes': 128,
 'min_samples_split': 2}

clf = RandomForestRegressor(**best_params)
_ = clf.fit(X_train, y_train)

pred_val = clf.predict(X_val)

print(f'av_score {np.sqrt(av_metrices(y_val, pred_val))}')
test_preds = clf.predict(test_set)
sam_sub['UnitPrice'] = test_preds
sam_sub.to_csv('RFR_tuned.csv',index=False)

av_score 15.302663343120889
