In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Data-handling, filesystem and plotting imports for the notebook.

import pandas as pd
import numpy as np 
import os
import datetime

import matplotlib.pyplot as plt

import seaborn as sns 
# Use seaborn's 'whitegrid' style for the plots that follow.
sns.set_style('whitegrid')

import matplotlib

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

### Loading data for Analysis

In [3]:
# Resolve every input/output file relative to the current working directory:
# <cwd>/data/{Train.csv, Test.csv, Final_submission.csv}
root_dir = os.path.abspath(os.curdir)
data_dir = os.path.join(root_dir, 'data')
train, test, submt_fil = (
    os.path.join(data_dir, file_name)
    for file_name in ('Train.csv', 'Test.csv', 'Final_submission.csv')
)

In [4]:
# Read the training CSV into a DataFrame and preview its first five rows.
train_df = pd.read_csv(filepath_or_buffer=train)
train_df.head(n=5)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,6141,1583,144,3,2011-05-06 16:54:00,3.75,14056.0,35
1,6349,1300,3682,6,2011-05-11 07:35:00,1.95,13098.0,35
2,16783,2178,1939,4,2011-11-20 13:20:00,5.95,15044.0,35
3,16971,2115,2983,1,2011-11-22 12:07:00,0.83,15525.0,35
4,6080,1210,2886,12,2011-05-06 09:00:00,1.65,13952.0,35


In [5]:
# Summarise the training frame: column names, non-null counts, dtypes and memory use.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284780 entries, 0 to 284779
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    284780 non-null  int64  
 1   StockCode    284780 non-null  int64  
 2   Description  284780 non-null  int64  
 3   Quantity     284780 non-null  int64  
 4   InvoiceDate  284780 non-null  object 
 5   UnitPrice    284780 non-null  float64
 6   CustomerID   284780 non-null  float64
 7   Country      284780 non-null  int64  
dtypes: float64(2), int64(5), object(1)
memory usage: 17.4+ MB


In [6]:
# (rows, columns) of the training frame.
train_df.shape

(284780, 8)

In [7]:
# Count missing values in each column of the training frame.
train_df.isna().sum()

InvoiceNo      0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
UnitPrice      0
CustomerID     0
Country        0
dtype: int64

In [8]:
# Read the test CSV into a DataFrame and preview its first five rows.
test_df = pd.read_csv(filepath_or_buffer=test)
test_df.head(n=5)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,CustomerID,Country
0,3139,1709,1079,3,2011-02-22 15:22:00,16161.0,35
1,617,510,3457,1,2010-12-08 12:46:00,17341.0,35
2,14653,604,694,36,2011-10-25 13:53:00,15158.0,35
3,8634,1478,3473,2,2011-06-27 12:38:00,16033.0,35
4,15546,3216,871,1,2011-11-06 16:14:00,15351.0,35


In [9]:
# (rows, columns) of the test frame.
test_df.shape

(122049, 7)

In [10]:
# Summarise the test frame: column names, non-null counts, dtypes and memory use.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122049 entries, 0 to 122048
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    122049 non-null  int64  
 1   StockCode    122049 non-null  int64  
 2   Description  122049 non-null  int64  
 3   Quantity     122049 non-null  int64  
 4   InvoiceDate  122049 non-null  object 
 5   CustomerID   122049 non-null  float64
 6   Country      122049 non-null  int64  
dtypes: float64(1), int64(5), object(1)
memory usage: 6.5+ MB


In [11]:
# Count missing values in each column of the test frame.
test_df.isna().sum()

InvoiceNo      0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
CustomerID     0
Country        0
dtype: int64

In [12]:
# Convert InvoiceDate from string to datetime64 in both frames.
# errors='coerce' turns any unparseable value into NaT instead of raising, so
# the number of NaT values is reported right after the conversion.
train_df['InvoiceDate'] = pd.to_datetime(train_df['InvoiceDate'], errors='coerce')
test_df['InvoiceDate'] = pd.to_datetime(test_df['InvoiceDate'], errors='coerce')

print('InvoiceDate values coerced to NaT -> train:', train_df['InvoiceDate'].isna().sum(),
      '| test:', test_df['InvoiceDate'].isna().sum())

# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() appended a stray "None" line.
train_df.info()
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284780 entries, 0 to 284779
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    284780 non-null  int64         
 1   StockCode    284780 non-null  int64         
 2   Description  284780 non-null  int64         
 3   Quantity     284780 non-null  int64         
 4   InvoiceDate  284780 non-null  datetime64[ns]
 5   UnitPrice    284780 non-null  float64       
 6   CustomerID   284780 non-null  float64       
 7   Country      284780 non-null  int64         
dtypes: datetime64[ns](1), float64(2), int64(5)
memory usage: 17.4 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122049 entries, 0 to 122048
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    122049 non-null  int64         
 1   StockCode    122049 non-null  int64         
 2   

### EDA and Feature Engineering on Training Dataset

In [13]:
# Month number -> zero-padded two-character label (1 -> '01', ..., 12 -> '12').
monthMap = {month: f'{month:02d}' for month in range(1, 13)}

# Calendar parts of the invoice timestamp (InvoiceDate must already be datetime64).
train_df['Year'] = train_df['InvoiceDate'].dt.year
train_df['Month'] = train_df['InvoiceDate'].dt.month

# YYYYMM as an integer (e.g. May 2011 -> 201105), computed arithmetically
# instead of concatenating strings and converting them back row by row.
train_df['YearMonth'] = (train_df['Year'] * 100 + train_df['Month']).astype('int64')

In [14]:
# Preview the training frame with the new Year / Month / YearMonth columns.
train_df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Year,Month,YearMonth
0,6141,1583,144,3,2011-05-06 16:54:00,3.75,14056.0,35,2011,5,201105
1,6349,1300,3682,6,2011-05-11 07:35:00,1.95,13098.0,35,2011,5,201105
2,16783,2178,1939,4,2011-11-20 13:20:00,5.95,15044.0,35,2011,11,201111
3,16971,2115,2983,1,2011-11-22 12:07:00,0.83,15525.0,35,2011,11,201111
4,6080,1210,2886,12,2011-05-06 09:00:00,1.65,13952.0,35,2011,5,201105


In [15]:
# Features: drop the identifier/date columns and the target itself.
# Leaving 'UnitPrice' in X leaks the answer into the model, and the test file
# has no 'UnitPrice' column, so such a model could not be applied to it.
X = train_df.drop(columns=['InvoiceNo', 'InvoiceDate', 'CustomerID', 'Year', 'YearMonth', 'UnitPrice'])

# Target: the unit price to predict.
y = train_df['UnitPrice']

In [16]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [19]:
# Hold out 30% of the rows for evaluation and fit on the remaining 70%.
# (`train_size=.3` did the opposite: only 30% of the rows were used for fitting.)
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [20]:
# Hyper-parameter grid searched for the random-forest regressor.
param_grid = {
    "n_estimators": [1, 5, 10, 20, 30],
    # 1.0 = consider all features at each split; this replaces the "auto"
    # option, which meant the same thing for regressors and has been
    # deprecated and then removed from scikit-learn.
    "max_features": [1.0, "sqrt", "log2"],
    # scikit-learn requires an int >= 2 (or a float fraction); the value 1 is
    # rejected, so every candidate using it would fail to fit.
    "min_samples_split": [2, 4, 8, 16],
    "bootstrap": [True, False],
}

In [None]:
# Exhaustive search over param_grid using all CPU cores (n_jobs=-1).
# - random_state pins the forest's random sampling so scores are reproducible
#   from run to run.
# - cv=5: the previous 30-fold setting required 30 fits for every parameter
#   combination; 5-fold keeps the search tractable.
grid = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, n_jobs=-1, cv=5)
grid.fit(X_train, y_train)

In [None]:
# Mean cross-validated score of the best parameter combination
# (no `scoring` is passed to GridSearchCV, so the regressor's default score, R^2, is used).
print(grid.best_score_)  

In [None]:
# Parameter combination that achieved the best cross-validated score.
print(grid.best_params_)