In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import bokeh as bk
%matplotlib inline

from pylab import rcParams

# Default figure size and font sizes applied to every plot drawn below.
rcParams.update({
    'figure.figsize': (12, 5),
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'axes.titlesize': 14,
})

import warnings

# Suppress DeprecationWarning and FutureWarning messages for the whole session.
for ignored_category in (DeprecationWarning, FutureWarning):
    warnings.filterwarnings(action='ignore', category=ignored_category)

# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

In [None]:
#Load train and test data
# NOTE(review): parse_dates=True (a bare boolean) only asks pandas to try parsing
# the index; no index_col is given here, so the 'date' column presumably stays a
# plain string column. If real datetimes are wanted, name the column explicitly
# (parse_dates=['date']) and confirm the day/month order of the file.
sales = pd.read_csv('../input/competitive-data-science-predict-future-sales/sales_train.csv',parse_dates=True)
test = pd.read_csv('../input/competitive-data-science-predict-future-sales/test.csv')

#Loading other files
# Lookup tables (categories, items, shops) read from the translated dataset folder.
item_categories = pd.read_csv('../input/predict-future-sales-eng-translation/categories.csv')
items = pd.read_csv('../input/predict-future-sales-eng-translation/items.csv')
shops = pd.read_csv('../input/predict-future-sales-eng-translation/shops.csv')

In [None]:
# First rows of the sales table.
sales.head()

In [None]:
# Column dtypes, non-null counts and memory usage.
sales.info()

In [None]:
# Summary statistics for the price and count columns.
sales[['item_price','item_cnt_day']].describe()

In [None]:
#Check for Null values
# Number of missing values per column.
sales.isnull().sum()

In [None]:
#Check for Duplicates
# Number of rows that exactly repeat an earlier row (first occurrences are not counted).
len(sales[sales.duplicated()==True])

In [None]:
# Keep the first copy of every fully duplicated row and discard the repeats.
# The index is not reset, so the row labels have gaps after this step.
sales = sales.drop_duplicates(keep='first')

In [None]:
#View test data
test.head()

In [None]:
# First rows of the shops lookup table.
shops.head()

In [None]:
# First rows of the item-categories lookup table.
item_categories.head()

In [None]:
# First rows of the items lookup table.
items.head()

In [None]:
# Number of distinct shop names.
shops['shop_name'].nunique()

In [None]:
# Number of distinct category ids.
item_categories['category_id'].nunique()

In [None]:
# Number of distinct item ids in the items table.
items['item_id'].nunique()

In [None]:
# Number of sales rows per shop_id, largest first.
sales['shop_id'].value_counts().sort_values(ascending = False)

In [None]:
# Number of sales rows per item_id, largest first.
sales['item_id'].value_counts().sort_values(ascending = False)

In [None]:
#EDA
#Histogram of Item price
sns.distplot(sales['item_price'])
plt.title('Histogram of the Item Price')
plt.show()

In [None]:
sns.distplot(sales['item_price'])
plt.xscale('log')
plt.title('Histogram of Item Price (log scale)')
plt.show()

In [None]:
sns.boxplot(sales['item_price'],orient='h')
plt.xscale('log')
plt.title('Box plot of the Item Price (log scale)')
plt.grid()
plt.show()

In [None]:
# Median of the item_price column.
print(f"Median item price : {np.median(sales.item_price)}")

In [None]:
#Interquartile range
# np.percentile returns values in the order requested: [75, 25] -> (q3, q1).
# q1 and q3 are reused by the outlier-removal cell further down.
q3,q1 = np.percentile(sales['item_price'],[75,25])
print(f"q1 : {q1}")
print(f"q3 : {q3}")
print(f"Interquartile Range (IQR) : {q3 - q1}")

In [None]:
# Histogram restricted to rows with 249 <= item_price < 1000.
in_price_band = sales['item_price'].ge(249) & sales['item_price'].lt(1000)
df1 = sales.loc[in_price_band]
band_ax = sns.distplot(df1['item_price'])
band_ax.set_title('Histogram of Item Price')
plt.show()

In [None]:
# Rows whose item_price is negative.
sales[sales['item_price']<0]

In [None]:
# First rows of the test table.
test.head()


In [None]:
# Row count and dtypes of sales before the outlier filter in the next cell.
sales.info()

In [None]:
# Remove high-price outliers: rows at or above the upper fence q3 + 1.5 * IQR.
# q1 and q3 come from the interquartile-range cell above.
IQR = q3 - q1
upper_fence = q3 + 1.5 * IQR

# np.where yields *positional* offsets, while DataFrame.drop expects index
# *labels*. drop_duplicates left gaps in the labels, so positions and labels no
# longer coincide; translate positions to labels before dropping.
upper = np.where(sales['item_price'] >= upper_fence)
outlier_labels = sales.index[upper[0]]

# No lower fence is applied: only the high-price side is trimmed.
sales.drop(outlier_labels, inplace=True)
sales.info()

In [None]:
test['date_block_num'] = 34
test.head()

In [None]:
data_concat = pd.concat([sales,test])
data_concat.head()

In [None]:
data_concat.drop(['ID','date'],axis=1,inplace=True)
data_concat.head()

In [None]:
# Total item_cnt_day per (date_block_num, shop_id, item_id).
# Uses the vectorised GroupBy.sum rather than apply(sum), which would call
# Python's builtin sum once per group. min_count=1 keeps the result NaN for a
# group with no observed count at all (e.g. the appended test rows, if they carry
# no item_cnt_day — TODO confirm), which is what the builtin sum produced too.
data = (
    data_concat
    .groupby(['date_block_num', 'shop_id', 'item_id'], as_index=False)['item_cnt_day']
    .sum(min_count=1)
)
data.head()

In [None]:
# Lag columns: the item_cnt_day value found 1 or 2 rows earlier within the same
# shop (or the same item), following the frame's current row order.
# NOTE(review): shift() moves by rows inside each group, not by date_block_num,
# so the "previous" row can belong to the same block and a different item/shop —
# confirm this is the intended definition of a lag.
lag_specs = [
    ('shop_lag_1', 'shop_id', 1),
    ('shop_lag_2', 'shop_id', 2),
    ('item_lag_1', 'item_id', 1),
    ('item_lag_2', 'item_id', 2),
]
for lag_col, group_col, offset in lag_specs:
    data[lag_col] = data.groupby(group_col)['item_cnt_day'].shift(offset)

In [None]:
# For shop_id 2: the 20 item_ids with the most rows in the aggregated frame.
data[data['shop_id']==2]['item_id'].value_counts().sort_values(ascending=False)[:20]

In [None]:
# Per-shop and per-item median/mean of the aggregated count, broadcast back onto
# every row. transform() returns a result aligned with data's own row index.
# A plain .median()/.mean() is indexed by shop_id/item_id instead, so assigning
# it to a column would align on row labels and produce unrelated values or NaN.
# NOTE(review): the statistics use every row of data, including rows that are
# later placed in the validation split — consider computing them on training
# rows only to avoid leaking the target.
data['shop_median'] = data.groupby('shop_id')['item_cnt_day'].transform('median')
data['shop_mean'] = data.groupby('shop_id')['item_cnt_day'].transform('mean')

data['item_median'] = data.groupby('item_id')['item_cnt_day'].transform('median')
data['item_mean'] = data.groupby('item_id')['item_cnt_day'].transform('mean')

data.head()

In [None]:
data.describe().transpose()

In [None]:
data.isna().sum()

In [None]:
data.fillna(0.0,inplace=True)
data.isna().sum()

In [None]:
# Feature rows for block 34 (the block number stamped on the test frame).
# data is ordered by the groupby keys, whereas test_submission() pairs the
# predictions with test['ID'] purely by position. Fetch the block-34 rows through
# a left merge from test so they come back in test's own row order.
key_cols = ['date_block_num', 'shop_id', 'item_id']
test_data = test[key_cols].merge(data, on=key_cols, how='left')
assert len(test_data) == len(test), "each test row must map to exactly one feature row"

# Everything that is not block 34 is available for training and validation.
data_new = data[data['date_block_num'] != 34]

# Positional 80/20 hold-out: data is sorted by date_block_num first, so the
# leading 80% of rows are the earliest blocks and the tail is used for validation.
split_ratio = 0.80
n_train = int(split_ratio * len(data_new))
train_data = data_new[:n_train]
valid_data = data_new[n_train:]

train_data.shape, test_data.shape, valid_data.shape

In [None]:
# Separate the target column from the feature columns for each split.
target_col = 'item_cnt_day'

X_train, y_train = train_data.drop(columns=[target_col]), train_data[target_col]

X_valid, y_valid = valid_data.drop(columns=[target_col]), valid_data[target_col]

X_test, y_test = test_data.drop(columns=[target_col]), test_data[target_col]

In [None]:
from sklearn.preprocessing import (StandardScaler,
                                   MinMaxScaler,
                                   PowerTransformer,PolynomialFeatures)
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
#evaluation metrics
from sklearn.metrics import mean_squared_error

In [None]:
def test_submission(data,model):
    """Predict with ``model`` on ``data`` and write the result to 'submission.csv'.

    The predictions are attached, by position, to the 'ID' column of the
    notebook-level ``test`` frame and saved under the column name
    'item_cnt_month' without the index. ``data`` must therefore hold one row per
    row of ``test``, in the same order. Returns None.
    """
    predictions = model.predict(data)
    test[['ID']].assign(item_cnt_month=predictions).to_csv('submission.csv', index=False)
    

In [None]:
# Standardise every feature, then apply PCA with n_components=0.90.
preprocess = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('decompose', PCA(n_components=0.90)),
])

#('poly_features',PolynomialFeatures(degree=2)),


# Fit on the training split only, then reuse the fitted steps for the other splits.
# Note: the X_* names are overwritten with the transformed arrays, so running
# this cell a second time would transform already-transformed data.
X_train = preprocess.fit_transform(X_train)
X_valid, X_test = (preprocess.transform(split) for split in (X_valid, X_test))

In [None]:
# Candidate regressors; random_state is fixed where given so reruns are repeatable.
lr = LinearRegression()
rf = RandomForestRegressor(max_depth=3, random_state=42)
gb = GradientBoostingRegressor(random_state=42)


In [None]:
def modeling(model,X_train=X_train,y_train=y_train,X_valid=X_valid,y_valid=y_valid,X_test=X_test,y_test=y_test):
    """Fit ``model`` on the training split and print train/validation RMSE.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``; fitted in place.
    X_train, y_train : features and target used for fitting.
    X_valid, y_valid : hold-out features and target used for scoring.
    X_test, y_test : accepted so existing calls keep working; not used here.

    The defaults are bound to the notebook variables when this cell runs, so
    re-run this cell after rebuilding the splits. Returns None.
    """
    model.fit(X_train, y_train)

    # No prediction is made on X_test: it was never scored or returned, so the
    # extra predict call was wasted work.
    train_rmse = np.sqrt(mean_squared_error(y_train, model.predict(X_train)))
    valid_rmse = np.sqrt(mean_squared_error(y_valid, model.predict(X_valid)))

    print(f"Model Name : {model}")
    print(f"RMSE on the train data : {train_rmse}")
    print(f"RMSE on the validation data : {valid_rmse}")

In [None]:
# Disabled runs for lr, rf and gb; uncomment a line to fit and score that model.
#modeling(lr)

In [None]:
#modeling(rf)

In [None]:
#modeling(gb)

In [None]:
# NOTE(review): this import sits mid-notebook, away from the other import cells.
from lightgbm import LGBMRegressor 
# Fit LightGBM with a fixed seed and print its train/validation RMSE.
lgbm = LGBMRegressor(random_state=42)
modeling(lgbm)


In [None]:
# Write submission.csv from the fitted model's predictions on the transformed test features.
test_submission(X_test,lgbm)