In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#Read Files
# All competition CSVs live in the same input directory.
DATA_DIR = '../input/competitive-data-science-predict-future-sales/'
file_list = [DATA_DIR + file_name for file_name in (
    'sales_train.csv',
    'items.csv',
    'item_categories.csv',
    'shops.csv',
    'test.csv',
)]

# Load every file straight away. The previous version called input() before
# each read, which stalled the notebook waiting for keyboard input (and fails
# in a non-interactive run) even though the typed value was never used.
df_list = [pd.read_csv(path) for path in file_list]


#Separate Files
# The unpacking order mirrors the order of file_list above.
sales_train, items, item_categories, shops, test = df_list

#Function for checking null values
def null_check(df) :
    """Count missing values per column of ``df``.

    Returns a one-column DataFrame indexed by the column names of ``df``,
    ordered from the column with the most missing values to the least.
    """
    missing_per_column = df.isna().sum()
    ranked = missing_per_column.sort_values(ascending=False)
    return ranked.to_frame()

# Print the per-column missing-value counts of every loaded table.
for frame in df_list :
    null_summary = null_check(frame)
    print(null_summary)

In [None]:
#Creating final df for training model and submission

# Lookup tables and the key each one is joined on, applied in this order
# (item_categories needs item_category_id, which the items join brings in).
lookup_joins = [
    (items, 'item_id'),
    (item_categories, 'item_category_id'),
    (shops, 'shop_id'),
]

train_final = sales_train
test_final = test
for lookup_table, join_key in lookup_joins :
    train_final = train_final.merge(lookup_table, on = join_key, how = 'left')
    test_final = test_final.merge(lookup_table, on = join_key, how = 'left')

print('Total Rows Before Removing Duplicate : ', train_final.shape[0])

In [None]:
#Check duplicates in dataset and remove if any

# True for every row that repeats an earlier row in full.
is_duplicate = train_final.duplicated()
print('Total Duplicate Rows : ',is_duplicate.sum())

train_final = train_final[~is_duplicate]
print('Total Rows After Removing Duplicate : ',len(train_final))

In [None]:
#Changing date format

# Parse with an explicit day-first format. Without it pandas reads ambiguous
# strings such as "02.01.2013" month-first (February 1st), which would corrupt
# the month feature derived from this column later on.
# NOTE(review): assumes sales_train.csv stores dates as dd.mm.yyyy, as in the
# published competition data — confirm against the raw file.
train_final['date'] = pd.to_datetime(train_final['date'], format='%d.%m.%Y').dt.date
train_final.head()

In [None]:
#Create additional columns
# Build the datetime index once and reuse it for both calendar features.
date_index = pd.DatetimeIndex(train_final['date'])

train_final['sales'] = train_final['item_price'].mul(train_final['item_cnt_day'])
train_final['year'] = date_index.year
train_final['month'] = date_index.month
train_final.head()

In [None]:
# Deciles (0th to 100th percentile in steps of 10) of the daily item count.
item_cnt_values = train_final["item_cnt_day"]
for pct in range(0, 101, 10) :
    pct_value = np.percentile(item_cnt_values, pct)
    print(f'{pct}th percentile value for item_cnt_day is {pct_value}')

In [None]:
# Zoom in on the upper tail: 90th to 100th percentile in steps of 1.
item_cnt_values = train_final["item_cnt_day"]
for pct in range(90, 101) :
    pct_value = np.percentile(item_cnt_values, pct)
    print(f'{pct}th percentile value for item_cnt_day is {pct_value}')

In [None]:
#Removing Outliers from item_cnt_day columns

# Keep rows with a strictly positive count that is strictly below the
# 96th-percentile value of the column.
item_cnt = train_final['item_cnt_day']
item_cnt_cap = item_cnt.quantile(0.96)
keep_rows = item_cnt.gt(0) & item_cnt.lt(item_cnt_cap)

final = train_final[keep_rows]
final.shape

In [None]:
#Removing Outliers based on item_price column

# Print the deciles first, then the 90th-99th percentiles one by one.
for pct_range in (range(0, 101, 10), range(90, 100)) :
    for pct in pct_range :
        pct_value = np.percentile(final["item_price"], pct)
        print(f'{pct}th percentile value for item_price is {pct_value}')

# Keep strictly positive prices that are strictly below the 92nd-percentile value.
price = final['item_price']
price_cap = price.quantile(0.92)
final = final[price.gt(0) & price.lt(price_cap)]
final.shape

In [None]:
#Removing Outliers based on sales column

# The printed labels now name the column actually being summarised; they
# previously said "item_price" although the values are percentiles of "sales".

# Deciles of the sales column.
for a in range(0,101,10) :
    print(f'{a}th percentile value for sales is {np.percentile(final["sales"],a)}')

# Lower tail: 0th to 10th percentile in steps of 1.
for a in range(0,11,1) :
    print(f'{a}th percentile value for sales is {np.percentile(final["sales"],a)}')

# quantile(0) is the column minimum, so this drops only the rows holding the
# smallest sales value.
final = final[(final['sales'] > final['sales'].quantile(0))]
final.shape

In [None]:
# Total sales per calendar day, ordered chronologically.
daily_totals = final.groupby('date', as_index=False)['sales'].sum()
date_sales = daily_totals.sort_values(by = 'date')

# Line chart of the daily totals.
plt.plot(date_sales['date'], date_sales['sales'], color = 'magenta')
plt.title('Daily Sales')
plt.xlabel('Date')
plt.ylabel('Sales')
plt.xticks(rotation=90)
plt.show()

In [None]:
num_cols = ['item_price','item_cnt_day','sales']
cat_cols = ['shop_id','item_id','item_category_id']

#Visualizing numeric cols
# One figure per numeric column: distribution plot on top, boxplot below.
for col_name in num_cols :
    col_values = final[col_name]

    fig, (dist_ax, box_ax) = plt.subplots(2,1, figsize=(10,12))
    fig.suptitle(col_name.title(), fontsize = "20")

    sns.distplot(col_values, ax = dist_ax)
    dist_ax.set_title('Distribution Plot')
    dist_ax.set_ylabel('Frequency')

    sns.boxplot(col_values,ax=box_ax)
    box_ax.set_title('Boxplot')
plt.show()

In [None]:
#Visualizing Categorical Columns

# One bar chart per id column showing its ten most frequent values.
for col_name in cat_cols :
    as_labels = final[col_name].astype(str)
    # Sorting by index first fixes the row order before the final sort by count.
    count = as_labels.value_counts().sort_index().sort_values(ascending=False)

    fig, ax = plt.subplots(figsize=(9,6))
    count.head(10).plot.bar(ax = ax, color = 'steelblue')
    plt.xticks(rotation=90)
    ax.set(
        title='Top 10 ' + col_name + ' counts',
        xlabel=col_name,
        ylabel='Frequency',
    )

plt.show()

In [None]:
#Calculate Correlations Between Features Columns with Target Values
num_f_cols = ['item_price','item_cnt_day']

# The target is the same for every feature, so look it up once.
label = final['sales']

for a in num_f_cols :
    # Skip the target itself *before* creating a figure; previously the figure
    # was created first, so a skipped column would have left a blank plot.
    if a == 'sales' :
        continue

    feature = final[a]
    corr = feature.corr(label)

    # Draw everything through the explicit Axes so each scatter plot is tied
    # to its own figure.
    fig, ax = plt.subplots(figsize = (9,6))
    ax.scatter(feature, label)
    ax.set_xlabel(a.title())
    ax.set_ylabel('Sales')
    ax.set_title('Sales Vs ' + a + ' : {}'.format(corr))

plt.show()

In [None]:
#Barplot Between Top 10 Categorical Features and Target Variable

# One bar chart per id column: the ten ids with the highest total sales.
for col_name in cat_cols :
    pretty_name = col_name.title()

    #Top 10 IDs
    sales_by_id = final.groupby(col_name)['sales'].sum()
    top_10 = sales_by_id.sort_index().sort_values(ascending = False)

    fig, ax = plt.subplots(figsize = (9,6))
    top_10.head(10).plot.bar(ax=ax, color = 'steelblue')
    ax.set(
        title='Barplot Grouped By : ' + pretty_name,
        xlabel=pretty_name,
        ylabel='Sales',
    )
    plt.xticks(rotation=90)

plt.show()

In [None]:
# Sum the daily counts per (year, month, shop, item) and flatten the group
# keys back into ordinary columns.
monthly_keys = ['year','month','shop_id','item_id']
grouped_counts = final.groupby(monthly_keys)['item_cnt_day']
monthly_item_cnt = grouped_counts.sum().reset_index()
monthly_item_cnt.head()

# Pairwise correlations between the key columns and the summed count.
sns.heatmap(monthly_item_cnt.corr(), annot=True)

In [None]:
#Selecting shop_id and item_id that exist in test dataset

test_shop_ids = test['shop_id'].unique()
test_item_ids = test['item_id'].unique()

# A row is kept only when both its shop and its item occur in the test set.
shop_in_test = monthly_item_cnt['shop_id'].isin(test_shop_ids)
item_in_test = monthly_item_cnt['item_id'].isin(test_item_ids)
monthly_item_cnt = monthly_item_cnt[shop_in_test & item_in_test]

monthly_item_cnt.shape

In [None]:
# Print the column dtypes, non-null counts and memory usage of the filtered monthly table.
monthly_item_cnt.info()

In [None]:
#Random Forest Model

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from numpy import mean, std
from pprint import pprint

# Features are the four grouping keys; the target is the summed item count.
feature_cols = ['year','month','shop_id','item_id']
X = monthly_item_cnt[feature_cols]
y = monthly_item_cnt['item_cnt_day']

# Hold out 25% of the rows, with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.25, random_state = 42
)

# Base estimator; print its parameters as a reference for the search grid.
random_rf = RandomForestRegressor(random_state = 42)
pprint(random_rf.get_params())

In [None]:
# Candidate values for each random-forest hyper-parameter in the search.
bootstrap = [True, False]
# 200, 400, ..., 2000 trees.
n_estimators = [int(x) for x in np.linspace(start=200,stop=2000, num = 10)]
# Depth limits 10, 20, ..., 110, plus None for unlimited depth.
max_depth = [int(x) for x in np.linspace(start=10, stop = 110, num = 11)]
max_depth.append(None)
# 1.0 means "use every feature", which is what 'auto' meant for a regressor.
# 'auto' has been removed from newer scikit-learn releases and makes the
# search fail there, while 1.0 is accepted by old and new versions alike.
max_features = [1.0,'sqrt']
min_samples_split = [2,5,10]
min_samples_leaf = [1,2,4]

random_grid = {
    'bootstrap' : bootstrap,
    'n_estimators' : n_estimators,
    'max_depth' : max_depth,
    'max_features' : max_features,
    'min_samples_split' : min_samples_split,
    'min_samples_leaf' : min_samples_leaf
}

pprint(random_grid)

In [None]:
# Randomized search: 3 sampled parameter combinations, each scored with
# 3-fold cross-validation on negative mean absolute error.
search_settings = {
    'cv' : 3,
    'n_iter' : 3,
    'scoring' : 'neg_mean_absolute_error',
    'random_state' : 42,
    'n_jobs' : -1,
    'verbose' : 2,
    'return_train_score' : True,
}
rf_random = RandomizedSearchCV(
    estimator = random_rf,
    param_distributions = random_grid,
    **search_settings
)

rf_random.fit(X_train,y_train)

In [None]:
# Show the parameter combination that scored best in the randomized search.
rf_random.best_params_

In [None]:
def evaluate(model, X_test, y_test) :
    """Print and return a MAPE-based accuracy score for ``model``.

    Parameters:
        model: fitted estimator exposing ``predict(X)``.
        X_test: features passed unchanged to ``model.predict``.
        y_test: true target values, aligned with the predictions. Every value
            must be non-zero because the error is divided by it.

    Returns:
        ``100 - MAPE``, where MAPE is the mean of ``|prediction - y| / y``
        expressed in percent.
    """
    prediction = model.predict(X_test)
    error = np.abs(prediction - y_test)
    mape = 100 * np.mean(error/y_test)
    accuracy = 100 - mape
    print('Model Performance')
    # The error is in the target's own units; the previous message labelled it
    # "degrees", which does not apply to this target.
    print('Average Error: {:0.4f}.'.format(np.mean(error)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))

    return accuracy

In [None]:
# Take the best estimator found by the randomized search and score it on the
# held-out split; evaluate() prints the metrics and returns the accuracy.
random_model = rf_random.best_estimator_
evaluate(random_model, X_test, y_test)

In [None]:
#Generate item_cnt_month result using above model

test_df = test.copy()
# Integers rather than strings: the model was trained on integer year/month
# values, so the prediction frame should carry the same dtype instead of
# relying on an implicit string-to-number conversion.
test_df['year'] = 2015
test_df['month'] = 11

# The column order must match the order used when fitting the model.
result_rf = random_model.predict(test_df[['year','month','shop_id','item_id']])

# predict() returns one value per row of test_df in the same order, so the
# predictions can be attached directly without a temporary frame and merge.
final_result = test_df[['ID']].assign(item_cnt_month=result_rf)
final_result.shape

In [None]:
# Write the ID / item_cnt_month table to submission.csv without the index column.
final_result.to_csv('submission.csv',index=False)