In [None]:
#Basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


#Sklearn Packages
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error



#get the file paths
import os
# Walk the Kaggle input directory lazily and print the full path of each file.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

        
#Define a function to check the RMSE       
def check_RMSE (y_train ,train_prediction , y_test ,  test_predicition):
    """Print the root mean squared error for the train and the test data.

    Parameters
    ----------
    y_train, train_prediction : array-like
        True and predicted targets for the training rows.
    y_test, test_predicition : array-like
        True and predicted targets for the held-out rows.

    Returns
    -------
    None
        The two scores are printed, not returned.
    """
    # `mean_squared_error(..., squared=False)` was deprecated in
    # scikit-learn 1.4 and removed in 1.6. Taking the square root of the MSE
    # gives the same number on every version.
    train_rmse = np.sqrt(mean_squared_error(y_train, train_prediction))
    test_rmse = np.sqrt(mean_squared_error(y_test, test_predicition))
    print ('Root Mean squared error for the train data  =  ' , train_rmse)
    print ('Root Mean squared error for the test data  =  ' , test_rmse)

<h1>Read the CSV files</h1>

In [None]:
# Load the test set and preview its first two rows.
test_csv = '/kaggle/input/competitive-data-science-predict-future-sales/test.csv'
df_test = pd.read_csv(test_csv)
df_test.head(n=2)

In [None]:
# Load the item table (used further down to look up each item's category).
items_csv = '/kaggle/input/competitive-data-science-predict-future-sales/items.csv'
df_items = pd.read_csv(items_csv)
df_items.head(n=2)

In [None]:
# Load the sales records used for training and preview the first two rows.
train_csv = '/kaggle/input/competitive-data-science-predict-future-sales/sales_train.csv'
df_train = pd.read_csv(train_csv)
df_train.head(n=2)

<h2> First challenge: the train and test datasets do not have the same columns </h2>

In [None]:
# Add month number 34 to the test dataset.
df_test['date_block_num'] = 34
# Keep only the columns shared with the training features. `.copy()` makes the
# selection an independent frame, so the columns added to `df_test` in later
# cells do not raise SettingWithCopyWarning.
df_test = df_test[['date_block_num' , 'shop_id' , 'item_id' ]].copy()
df_test.head(2)

In [None]:
# Map the latest price of each item in the train dataset onto the test dataset.
# The rows are ordered by month first (stable sort, so the file order is kept
# inside a month): `last()` then really is the most recent price instead of
# whichever row happens to come last in the CSV.
# `to_dict()` keeps the item ids as integers; the previous
# `dict(frame.values)` pushed ids and prices through one float array.
item_price = (
    df_train
    .sort_values('date_block_num', kind='stable')
    .groupby('item_id')['item_price']
    .last()
    .to_dict()
)
# Items with no sale in the train data get NaN here; the cleaning cell fills it.
df_test['item_price'] = df_test['item_id'].map(item_price)
df_test.head(2)

<h2> Remove the shop_id and item_id values that appear in the train dataset but not in the test dataset </h2>

In [None]:
# Keep a training row only when both its item and its shop occur in the test set.
item_in_test = df_train['item_id'].isin(df_test['item_id'])
shop_in_test = df_train['shop_id'].isin(df_test['shop_id'])
df_train = df_train[item_in_test & shop_in_test]

<h2> Reshape the train dataset: total sales per month for each shop and item, as required by the competition </h2>

In [None]:
# Collapse the rows to one per (month, shop, item): the last price seen in
# the group and the summed item count.
monthly_keys = ['date_block_num', 'shop_id', 'item_id']
df_train = (
    df_train
    .groupby(monthly_keys)
    .agg(
        item_price=('item_price', 'last'),
        item_cnt_day=('item_cnt_day', 'sum'),
    )
    .reset_index()
)
df_train.head(n=2)

<h3> Now item_cnt_day represents the total sales per month for each item in each shop </h3>

<h2> Add a combined shop/item feature to the train and test datasets </h2>

In [None]:
# Combined shop/item feature for the training rows.
# NOTE(review): a product does not identify the pair uniquely, e.g. (2, 6)
# and (3, 4) both give 12. Any change has to be mirrored in the test cell.
df_train['shop*item'] = df_train['shop_id'].mul(df_train['item_id'])
df_train.head(n=2)

In [None]:
# Combined shop/item feature for the test rows, computed as for the train rows.
# NOTE(review): a product does not identify the pair uniquely, e.g. (2, 6)
# and (3, 4) both give 12. Any change has to be mirrored in the train cell.
df_test['shop*item'] = df_test['shop_id'].mul(df_test['item_id'])
df_test.head(n=2)

<h2> From the items dataset, map each item_id to its category </h2>

In [None]:
# Build the item -> category lookup from the columns that remain once the
# item name is removed (assumes they are item id then category id, in that
# order; TODO confirm against items.csv).
# The drop is done on a temporary frame rather than in place, so `df_items`
# stays intact and re-running this cell does not fail with a KeyError.
item_cat = dict(df_items.drop(columns='item_name').values)

df_train['item_cat'] = df_train['item_id'].map(item_cat)

df_train.head(2)

In [None]:
# Look up each test item's category in the same lookup used for the train rows.
test_categories = df_test['item_id'].map(item_cat)
df_test['item_cat'] = test_categories
df_test.head(n=2)

<h2> Concatenate the train and test datasets so both get the same cleaning (log price, missing-value fill, outlier capping) </h2>

In [None]:
# Stack train and test so every transformation is applied to both. Test rows
# have no `item_cnt_day`, which is how they are separated again later.
# `ignore_index=True` gives the combined frame a unique row index; without it
# the labels of the two inputs (both starting at 0) would be duplicated.
df = pd.concat([df_train , df_test], ignore_index=True)
# Normalize: log-transform the price.
df['item_price'] = np.log1p(df['item_price'])
# Fill the missing prices with the mean of the transformed prices.
df['item_price'] = df['item_price'].fillna(df['item_price'].mean())
# Remove the outliers: cap the target at 10. `clip` is the vectorised form of
# the previous per-row lambda and likewise leaves missing (test) targets as NaN.
df['item_cnt_day'] = df['item_cnt_day'].clip(upper=10)

In [None]:
# Preview the first five rows of the combined frame.
df.head(5)

<h2> V10 : encode columns

In [None]:


def encode_the_numbers (column, frame=None):
    """
    Rank-encode ``column`` by three statistics of the target ``item_cnt_day``.

    For each of ``mean``, ``sum`` and ``count``, the distinct values of
    ``column`` are ordered from the highest statistic to the lowest and each
    value is replaced by its position in that order (0 = highest). The ranks
    are stored in three new columns: ``{column}_mean``, ``{column}_sum`` and
    ``{column}_count``.

    Parameters
    ----------
    column : str
        Name of the column to encode.
    frame : pandas.DataFrame, optional
        Frame that is read and that receives the new columns (modified in
        place). Defaults to the notebook-level ``df``.

    Returns
    -------
    None
    """
    data = df if frame is None else frame
    for stat in ('mean', 'sum', 'count'):
        # One statistic per distinct value of `column`, highest first.
        ordered = data.groupby(column)['item_cnt_day'].agg(stat).sort_values(ascending = False)
        # The position in that ordering is the encoding. The previous `count`
        # variant built its mapping with `groupby(column)["index"].count()`,
        # which is 1 for every group, so `{column}_count` was constant.
        ranks = pd.Series(np.arange(len(ordered)), index = ordered.index)
        data[f'{column}_{stat}'] = data[column].map(ranks)



In [None]:
# Identifier-like columns that each get the three rank encodings.
columns_to_encode = [
    'shop_id',
    'item_id',
    'shop*item',
    'item_cat',
]
for column in columns_to_encode:
    encode_the_numbers(column)

In [None]:
# Correlation of every numeric feature with the target, sorted ascending.
numeric_features = df.select_dtypes('number').drop('item_cnt_day', axis=1)
corr_df = (
    numeric_features
    .corrwith(df.item_cnt_day)
    .sort_values()
    .reset_index()
    .rename(columns = {'index':'feature' ,0:'correlation'})
)

fig , ax = plt.subplots(figsize  = (5,20))
ax.barh(y =corr_df.feature , width = corr_df.correlation )
# The title previously read "featuer"; the x axis is labelled so the figure
# can be read on its own.
ax.set_title('correlation between feature and target'.title() ,
            fontsize = 16 , fontfamily = 'serif' , fontweight = 'bold')
ax.set_xlabel('correlation with item_cnt_day')
plt.show();

<h2> split the train and test

In [None]:
# Training rows are the ones that carry a target value.
has_target = df['item_cnt_day'].notna()
df_train = df[has_target]
df_train.head(n=2)

In [None]:
# Test rows are the ones without a target. The frame is built in one
# expression: dropping in place on a filtered slice raises
# SettingWithCopyWarning.
df_test = df[df.item_cnt_day.isnull()].drop(columns='item_cnt_day')
df_test.head(2)

<h2> prepare the X and y

In [None]:
# Target vector and feature matrix as plain NumPy arrays.
y = df_train['item_cnt_day'].to_numpy()
X = df_train.drop(columns='item_cnt_day').to_numpy()

## Scale the X

In [None]:
# Fit the min-max scaler on the full feature matrix and scale it in one step.
# `SC` is reused later to scale the submission features.
# NOTE(review): the scaler is fitted before the train/validation split, so
# the validation rows contribute to the min/max.
SC = MinMaxScaler()
X = SC.fit_transform(X)

In [None]:
# Hold out 30% of the rows for validation; the seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=10,
)

## Random forest

In [None]:
# Seed the forest (same seed as the train/validation split) so the reported
# RMSE and the submission are reproducible from a fresh kernel.
reg = RandomForestRegressor(n_estimators=25 , random_state=10)
reg.fit(x_train,y_train)
train_prediction  = reg.predict(x_train)
test_predicition  = reg.predict(x_test)

check_RMSE (y_train ,train_prediction , y_test ,  test_predicition)

## KNN

In [None]:
# Disabled experiment: the k-nearest-neighbours baseline is kept inside a
# string literal, so nothing below is executed.
"""

knn = KNeighborsRegressor()
knn.fit(x_train,y_train)
train_prediction  = knn.predict(x_train)
test_predicition  = knn.predict(x_test)

check_RMSE (y_train ,train_prediction , y_test ,  test_predicition)
"""

## Linear reg

In [None]:
# Disabled experiment: the linear-regression baseline is kept inside a string
# literal, so nothing below is executed.
"""

lr = LinearRegression()
lr.fit(x_train,y_train)
train_prediction  = lr.predict(x_train)
test_predicition  = lr.predict(x_test)

check_RMSE (y_train ,train_prediction , y_test ,  test_predicition)
"""


## SVR

In [None]:
# Disabled experiment: the SVR baseline is kept inside a string literal, so
# nothing below is executed.
# NOTE(review): if re-enabled, `SVR=SVR()` rebinds the class name to the
# fitted instance, so the cell would fail when run a second time.
"""


from sklearn.svm import SVR
SVR=SVR()
SVR.fit(x_train,y_train)
train_prediction  = SVR.predict(x_train)
test_predicition  = SVR.predict(x_test)

check_RMSE (y_train ,train_prediction , y_test ,  test_predicition)

"""

## Ridge

In [None]:
# Disabled experiment: the ridge-regression baseline is kept inside a string
# literal, so nothing below is executed.
# NOTE(review): if re-enabled, `Ridge=Ridge()` rebinds the class name to the
# fitted instance, so the cell would fail when run a second time.
"""
from sklearn.linear_model import Ridge
Ridge=Ridge()
Ridge.fit(x_train,y_train)
train_prediction  = Ridge.predict(x_train)
test_predicition  = Ridge.predict(x_test)

check_RMSE (y_train ,train_prediction , y_test ,  test_predicition)
"""

## BayesianRidge

In [None]:
# Disabled experiment: the Bayesian ridge baseline is kept inside a string
# literal, so nothing below is executed.
"""
from sklearn.linear_model import BayesianRidge
Bayesian = BayesianRidge()
Bayesian.fit(x_train,y_train)
train_prediction  = Bayesian.predict(x_train)
test_predicition  = Bayesian.predict(x_test)

check_RMSE (y_train ,train_prediction , y_test ,  test_predicition)
"""

<h2> prepare the test data for submission

In [None]:
# Scale the test features with the scaler that was fitted on the training matrix.
submission_features = df_test.to_numpy()
X_submission = SC.transform(submission_features)

## Select random forest

In [None]:
# Predict with the random forest and write the values into the submission
# template. The rows of `X_submission` are assumed to be in the same order as
# the template (TODO confirm; the test ID column was dropped earlier).
predection  = reg.predict(X_submission)
# Absolute path, consistent with the other reads in this notebook; the
# previous relative '../input/...' only resolved from /kaggle/working.
sample_submission  = pd.read_csv('/kaggle/input/competitive-data-science-predict-future-sales/sample_submission.csv')
# Bracket assignment always targets the column; attribute assignment would
# silently create a plain Python attribute if the column were missing.
sample_submission['item_cnt_month'] = predection
sample_submission.head(2)

In [None]:
# Write the submission file without the row index.
sample_submission.to_csv(path_or_buf='submission.csv', index=False)