In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the training sales records and preview the first rows.
sales_train = pd.read_csv(
    '/kaggle/input/competitive-data-science-predict-future-sales/sales_train.csv'
)
sales_train.head()

In [None]:
# Print the column dtypes and non-null counts of the raw sales frame.
sales_train.info()

In [None]:
# Count missing values per column in the raw sales frame.
sales_train.isna().sum()

In [None]:
# Load the item lookup table (joined later on item_id / item_category_id).
items = pd.read_csv(
    '/kaggle/input/competitive-data-science-predict-future-sales/items.csv'
)
items.head()

In [None]:
# Load the shop lookup table (joined later on shop_id).
shops = pd.read_csv(
    '/kaggle/input/competitive-data-science-predict-future-sales/shops.csv'
)
shops.head()

In [None]:
# Load the item-category lookup table (joined later on item_category_id).
item_cat = pd.read_csv(
    '/kaggle/input/competitive-data-science-predict-future-sales/item_categories.csv'
)
item_cat.head()

In [None]:
# Attach shop attributes to every sales row (inner join on shop_id).
sales_train_all1 = sales_train.merge(shops, on='shop_id', how='inner')
sales_train_all1.head()

In [None]:
# Attach category attributes to every item (inner join on item_category_id).
items_all = items.merge(item_cat, on='item_category_id', how='inner')
items_all.head()

In [None]:
# Combine sales+shop rows with item+category rows (inner join on item_id).
sales_train_all = sales_train_all1.merge(items_all, on='item_id', how='inner')
sales_train_all.head()

In [None]:
# Count missing values per column after the joins.
sales_train_all.isna().sum()

In [None]:
# Print the column dtypes and non-null counts of the joined frame.
sales_train_all.info()

In [None]:
# Collapse each day-first date string to a 'YYYY-MM' month key.
# The vectorized .dt accessor replaces a per-row Python lambda, which is
# needlessly slow on a frame of this size; the resulting strings are the same.
sales_train_all['date'] = (
    pd.to_datetime(sales_train_all['date'], dayfirst=True)
    .dt.strftime('%Y-%m')
)

In [None]:
# Drop the descriptive text columns and date_block_num; the 'date' month key
# and the numeric id columns are what the later cells use.
sales_train_all = sales_train_all.drop(
    columns=['item_category_name', 'item_name', 'shop_name', 'date_block_num']
)
sales_train_all.head()

In [None]:
# Preview the first rows of the trimmed frame.
sales_train_all.head()

In [None]:
# Preview the last rows of the trimmed frame.
sales_train_all.tail()

In [None]:
# Re-check dtypes now that 'date' holds month strings and columns were dropped.
sales_train_all.info()

In [None]:
# Pairwise correlation of the numeric columns, drawn as an annotated heatmap.
# 'date' is a string column at this point; pandas >= 2.0 raises when corr()
# meets non-numeric data, so restrict to numeric dtypes explicitly (older
# pandas dropped such columns silently, so the result is unchanged there).
corr = sales_train_all.select_dtypes(include='number').corr()
f, ax = plt.subplots(figsize=(20, 20))
# Draw on the axes created above rather than relying on the implicit current axes.
sns.heatmap(corr, annot=True, ax=ax);

In [None]:
# Build a wide table: one row per (shop_id, item_id), one column per month,
# each cell holding the total item_cnt_day for that month (0 when no sales).
#
# aggfunc='sum' is required. The previous version first summed per
# (item, month, shop, item_price) and then pivoted with the default
# aggfunc='mean', so any item sold at more than one price in a month had its
# per-price subtotals averaged instead of added.
data_sum = sales_train_all.pivot_table(
    index=['shop_id', 'item_id'],
    columns='date',
    values='item_cnt_day',
    aggfunc='sum',
    fill_value=0,
)
# Turn shop_id / item_id back into ordinary columns so they can be merge keys.
data_sum = data_sum.reset_index()
data_sum.head()

In [None]:
# Load the test-set rows (ID, shop_id, item_id are the columns used below).
test_data = pd.read_csv(
    '/kaggle/input/competitive-data-science-predict-future-sales/test.csv'
)

In [None]:
# Give every test (shop_id, item_id) pair its monthly history from data_sum.
# Pairs with no history come back as NaN from the left join and are set to 0.
# 'ID' and the '2013-01' column are dropped from the result.
test_data = (
    test_data
    .merge(data_sum, how='left', on=['shop_id', 'item_id'])
    .drop(columns=['ID', '2013-01'])
    .fillna(0)
)
test_data.head()

In [None]:
# Assemble the supervised-learning matrices.
#
# Target = the most recent month in data_sum; features = shop_id, item_id and
# every earlier month. The previous target ('2014-08') sat in the middle of the
# history, so the model was trained on months that come *after* its target and
# its feature positions did not correspond to those of the test matrix.
id_cols = ['shop_id', 'item_id']
month_cols = [c for c in data_sum.columns if c not in id_cols]
target_month = month_cols[-1]  # month labels are 'YYYY-MM', so they sort chronologically

Y_train = data_sum[target_month].values
X_train = data_sum.drop([target_month], axis=1)

# test_data holds the same history shifted forward by one month (its first
# month was dropped), so column k is the same lag relative to the month being
# predicted as column k of X_train. Relabel it positionally: recent
# scikit-learn versions refuse to predict on a frame whose column names differ
# from those seen during fit.
assert test_data.shape[1] == X_train.shape[1], (
    'test_data and X_train must have the same number of feature columns'
)
X_test = test_data.copy()
X_test.columns = X_train.columns

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the seed is fixed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X_train,
    Y_train,
    test_size=0.2,
    random_state=101,
)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Fit two baseline regressors and report their error on both splits.
linear_reg = LinearRegression()
# Seeded so the forest and its reported metrics are reproducible across runs.
rfr = RandomForestRegressor(n_estimators=100, random_state=101)

for model in (linear_reg, rfr):
    model.fit(x_train, y_train)
    # Header line so the two otherwise identical reports can be told apart.
    print(type(model).__name__)
    print('Train set mse:', mean_squared_error(y_train, model.predict(x_train)))
    print('Test set mse:', mean_squared_error(y_test, model.predict(x_test)))
    # score() must be evaluated on the held-out split to match its label;
    # it was previously computed on (x_train, y_train).
    print('Test set score:', model.score(x_test, y_test))

In [None]:
from sklearn.datasets import make_regression  # not used in this cell; kept in case another cell relies on it
from sklearn.linear_model import Ridge

# Ridge regression with a fixed penalty of alpha=100, reported on both splits.
rr = Ridge(alpha=100)
rr.fit(x_train, y_train)
print('Train set mse:', mean_squared_error(y_train, rr.predict(x_train)))
print('Test set mse:', mean_squared_error(y_test, rr.predict(x_test)))
# score() must be evaluated on the held-out split to match its label;
# it was previously computed on (x_train, y_train).
print('Test set score:', rr.score(x_test, y_test))

In [None]:
# Load the sample submission; its 'item_cnt_month' column is filled in below.
submission = pd.read_csv(
    '/kaggle/input/competitive-data-science-predict-future-sales/sample_submission.csv'
)
print(submission.shape)
submission.head()

In [None]:
# Predict monthly counts for the test pairs and write the submission file.
prediction = linear_reg.predict(X_test)
# A linear model can emit negative or very large counts. Bound them before
# rounding to whole units.
# NOTE(review): the [0, 20] range is taken from the competition's evaluation
# rules (true targets are clipped to that range) — confirm against the
# competition page.
prediction = np.clip(prediction, 0, 20).round().astype(int)
submission['item_cnt_month'] = prediction
submission.to_csv('prediction.csv', index=False)
submission.head()