<a href="https://colab.research.google.com/github/screemix/The_Glorious_Chargers/blob/main/Line_item.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libs

In [None]:
import sys
import glob
import os
import io
import pickle
import statistics
import math
import pandas as pd
import numpy as np

import seaborn as sns

import calendar
calendar.setfirstweekday(calendar.MONDAY) # first week day

from datetime import datetime
from datetime import time
from datetime import date, timedelta

from dateutil.relativedelta import *

import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib.dates import MO, TU, WE, TH, FR, SA
from matplotlib.ticker import FuncFormatter

from pandas.plotting import autocorrelation_plot
from pandas.plotting import lag_plot
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
from pandas.tseries.offsets import *

from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.tsa.seasonal import seasonal_decompose


%matplotlib inline

# pandas display settings: raise the column/row/width limits used when frames are printed
for option_name, option_value in (
    ('display.max_columns', 500),
    ('display.width', 1000),
    ('display.max_rows', 500),
):
    pd.set_option(option_name, option_value)

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, cross_val_score, \
                                    ShuffleSplit, cross_val_predict, TimeSeriesSplit, ParameterSampler, PredefinedSplit
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, PolynomialFeatures, LabelEncoder, OneHotEncoder, LabelBinarizer
from sklearn.metrics import r2_score,  mean_squared_error, accuracy_score, \
                            mean_absolute_error, median_absolute_error, make_scorer
from sklearn.linear_model import LinearRegression, Lasso, Ridge, LogisticRegression, SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.dummy import DummyRegressor
from sklearn.feature_selection import RFE, RFECV, SelectFromModel

import xgboost as xgb
from xgboost import plot_importance

# Import data

## Train

In [None]:
# Root folder of the input data; the cells below read train/train.csv, line_items*/line_items*
# and shipment_id_phone_mapping.csv relative to it.
# Set the SBERMARKET_DATA_PATH environment variable to run the notebook on another machine
# (e.g. Colab); without it the original local path is used.
data_path = os.environ.get('SBERMARKET_DATA_PATH', 'C://Users//Tanya//Desktop//Sbermarket')

In [None]:
# Load the training table from <data_path>/train/train.csv.
# Later cells use its phone_id, order_completed_at and target columns.
file_1 = os.path.join(data_path, 'train/train.csv')
train = pd.read_csv(file_1)

In [None]:
# Preview the first 10 rows of the training table.
train.head(10)

In [None]:
# Preview the last rows of the training table.
train.tail()

In [None]:
# (rows, columns) of the training table.
train.shape

In [None]:
# Order rows by phone_id, then by order_completed_at, modifying `train` in place.
# The per-phone first()/last() calls in later cells depend on this row order.
train.sort_values(by=['phone_id', 'order_completed_at'], inplace = True)

In [None]:
# Per-phone total of `target`, repeated on every row of that phone.
# NOTE(review): presumably `target` is a 0/1 "ordered in this month" flag, so the total is a
# count of months with orders — confirm against the data description.
# The string 'sum' selects pandas' own group sum; passing np.sum is deprecated in recent pandas.
train['Number_of_months_with_orders'] = train.groupby('phone_id')['target'].transform('sum')

In [None]:
# Number of rows each phone_id has in `train`, repeated on every row of that phone.
# A single column is selected before transform(): transforming the whole frame returns one
# column per input column, which cannot be assigned to one new column.
# 'size' counts every row of the group, including rows with missing values.
train['Number_of_months_overall'] = train.groupby('phone_id')['target'].transform('size')

In [None]:
# Display the first order_completed_at per phone_id (one entry per phone, indexed by phone_id).
train.groupby('phone_id')['order_completed_at'].first()

In [None]:

# Bar chart of how many rows carry each Number_of_months_with_orders value.
# Counts are per row, not per phone, because the column is repeated on every row of a phone.
train['Number_of_months_with_orders'].value_counts().plot(kind='bar')

In [None]:
# Most recent order_completed_at per phone_id, repeated on every row of that phone.
# transform() keeps the row index, so assign() lines the values up with train's rows;
# a plain groupby().last() is indexed by phone_id and would not align.
# The sort is ascending so that 'last' is the newest entry (with a descending sort it is the oldest).
# Assumes order_completed_at sorts chronologically (datetimes or ISO-formatted strings) — TODO confirm.
train = train.assign(Latest_order = \
                     train.sort_values(['phone_id','order_completed_at']).groupby('phone_id')['order_completed_at'].transform('last'))

In [None]:
# Earliest order_completed_at per phone_id, repeated on every row of that phone.
# transform() keeps the row index, so assign() lines the values up with train's rows;
# a plain groupby().first() is indexed by phone_id and would not align.
# The explicit ascending sort makes 'first' the oldest entry regardless of the current row order.
# Assumes order_completed_at sorts chronologically (datetimes or ISO-formatted strings) — TODO confirm.
train = train.assign(First_order = \
                     train.sort_values(['phone_id','order_completed_at']).groupby('phone_id')['order_completed_at'].transform('first'))

In [None]:
# Last order_completed_at per phone_id in the current row order, repeated on every row of that phone.
# `train` was sorted by phone_id and order_completed_at in an earlier cell, so 'last' is the newest.
# transform() returns a result aligned with train's rows; groupby().last() is indexed by phone_id
# and would not line up with the row index when assigned as a column.
train['Latest_order'] = train.groupby(['phone_id'])['order_completed_at'].transform('last')

In [None]:
 = data.resample('W').agg({
    '': np.sum, 
    '': np.sum,
    '':  np.sum, 
    '': np.sum,
    ': np.sum,
})

## Line items

In [None]:
# Glob pattern for the line-items data; the next cell uses it for both the folder and the file name.
file_pattern = 'line_items*'

In [None]:
# All paths matching <data_path>/line_items*/line_items*.
# glob returns matches in arbitrary order; sorting keeps the read order, and therefore the row
# order of the combined frame, the same from run to run.
files = sorted(glob.glob(os.path.join(data_path, file_pattern, file_pattern)))
files

In [None]:
# Read every line-items file and stack them into one frame.
frames = []
rows = []  # row count of each file, compared with the combined frame in the next cell

for file in files:
    # basename handles both '/' and '\\' separators, so the log line is correct on any OS
    filename = os.path.basename(file)
    print('\nFile {0} is read in'.format(filename))

    line_items_tmp = pd.read_csv(file)

    rows.append(line_items_tmp.shape[0])
    frames.append(line_items_tmp)

# One concat at the end replaces DataFrame.append inside the loop: append re-copied the growing
# frame on every iteration and no longer exists in pandas 2.x.
# ignore_index=True gives a fresh 0..n-1 index, so there are no double indices from the different files.
# With no input files the result is an empty frame, as before.
line_items = pd.concat(frames, sort=True, ignore_index=True) if frames else pd.DataFrame()

In [None]:
# Sanity check: the combined frame has exactly as many rows as the individual files together.
assert line_items.shape[0] == sum(rows)

In [None]:
# (rows, columns) of the combined line-items frame.
line_items.shape

In [None]:
# Store the combined line items as a pickle under data_path (a later cell reloads it from there)
output_file = os.path.join(data_path, 'inline_items.pkl')
save_message = 'Save output in {}..'.format(output_file)

print(save_message)
line_items.to_pickle(output_file)

In [None]:
# Reload the combined line items from the pickle saved above, without re-reading the source files.
# NOTE: unpickling executes code from the file — only load pickles you created yourself.
line_items = pd.read_pickle(os.path.join(data_path, 'inline_items.pkl'))

In [None]:
# Preview the first rows of the line-items frame.
line_items.head()

In [None]:
# Order the line items by shipment_id, modifying the frame in place.
line_items.sort_values(by=['shipment_id'], inplace = True)

In [None]:
# Total of `cancelled` over all items of a shipment, repeated on each of that shipment's rows.
# The string "sum" selects pandas' own group sum; passing the builtin `sum` is deprecated in recent pandas.
line_items["Sum_of_cancelled"] = line_items.groupby("shipment_id")["cancelled"].transform("sum")

In [None]:
# Total of `replaced` over all items of a shipment, repeated on each of that shipment's rows.
# The string "sum" selects pandas' own group sum; passing the builtin `sum` is deprecated in recent pandas.
line_items["Sum_of_replaced"] = line_items.groupby("shipment_id")["replaced"].transform("sum")

In [None]:
# Total `quantity` over all items of a shipment, repeated on each of that shipment's rows.
# The string "sum" selects pandas' own group sum; passing the builtin `sum` is deprecated in recent pandas.
line_items["Quantity"] = line_items.groupby("shipment_id")["quantity"].transform("sum")

In [None]:
# Show the first 59 rows to inspect the per-shipment totals added above.
line_items.head(59)

In [None]:
# Collapse to one row per shipment_id and turn shipment_id back into a regular column.
# The per-shipment totals are identical on every row of a shipment, so taking the first is safe for them.
# NOTE: GroupBy.first() picks the first non-null value per column, so the remaining item-level
# columns may come from different rows of the same shipment.
line_items_grouped = line_items.groupby("shipment_id").first().reset_index()

In [None]:
# Preview the per-shipment table.
line_items_grouped.head()

In [None]:
# Remove the item-level columns in place: after collapsing to one row per shipment they only
# describe a single item of that shipment.
item_level_columns = [
    "cancelled", "price", "product_id", "quantity", "replaced",
    "brand_name", "discount", "item_id", "master_category_id",
]
line_items_grouped.drop(columns=item_level_columns, inplace=True)

In [None]:
# Also remove the `pricer` column, in place.
line_items_grouped.drop(["pricer"], axis = 1, inplace = True)

In [None]:
# Preview the per-shipment table after the column clean-up.
line_items_grouped.head()

In [None]:
# Export the per-shipment table to <data_path>/Shipments_items.csv:
# semicolon-separated, without the index column.
output_file = os.path.join(data_path, 'Shipments_items.csv')
csv_options = {'sep': ';', 'index': False}
line_items_grouped.to_csv(output_file, **csv_options)

In [None]:
# Load shipment_id_phone_mapping.csv from data_path.
# It is joined to the per-shipment table on shipment_id further down.
# NOTE(review): presumably it maps each shipment_id to a phone_id — confirm the column names.
file_shipment_id_phone = os.path.join(data_path, 'shipment_id_phone_mapping.csv')
shipment_id_phone = pd.read_csv(file_shipment_id_phone)

In [None]:
# Preview the first 20 rows of the per-shipment table.
line_items_grouped.head(20)

In [None]:
# Histogram of the per-shipment Sum_of_cancelled values, using 55 bins.
line_items_grouped[["Sum_of_cancelled"]].hist(bins = 55)

In [None]:
# Side-by-side histograms of the per-shipment cancelled and replaced totals.
fig, axes = plt.subplots(figsize=(17, 5), nrows = 1, ncols = 2, sharex=True)

# One panel per column; the column name serves as both title and x-label.
# (`col` was previously used for the titles without ever being defined.)
for ax, col in zip(axes, ["Sum_of_cancelled", "Sum_of_replaced"]):
    ax.hist(line_items_grouped[[col]], bins = 5)
    ax.set_title(col, fontsize= 12, loc = 'left')
    ax.set_xlabel(col)

# The x axis is shared (sharex=True), so setting the limits once applies to both panels.
axes[0].set_xlim((-1, 50))
plt.show()

In [None]:
# Histograms of the per-shipment Sum_of_replaced and Quantity values, 5 bins each.
line_items_grouped[["Sum_of_replaced", "Quantity"]].hist(bins = 5)

In [None]:
# Outer join of the shipment/phone table with the per-shipment totals on shipment_id:
# shipments present in only one of the two tables are kept, with NaN in the other table's columns.
# The result is ordered by the join key (sort=True); all other merge options stay at their defaults.
merged = shipment_id_phone.merge(
    line_items_grouped,
    how='outer',
    on='shipment_id',
    sort=True,
)

In [None]:
# (rows, columns) of the shipment/phone table, for comparison with the merged result.
shipment_id_phone.shape

In [None]:
# (rows, columns) after the outer join; extra rows mean shipments missing from the shipment/phone table.
merged.shape

In [None]:
# Preview the first 17 rows of the merged table.
merged.head(17)