In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MaxAbsScaler
import matplotlib.pyplot as plt
import seaborn as sns

In [39]:
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 50)

### Data Import

In [2]:
train = pd.read_csv("./data/train_20171226.csv")
test = pd.read_csv("./data/yancheng_testB_20180224.csv")
print(train.shape,test.shape)

(20157, 32) (140, 3)


  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
train.shape

(20157, 32)

In [4]:
test.shape

(140, 3)

In [8]:
train.head()

Unnamed: 0,sale_date,class_id,sale_quantity,brand_id,compartment,type_id,level_id,department_id,TR,gearbox_type,...,engine_torque,car_length,car_width,car_height,total_quality,equipment_quality,rated_passenger,wheelbase,front_track,rear_track
0,201609,289403,94,12,2,1,1,1,6,MT,...,170.0,4440,1833,1545,1695,1320,5,2700,1556,1562
1,201609,745137,435,637,3,2,1,2,6,DCT,...,159.0,4534,1823,1483,1711,1336,5,2648,1553,1544
2,201609,714860,180,831,3,2,2,3,6,AT,...,176.0,4720,1815,1465,1860,1459,5,2770,1579,1589
3,201609,175962,40,750,3,2,1,4,6,AT,...,155.0,4475,1706,1469,1625,1145,5,2603,1460,1500
4,201609,270690,19,98,2,3,3,1,5,MT,...,146.5,4415,1685,1850,1825,1236,5,2720,1420,1440


In [24]:
print(list(train))

['sale_date', 'class_id', 'sale_quantity', 'brand_id', 'compartment', 'type_id', 'level_id', 'department_id', 'TR', 'gearbox_type', 'displacement', 'if_charging', 'price_level', 'price', 'driven_type_id', 'fuel_type_id', 'newenergy_type_id', 'emission_standards_id', 'if_MPV_id', 'if_luxurious_id', 'power', 'cylinder_number', 'engine_torque', 'car_length', 'car_width', 'car_height', 'total_quality', 'equipment_quality', 'rated_passenger', 'wheelbase', 'front_track', 'rear_track', 'sale_month', 'sale_year']


In [25]:
print(train.isna().sum())

sale_date                0
class_id                 0
sale_quantity            0
brand_id                 0
compartment              0
type_id                  0
level_id                 0
department_id            0
TR                       0
gearbox_type             0
displacement             0
if_charging              0
price_level              0
price                    0
driven_type_id           0
fuel_type_id             0
newenergy_type_id        0
emission_standards_id    0
if_MPV_id                0
if_luxurious_id          0
power                    0
cylinder_number          0
engine_torque            0
car_length               0
car_width                0
car_height               0
total_quality            0
equipment_quality        0
rated_passenger          0
wheelbase                0
front_track              0
rear_track               0
sale_month               0
sale_year                0
dtype: int64


In [26]:
print(train.dtypes)

sale_date                  int64
class_id                   int64
sale_quantity              int64
brand_id                   int64
compartment                int64
type_id                    int64
level_id                  object
department_id              int64
TR                        object
gearbox_type              object
displacement             float64
if_charging               object
price_level               object
price                     object
driven_type_id             int64
fuel_type_id              object
newenergy_type_id          int64
emission_standards_id      int64
if_MPV_id                  int64
if_luxurious_id            int64
power                     object
cylinder_number            int64
engine_torque             object
car_length                 int64
car_width                  int64
car_height                 int64
total_quality              int64
equipment_quality          int64
rated_passenger           object
wheelbase                  int64
front_trac

In [36]:
train = train.sort_values(by=['sale_date','class_id']).reset_index(drop=True)

In [40]:
train[:2]

Unnamed: 0,sale_date,class_id,sale_quantity,brand_id,compartment,type_id,level_id,department_id,TR,gearbox_type,displacement,if_charging,price_level,price,driven_type_id,fuel_type_id,newenergy_type_id,emission_standards_id,if_MPV_id,if_luxurious_id,power,cylinder_number,engine_torque,car_length,car_width,car_height,total_quality,equipment_quality,rated_passenger,wheelbase,front_track,rear_track,sale_month,sale_year
0,201201,125403,49,761,2,3,2,2,6,AT,2.4,L,35-50W,-,1,1,1,3,2,1,123,4,225,5256,1878,1772,2470,1840,7,3088,1593,1601,1,12
1,201201,125403,16,761,2,3,2,2,6,AT,2.4,L,35-50W,-,1,1,1,3,2,1,123,4,225,5213,1847,1750,2380,1840,7,3088,1593,1601,1,12


### Data Exploration

In [22]:
train['sale_month'] = train['sale_date']%100
train['sale_year'] = train['sale_date']//100%100
test['sale_month'] = test['predict_date']%100
test['sale_year'] = test['predict_date']//100%100

In [46]:
# only include the maximum sale quantity rows for each class_id & month, and put the total sales in there
# so the dataframe merge_train have unique class_id * month rows
merge_train = pd.DataFrame()
all_class_id = train['class_id'].unique().tolist()

for class_id in all_class_id:
    all_month = train.loc[train['class_id']== class_id, 'sale_date'].unique().tolist()
    for month in all_month:
        max_val  = train.loc[(train['class_id']==class_id) & (train['sale_date']==month), 'sale_quantity'].max()
        sale_sum = train.loc[(train['class_id']==class_id) & (train['sale_date']==month), 'sale_quantity'].sum()
        feat_val = train.loc[(train['class_id']==class_id) & (train['sale_date']==month) & (train['sale_quantity']==max_val), :]
        feat_val['sale_quantity'] = sale_sum
        merge_train = pd.concat([merge_train, feat_val[0:1]])
        
merge_train = merge_train.sort_values(by=['sale_date'], ascending=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


In [47]:
merge_train.shape

(5587, 34)

In [48]:
merge_train = merge_train[['sale_date', 'class_id', 'sale_quantity', 'sale_month', 'sale_year']]
cols = ['sale_date', 'class_id', 'sale_quantity', 'sale_month', 'sale_year']
############### Remove Nov ##################
merge_train = merge_train.loc[merge_train['sale_month']!=11, :]
test = test.loc[:, cols]
test['sale_date'] = test['sale_date'].fillna(201712)
test['sale_date'] = test['sale_date'].astype(int)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


In [51]:
# looking for class_id that has less than 14 training months
months_data_per_class = merge_train[['class_id','sale_quantity']].groupby(['class_id']).count().reset_index()

In [58]:
print('Number of classes with less than 14 months records :', 
      months_data_per_class.loc[months_data_per_class['sale_quantity'] < 14, 'class_id'].shape[0])

Number of classes with less than 14 months records : 34


In [50]:
print(merge_train.shape, test.shape)
class_12 = count[count['sale_quantity']<14]
class_12 = class_12.index.tolist()
all_data = pd.concat((merge_train, test)).reset_index(drop=True)
for idx in class_12:
    li = []
    li = all_data[all_data['class_id']==idx].index.tolist()
    all_data.drop(all_data.index[li],inplace=True)
    all_data = all_data.reset_index(drop=True)

(5195, 5) (140, 5)
