In [1]:
# Import the libraries used throughout the notebook
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

import warnings
# Silences every warning category for the rest of the session, including
# deprecation warnings raised by pandas — remove while debugging.
warnings.filterwarnings("ignore")

%matplotlib inline

In [3]:
# Load the raw CSV tables.
# The shared directory lives in a single constant so the notebook can be
# pointed at another data location by editing one line; the resulting paths
# are identical to the previously hard-coded ones.
DATA_DIR = './data_format1'

test_data = pd.read_csv(f'{DATA_DIR}/test_format1.csv')        # user/merchant pairs with an empty `prob` column
train_data = pd.read_csv(f'{DATA_DIR}/train_format1.csv')      # user/merchant pairs with `label`
user_info = pd.read_csv(f'{DATA_DIR}/user_info_format1.csv')   # user feature table (age_range, gender)
user_log = pd.read_csv(f'{DATA_DIR}/user_log_format1.csv')     # user activity log

In [4]:
# Show the column index of every table: train, test, user info, user log.
for frame in (train_data, test_data, user_info, user_log):
    print(frame.columns)

Index(['user_id', 'merchant_id', 'label'], dtype='object')
Index(['user_id', 'merchant_id', 'prob'], dtype='object')
Index(['user_id', 'age_range', 'gender'], dtype='object')
Index(['user_id', 'item_id', 'cat_id', 'seller_id', 'brand_id', 'time_stamp',
       'action_type'],
      dtype='object')


In [5]:
# Report (rows, columns) of every table, one tuple per line,
# in the order: train, test, user info, user log.
print(
    *(frame.shape for frame in (train_data, test_data, user_info, user_log)),
    sep='\n',
)

(260864, 3)
(261477, 3)
(424170, 3)
(210945, 7)


In [8]:
# DataFrame.info() writes its report directly to stdout and returns None, so
# wrapping it in print() only appended a stray "None" line after each report.
# Call it directly; the '*' rulers still separate the four reports.
train_data.info()
print('*'*50)
test_data.info()
print('*'*50)
user_info.info()   # the feature table has missing values
print('*'*50)
user_log.info()    # the log table has missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260864 entries, 0 to 260863
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype
---  ------       --------------   -----
 0   user_id      260864 non-null  int64
 1   merchant_id  260864 non-null  int64
 2   label        260864 non-null  int64
dtypes: int64(3)
memory usage: 6.0 MB
None
**************************************************
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 261477 entries, 0 to 261476
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   user_id      261477 non-null  int64  
 1   merchant_id  261477 non-null  int64  
 2   prob         0 non-null       float64
dtypes: float64(1), int64(2)
memory usage: 6.0 MB
None
**************************************************
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 424170 entries, 0 to 424169
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype  
---  ----

In [12]:
# Fraction of missing values per column: first the user feature table,
# then (after a ruler line) the activity log.
column1 = ['age_range', 'gender']
column2 = ['cat_id', 'seller_id', 'brand_id', 'time_stamp', 'action_type']

n_info_rows = len(user_info)
for column in column1:
    n_missing = user_info[column].isna().sum()
    print(column, n_missing / n_info_rows)

print('='*100)

n_log_rows = len(user_log)
for column in column2:
    n_missing = user_log[column].isna().sum()
    print(column, n_missing / n_log_rows)

age_range 0.005226677982884221
gender 0.01517316170403376
cat_id 4.740572187062979e-06
seller_id 4.740572187062979e-06
brand_id 0.0012041053355139966
time_stamp 4.740572187062979e-06
action_type 4.740572187062979e-06


In [14]:
# Number of rows per category value for the two user attributes and for the
# training label. Rows whose grouping key is missing are not counted.
for frame, key in (
    (user_info, 'age_range'),
    (user_info, 'gender'),
    (train_data, 'label'),
):
    print(frame.groupby([key])[['user_id']].count())

           user_id
age_range         
0.0          92914
1.0             24
2.0          52871
3.0         111654
4.0          79991
5.0          40777
6.0          35464
7.0           6992
8.0           1266
        user_id
gender         
0.0      285638
1.0      121670
2.0       10426
       user_id
label         
0       244912
1        15952


# 特征工程

In [15]:
# Work on copies so the frames loaded from disk stay untouched.
train, test = train_data.copy(), test_data.copy()
info, log = user_info.copy(), user_log.copy()   # info: user features, log: activity log

In [16]:
# Check that the working copies have the same shapes as the loaded tables.
print('\n'.join(str(frame.shape) for frame in (train, test, info, log)))

(260864, 3)
(261477, 3)
(424170, 3)
(210945, 7)


In [18]:
# Stack the labelled (train) and unlabelled (test) user/merchant pairs, then
# attach each user's profile columns with a left join on user_id.
# DataFrame.append is deprecated and was removed in pandas 2.0; pd.concat with
# its default arguments builds the same stacked frame.
all_data = pd.concat([train, test])
all_data = all_data.merge(info, on=['user_id'], how='left')

print(all_data.shape)

In [21]:
# Column dtypes and non-null counts of the combined train + test frame.
# NOTE(review): the saved output has no `prob` column, so the drop cell
# (In [20]) appears to have been executed before this one — confirm cell order.
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 522341 entries, 0 to 522340
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   user_id      522341 non-null  int64  
 1   merchant_id  522341 non-null  int64  
 2   label        260864 non-null  float64
 3   age_range    519763 non-null  float64
 4   gender       514796 non-null  float64
dtypes: float64(3), int64(2)
memory usage: 23.9 MB


In [20]:
# `prob` comes from the test table, where it has no non-null values, so it
# carries no information in the combined frame.
# The drop is non-inplace with errors='ignore' so that re-running this cell
# (or running cells out of order) does not raise KeyError once the column
# is already gone.
all_data = all_data.drop(columns='prob', errors='ignore')

In [22]:
# Preview the first five rows of the merged frame.
all_data.head()

Unnamed: 0,user_id,merchant_id,label,age_range,gender
0,34176,3906,0.0,6.0,0.0
1,34176,121,0.0,6.0,0.0
2,34176,4356,1.0,6.0,0.0
3,34176,2217,0.0,6.0,0.0
4,230784,4818,0.0,0.0,0.0
