In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file in it.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root_dir, file_name) for file_name in file_names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Introduction

**Mobile operators commonly use subscriber information (basic personal details, consumption habits, preferences, etc.) to match each user with the most suitable plan; the result is also useful for follow-up personalized services.**

**This scenario is derived from mobile operators.**

**The goal is to predict each user's plan (`current_service`) accurately.**

# Data files
First-round train set and test set: train_all.csv, test_1.csv  <br />
Second-round train set and test set: train_2.csv, test_2.csv <br />
**Here we simply combine the two rounds and treat them as one data set.** <br />

# Data attributes
| Attribute     | Meaning| Type|  Comment |
|:-------:|:-------:|:-------:|:-------:|
|USERID|	User ID|	VARCHAR2(50)|	User code，primary key|
|current_service|	Plan	|VARCHAR2(500)	|plan code|
|service_type	|Plan type	|VARCHAR2(10)	|0：2G 3Gmix，1：2I2C，2：2G，3：3G，4：4G|
|is_mix_service	|If mix service|	VARCHAR2(10)|	1.True 0.False|
|online_time	|Online time|	VARCHAR2(50)	|/|
|1_total_fee|	Total billing amount current month	|NUMBER|	￥|
|2_total_fee	|Total billing amount last month|	NUMBER	|￥|
|3_total_fee|	Total billing amount last last month|	NUMBER	|￥|
|4_total_fee	|Total billing amount last last last month	|NUMBER|	￥|
|month_traffic	|Month traffic	|NUMBER|	MB|
|many_over_bill|	Successive over bill	|VARCHAR2(500)|	1.True 0.False|
|contract_type|	Contract type|	VARCHAR2(500)	|ZBG_DIM.DIM_CBSS_ACTIVITY_TYPE|
|contract_time|	Contract time|	VARCHAR2(500)|	/|
|is_promise_low_consume	|If promise low consumer|	VARCHAR2(500)	|1.True 0.False|
|net_service	|Net service|	VARCHAR2(500)	|20AAAAAA-2G|
|pay_times	|Pay times	|NUMBER	|Time|
|pay_num	|Pay number	|NUMBER	|￥|
|last_month_traffic	|Last month traffic rest|	NUMBER|	MB|
|local_trafffic_month|	Local cumulative traffic month	|NUMBER	|MB|
|local_caller_time|	Local cumulative caller time|	NUMBER|	Minute|
|service1_caller_time	|Service1_caller_time|	NUMBER	|Minute|
|service2_caller_time	|Service2_caller_time|	NUMBER	|Minute|
|gender|	Gender	|varchar2(100)	|01.male 02.female|
|age|	Age|	varchar2(100)|	/|
|complaint_level	|Complaint level|	VARCHAR2(1000)	|1：normal，2：important，3：significant|
|former_complaint_num|Former complaint times|	NUMBER	|Time|
|former_complaint_fee|	Former complaint fee	|NUMBER	|￥|

# Environment dependency
- python3
- pandas
- scikit-learn
- gensim
- xgboost/LightGBM/Catboost

## Data exploration

In [None]:
import pandas as pd
import warnings
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings(action="ignore")
# First-round training set, loaded for an initial look at the data.
data = pd.read_csv(
    "/kaggle/input/user-package-information-of-mobile-operators/train_all.csv"
)

In [None]:
# Preview the first rows of the loaded table.
data.head()

In [None]:
# Print the per-column summary (dtype and non-null count) of the table.
data.info()

In [None]:
# Show the column labels of the table.
data.columns

#  Feature engineering1: time series data vectorization by word2vec

Here we convert total_fee for four consecutive months into four 10-dimension vectors, because **word2vec is good at representing sequence features**

In [None]:
import os
import pandas as pd
from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import multiprocessing
import numpy as np

In [None]:
# Writable output root and read-only dataset root.
path_write='./'
path = '../input/user-package-information-of-mobile-operators'

# Directory for the exported word2vec vectors; it is created (and its path
# printed) only when it does not exist yet.
save_path = f'{path_write}/w2v'
if not os.path.exists(save_path):
    print(save_path)
    os.makedirs(save_path)

# Raw tables: train_2 / test_2 plus train_all.
train = pd.read_csv(f'{path}/train_2.csv')
test = pd.read_csv(f'{path}/test_2.csv')
train1 = pd.read_csv(f'{path}/train_all.csv')

In [None]:
# Stack the three tables, renumber the rows, shuffle them with a fixed seed,
# zero-fill missing values, then replace the literal '\N' placeholder with 999.
data = pd.concat([train, test, train1])
data = data.reset_index(drop=True)
data = data.sample(frac=1, random_state=2020)
data = data.fillna(0)
data = data.replace('\\N', 999)



In [None]:
#word2vec
# Imported here only to read the installed gensim version (see keyword names below).
import gensim

fee_columns = ['1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee']

# One "sentence" per row: the four monthly fees, each rendered as a float-string token.
sentence = [[str(float(fee)) for fee in row] for row in data[fee_columns].values]

# set vector dimension
L = 10

# gensim 4.0 renamed `size` -> `vector_size` and `iter` -> `epochs`; choose the
# keyword names that match the installed release so the cell runs on both.
if int(gensim.__version__.split('.')[0]) >= 4:
    w2v_kwargs = {'vector_size': L, 'epochs': 10}
else:
    w2v_kwargs = {'size': L, 'iter': 10}

# word2vec training
print('word2vec start training...')
model = Word2Vec(sentence, window=2, min_count=1, workers=multiprocessing.cpu_count(), **w2v_kwargs)
print('embedding vecters saved...')

# word2vec extracting: one CSV per fee column, mapping each distinct fee value to its vector
for fea in fee_columns:
    # distinct values of this fee column
    values = set(data[fea].values)

    print(len(values))

    # Row layout: [fee value, W0, ..., W(L-1)]. Vectors are read through `model.wv`;
    # indexing the model object directly is deprecated and was removed in gensim 4.
    w2v = [[value, *model.wv[str(float(value))]] for value in values]
    out_df = pd.DataFrame(w2v)

    # rename DataFrame columns: <fea>, <fea>W0 ... <fea>W(L-1)
    name = [fea] + [fea + 'W' + str(dim) for dim in range(L)]
    out_df.columns = name
    print(out_df.columns)
    out_df.to_csv(save_path + '/' + fea + '_w2v.csv', index=False)

## word2vec result visualization

In [None]:
import pandas as pd
import multiprocessing
import numpy as np
import random
import sys
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Reload the exported vectors of the current-month fee.
df = pd.read_csv(f'{save_path}/1_total_fee_w2v.csv')
# Fee values as strings (used as point labels) and the list of column names.
l = df['1_total_fee'].astype('str').tolist()
name = df.columns.tolist()

# visualization
def plot_with_labels(low_dim_embs, labels, filename = './tsne.png'):
    """Scatter 2-D embeddings, annotate each point with its label and save the figure.

    Parameters
    ----------
    low_dim_embs : 2-D array indexed as ``low_dim_embs[i, :]``; each row must
        unpack into exactly two coordinates (x, y).
    labels : sequence of annotation texts; ``labels[i]`` belongs to row ``i``.
    filename : path the figure is written to with ``plt.savefig``.

    Raises
    ------
    AssertionError
        If there are more labels than embedding rows.
    """
    assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    plt.figure(figsize= (10, 18))
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i, :]
        plt.scatter(x, y)
        # `xytext` must be given explicitly: with textcoords='offset points' it
        # defaults to `xy`, which would shift every label by its own data
        # coordinates (read as points) instead of by a small fixed offset.
        plt.annotate(label, xy = (x, y), xytext = (5, 2), textcoords = 'offset points',
                     ha = 'right', va = 'bottom')
    plt.savefig(filename)

# t-sne dimensionality reduction (10 to 2)
tsne_options = dict(perplexity = 30, n_components = 2, init = 'pca')
try:
    # Recent scikit-learn releases name the iteration budget `max_iter`.
    tsne = TSNE(max_iter = 5000, **tsne_options)
except TypeError:
    # Older releases only know the original `n_iter` keyword.
    tsne = TSNE(n_iter = 5000, **tsne_options)
plot_only = 300
low_dim_embs = tsne.fit_transform(df.iloc[:plot_only][name[1:]])
# Slice instead of indexing 0..plot_only-1 so that a table with fewer than
# `plot_only` rows does not raise IndexError; the labels stay aligned with
# the rows selected by `df.iloc[:plot_only]`.
labels = l[:plot_only]
# plot
plot_with_labels(low_dim_embs, labels)

# Feature engineering2: original, cumulative, time series, combination features

In [None]:
import os
import pandas as pd
import lightgbm as lgb
import numpy as np
from sklearn.metrics import f1_score

In [None]:
# file path

path = '../input/user-package-information-of-mobile-operators'
w2v_path = './w2v'
train = pd.read_csv(f'{path}/train_2.csv')
test = pd.read_csv(f'{path}/test_2.csv')
train_first = pd.read_csv(f'{path}/train_all.csv')

# data_type marks the source table: 0 for train_2 / test_2 rows, 1 for train_all rows
for frame, flag in ((train, 0), (test, 0), (train_first, 1)):
    frame['data_type'] = flag

# data concat (3 tables above); missing values become 0
data = pd.concat([train, test, train_first], ignore_index=True)
data = data.fillna(0)

# set label: current_service as int (0 where current_service was missing and got zero-filled)
data['label'] = data['current_service'].astype(int)
# swap the literal '\N' placeholder for the sentinel 999
data = data.replace('\\N', 999)

# transform gender type to int
data['gender'] = data['gender'].astype(int)

# original category features
origin_cate_feature = [
    'service_type', 'complaint_level', 'contract_type', 'gender',
    'is_mix_service', 'is_promise_low_consume', 'many_over_bill', 'net_service',
]

# original number features
origin_num_feature = [
    '1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee',
    'age', 'contract_time',
    'former_complaint_fee', 'former_complaint_num',
    'last_month_traffic', 'local_caller_time', 'local_trafffic_month', 'month_traffic',
    'online_time', 'pay_num', 'pay_times', 'service1_caller_time', 'service2_caller_time',
]

# number features to float
for i in origin_num_feature:
    data[i] = data[i].astype(float)

# import saved word2vec features: left-join one vector table per fee column on the fee value
w2v_features = []
for col in ['1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee']:
    df = pd.read_csv(w2v_path + '/' + col + '_w2v.csv').drop_duplicates([col])
    # every column except the join key is a new vector feature
    w2v_features += [vec_col for vec_col in df.columns if vec_col != col]
    data = pd.merge(data, df, on=col, how='left')
# filled by feature_count() with the name of every count feature it creates
count_feature_list = []

# cumulative features
# function for cumulative features, 2 parameters required: orginial dataframe and features(list) to be counted
def feature_count(data, features=None):
    """Add a column holding, for each row, how many rows share its value(s) in `features`.

    The new column is named ``count_<f1>_<f2>...`` (any ``add_`` substring is
    stripped from each feature name) and its name is recorded once in the
    module-level ``count_feature_list``.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified, a new frame is returned.
    features : list of str, optional
        Column names to group by. Must be non-empty.

    Returns
    -------
    pandas.DataFrame
        ``data`` left-joined with the group sizes. If `features` contains
        duplicates, a message is printed and ``data`` is returned unchanged.

    Raises
    ------
    ValueError
        If `features` is empty or omitted.
    """
    # A `None` default avoids sharing one mutable list between calls.
    if features is None:
        features = []
    if len(features) == 0:
        raise ValueError('feature_count needs at least one feature to group by')
    if len(set(features)) != len(features):
        print('equal feature !!!!')
        return data

    new_feature = 'count'
    for feature_name in features:
        new_feature += '_' + feature_name.replace('add_', '')

    # Drop a stale column from a previous call explicitly (no bare except) and
    # without mutating the caller's frame.
    if new_feature in data.columns:
        data = data.drop(columns=new_feature)

    group_sizes = data.groupby(features).size().reset_index(name=new_feature)
    data = data.merge(group_sizes, 'left', on=features)

    # Register the name only once so repeated calls cannot create duplicate
    # entries (and later duplicate model columns).
    if new_feature not in count_feature_list:
        count_feature_list.append(new_feature)
    return data

# Columns whose value frequencies are turned into count features.
count_base_columns = [
    '1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee',
    'former_complaint_fee', 'pay_num', 'contract_time',
    'last_month_traffic', 'online_time',
]

# counting features (groupby one feature)
for base_col in count_base_columns:
    data = feature_count(data, [base_col])

# counting features (groupby two features): each base column paired with
# service_type first, then with contract_type
for i in ['service_type', 'contract_type']:
    for base_col in count_base_columns:
        data = feature_count(data, [i, base_col])

# Time series features (reflect trend): names of every derived column created below
diff_feature_list = ['diff_total_fee_1', 'diff_total_fee_2', 'diff_total_fee_3', 'last_month_traffic_rest',
                     'rest_traffic_ratio',
                     'total_fee_mean', 'total_fee_max', 'total_fee_min', 'total_caller_time', 'service2_caller_ratio',
                     'local_caller_ratio',
                     'total_month_traffic', 'month_traffic_ratio', 'last_month_traffic_ratio', 'pay_num_1_total_fee',
                     '1_total_fee_call_fee', '1_total_fee_call2_fee', '1_total_fee_trfc_fee']
# Time series features: difference
data['diff_total_fee_1'] = data['1_total_fee'] - data['2_total_fee']
data['diff_total_fee_2'] = data['2_total_fee'] - data['3_total_fee']
data['diff_total_fee_3'] = data['3_total_fee'] - data['4_total_fee']
data['pay_num_1_total_fee'] = data['pay_num'] - data['1_total_fee']
data['last_month_traffic_rest'] = data['month_traffic'] - data['last_month_traffic']
# Floor negative values at 0. Done with a single .loc assignment: the chained
# form `data[col][mask] = 0` writes to a temporary object and is a silent no-op
# under pandas copy-on-write.
data.loc[data['last_month_traffic_rest'] < 0, 'last_month_traffic_rest'] = 0

# Time series features: ratio
# NOTE(review): the 15 / 1024 factor looks like a price per traffic unit — confirm.
data['rest_traffic_ratio'] = (data['last_month_traffic_rest'] * 15 / 1024) / data['1_total_fee']

# Time series features: mean,max,min over the four monthly fees
total_fee = [str(month) + '_total_fee' for month in range(1, 5)]
data['total_fee_mean'] = data[total_fee].mean(axis=1)
data['total_fee_max'] = data[total_fee].max(axis=1)
data['total_fee_min'] = data[total_fee].min(axis=1)

# Combination features: others
data['total_caller_time'] = data['service2_caller_time'] + data['service1_caller_time']
data['service2_caller_ratio'] = data['service2_caller_time'] / data['total_caller_time']
data['local_caller_ratio'] = data['local_caller_time'] / data['total_caller_time']

data['total_month_traffic'] = data['local_trafffic_month'] + data['month_traffic']
data['month_traffic_ratio'] = data['month_traffic'] / data['total_month_traffic']
data['last_month_traffic_ratio'] = data['last_month_traffic'] / data['total_month_traffic']

# NOTE(review): 0.15 and 0.3 look like assumed unit prices for call time and traffic — confirm.
data['1_total_fee_call_fee'] = data['1_total_fee'] - data['service1_caller_time'] * 0.15
data['1_total_fee_call2_fee'] = data['1_total_fee'] - data['service2_caller_time'] * 0.15
data['1_total_fee_trfc_fee'] = data['1_total_fee'] - (data['month_traffic'] - 2 * data['last_month_traffic']) * 0.3

# The traffic-fee feature is blanked out (missing) for rows with service_type == 1.
data.loc[data.service_type == 1, '1_total_fee_trfc_fee'] = None

# merge features
# category features (same list object as origin_cate_feature)
cate_feature = origin_cate_feature
# number features: original + count + trend/combination + word2vec columns
num_feature = origin_num_feature + count_feature_list + diff_feature_list + w2v_features

# category features astype category (For LightGBM)
for cat_col in cate_feature:
    data[cat_col] = data[cat_col].astype('category')
# number features astype float
for num_col in num_feature:
    data[num_col] = data[num_col].astype(float)

# final features (name list): categorical columns first, numeric columns after
feature = cate_feature + num_feature

# features numbers and name
print("##final features length##: "+str(len(feature)), '\n','##final features names##: ',feature)

# drop rows whose label is 999999
data = data[data.label != 999999]

# features(X) and label(y) for training and testing: rows with a non-zero label
# (999999 rows were already removed on the line above)
X = data.loc[data.label != 0, feature].reset_index(drop=True)
y = data.loc[data.label != 0, 'label'].reset_index(drop=True)

In [None]:
# Two-way mapping between the sorted distinct plan codes in y and dense indices 0..k-1.
label2current_service = dict(enumerate(sorted(set(y))))
current_service2label = {
    service_code: dense_idx for dense_idx, service_code in enumerate(sorted(set(y)))
}
current_service2label

# Training and testing with LightGBM (StratifiedKFold)

In [None]:
# StratifiedKFold + CrossValidation
from sklearn.model_selection import StratifiedKFold
cv_pred = []
skf = StratifiedKFold(n_splits=5, random_state=2020, shuffle=True)
for index, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(index)
    # a fresh classifier per fold; only the seed differs between folds
    lgb_model = lgb.LGBMClassifier(
        boosting_type="gbdt",
        num_leaves=10,
        max_depth=8,
        n_estimators=2,
        objective='multiclass',
        class_weight='balanced',
        subsample=0.65,
        colsample_bytree=0.65,
        subsample_freq=1,
        learning_rate=0.05,
        random_state=2020 + index,
        n_jobs=-1,
        metric="None",
        importance_type='gain',
    )
    train_x, train_y = X.loc[train_index], y.loc[train_index]
    test_x, test_y = X.loc[test_index], y.loc[test_index]

    # only train by those service_type == 4 this time; labels are re-aligned
    # to the surviving feature rows through their index
    train_x = train_x[train_x.service_type == 4]
    train_y = train_y.loc[train_x.index]
    test_x = test_x[test_x.service_type == 4]
    test_y = test_y.loc[test_x.index]
    print(test_y.unique())

    lgb_model.fit(train_x, train_y, categorical_feature=cate_feature)

    # predict on the same unlabeled rows in every fold (label == 0, service_type != 1),
    # then convert the predicted plan codes to dense class indices
    predict_rows = (data.label == 0) & (data.service_type != 1)
    y_test = pd.Series(lgb_model.predict(data[predict_rows][feature])).map(current_service2label)

    # save prediction to cv_pred: one column per fold
    fold_column = np.array(y_test).reshape(-1, 1)
    cv_pred = fold_column if index == 0 else np.hstack((cv_pred, fold_column))

In [None]:
#Testing 
# Macro-averaged F1 on a validation split. lgb_model, test_x and test_y are the
# values left over from the final iteration of the cross-validation loop, so
# this scores the last fold only, not an average over all folds.
f1_score(y_true=test_y, y_pred=lgb_model.predict(test_x), average='macro')

In [None]:
# forecast

# Majority vote per row of cv_pred: keep the class index predicted by the most
# folds (np.argmax returns the smallest index when counts tie).
submit = [np.argmax(np.bincount(fold_votes)) for fold_votes in cv_pred]

# Pair each vote with the user_id of the rows that were predicted
# (label == 0 and service_type != 1), then map class indices back to plan codes.
result = pd.DataFrame()
result['user_id'] = data.loc[(data.label == 0) & (data.service_type != 1), 'user_id']
result['predict'] = pd.Series(submit, index=result.index).map(label2current_service)
# hard-coded override for one specific user
result.loc[result['user_id'] == '4VNcD6kE0sjnAvFX', 'predict'] = 999999

print(result.sort_values('user_id'))