In [1]:
import pandas as pd
import numpy as np
import warnings

# Silence every warning for the rest of the session.
warnings.filterwarnings(action='ignore')

# Root folder of the data set (absolute, machine-specific location); the
# train and test CSV files sit in their own sub-folders below it.
path = "C:/Data/chinaUnicom/"
train_csv, test_csv = path + "train/train_.csv", path + "test/test_.csv"
train_data = pd.read_csv(train_csv)
test_data = pd.read_csv(test_csv)

In [2]:
# Feature engineering
def increaseFeatures(train):
    """Add derived fee, traffic and call-time columns to ``train``.

    The frame is modified in place (every feature is assigned directly onto
    ``train``) and the same object is returned.

    Parameters
    ----------
    train : pandas.DataFrame
        Must provide the columns read below: ``1_total_fee`` .. ``4_total_fee``,
        ``pay_num``, ``pay_times``, ``contract_time``, ``many_over_bill``,
        ``online_time``, ``month_traffic``, ``local_trafffic_month``,
        ``last_month_traffic``, ``local_caller_time``, ``service1_caller_time``
        and ``service2_caller_time``.

    Returns
    -------
    pandas.DataFrame
        The same ``train`` object, with the derived columns added.

    Raises
    ------
    KeyError
        If one of the required columns is missing.

    Notes
    -----
    ``pay_times`` and ``online_time`` are used as denominators without any
    offset, so zeros in those columns propagate as inf/NaN into
    ``pay_num_per`` and ``contract_online_time_rate``.
    """
    # Fees
    # The four monthly fees are summed once and reused. An earlier revision
    # added '3_total_fee' twice and never read '4_total_fee', which skewed
    # both the sum and the 4-month average.
    total_fee = (train['1_total_fee'] + train['2_total_fee']
                 + train['3_total_fee'] + train['4_total_fee']).astype(float)
    train['avg_total_fee'] = total_fee / 4.0
    train['sum_total_fee'] = total_fee
    train['pay_num_per'] = train['pay_num'].astype(float)/train['pay_times']
    train['contract_timex_feex_over_bill'] = (train['avg_total_fee']*train['contract_time']*train['many_over_bill']).astype(float)
    train['contract_online_time_rate'] = (train['contract_time']/train['online_time']).astype(float)

    # Traffic
    # The +1 offset presumably keeps the ratios below from dividing by zero.
    train['sum_traffic_month'] = train['month_traffic'] + train['local_trafffic_month']+1
    train['out_local_traffic_rate'] = train['month_traffic'].astype(float)/train['sum_traffic_month']
    train['local_sum_traffic_rate'] = train['local_trafffic_month'].astype(float)/train['sum_traffic_month']
    train['last_month_sum_traffic_rate'] = train['last_month_traffic'].astype(float)/train['sum_traffic_month']

    # Call time
    # 'sum_caller_receive_time' covers all three call-time columns (+1 offset);
    # 'sum_caller_time' leaves out service2_caller_time and has no offset.
    train['sum_caller_receive_time'] = train['local_caller_time']+train['service1_caller_time']+train['service2_caller_time']+1
    train['sum_caller_time'] = train['local_caller_time']+train['service1_caller_time']
    train['caller_receive_time_rate'] = train['sum_caller_time'].astype(float)/(train['sum_caller_receive_time'])

    # Traffic + call time + fee
    # NOTE: despite its name, 'month_traffic_total_fee_rate' is traffic divided
    # by call time, and 'month_traffic_caller_rate' is traffic divided by the
    # first month's fee. The column names are kept as they are.
    train['month_traffic_total_fee_rate'] = train['sum_traffic_month'].astype(float)/train['sum_caller_receive_time']
    train['month_traffic_caller_rate'] = train['sum_traffic_month'].astype(float)/(train['1_total_fee']+1)
    train['caller_time_total_fee_rate'] = train['sum_caller_receive_time'].astype(float)/(train['1_total_fee']+1)

    # Traffic x call-time interaction terms (in-province / local)
    train['local_trafficx_local_caller'] = train['local_trafffic_month']*train['local_caller_time'].astype(float)
    train['month_trafficx_service1_call'] = train['month_traffic']*train['service1_caller_time'].astype(float)

    return train

# Derive the engineered columns on both splits with the same function.
train_data, test_data = increaseFeatures(train_data), increaseFeatures(test_data)

In [3]:
# Label helper: encode the services as integer codes (0-14) or as one-hot vectors
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

def service2label(current_service, One_hot = False):
    """Encode service identifiers as integer class codes or one-hot rows.

    Parameters
    ----------
    current_service : object exposing ``.unique()`` (e.g. a pandas Series)
        The values to encode.
    One_hot : bool, default False
        When equal to ``True`` the codes are expanded into a dense one-hot
        array; otherwise the 1-D array of codes is returned.

    Returns
    -------
    numpy.ndarray
        The encoded labels. The encoder is fitted on the sorted distinct
        values of ``current_service``.
    """
    classes = sorted(current_service.unique())
    encoder = LabelEncoder().fit(classes)
    codes = encoder.transform(current_service)
    if One_hot == True:
        # Fit the one-hot encoder on the code of every known class, then
        # expand the per-row codes into a dense indicator matrix.
        class_codes = encoder.transform(classes).reshape(-1, 1)
        one_hot = OneHotEncoder()
        one_hot.fit(class_codes)
        return one_hot.transform(codes.reshape(-1, 1)).toarray()
    return codes
# label 处理
current_service_label = train_data['current_service']
label = service2label(current_service_label,One_hot = False)

In [4]:
original_feature = train_data.columns

# Columns kept out of the model input, among them the target
# ('current_service') and the row identifier ('user_id').
dropped_columns = ['current_service','user_id','sum_total_fee','net_service','complaint_level','former_complaint_num','former_complaint_fee']

# DataFrame.dropna / DataFrame.fillna return new frames. The previous bare
# calls threw their results away, so missing values were never handled.
# Missing values are now filled with 0 on the feature frame. Rows are
# deliberately NOT dropped: `label` was encoded from every row of train_data
# and must stay aligned row-for-row with `train` for the split that follows.
train = train_data.drop(dropped_columns, axis=1).fillna(0)
original_feature_new = train.columns

In [5]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_validate, y_train, y_validate = train_test_split(
    train[original_feature_new],
    label,
    test_size=0.3,
    random_state=100,
)

In [None]:
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingClassifier
import lightgbm as lgb
import xgboost as xgb
import numpy as np
 
# Base learners for the stack.
clf1 = KNeighborsClassifier(n_neighbors=5)
clf2 = RandomForestClassifier(n_estimators=500,random_state=200,criterion='gini')
# NOTE(review): xgboost documents `verbosity` as its logging parameter; confirm
# that `verbose=1` has the intended effect with the installed version.
clf3 = xgb.XGBClassifier(max_depth=30, min_child_weight=1, n_estimators=300,n_jobs=-1 ,verbose=1,learning_rate=0.025)
clf4 = GradientBoostingClassifier(n_estimators=500,learning_rate=0.1,random_state=10,subsample=0.8)

# Meta-learner: a logistic regression trained on the base learners' class
# probabilities (use_probas=True), concatenated rather than averaged
# (average_probas=False).
lr = LogisticRegression()
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3,clf4],
                          use_probas=True,
                          average_probas=False,
                          meta_classifier=lr)

print('4-fold cross validation:\n')

# The loop variable is named `model_name`, not `label`: `label` is the encoded
# target array built earlier in the notebook, and reusing the name here
# silently replaced it with the last display string.
for clf, model_name in zip([clf1, clf2, clf3,clf4, sclf],['KNN', 'Random Forest','XGB','GDBT','StackingClassifier']):
    scores = model_selection.cross_val_score(clf, X_train, y_train,cv=4, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]"  % (scores.mean(), scores.std(), model_name))

In [None]:
# Map predicted integer class codes back to the original service identifiers.
# service2label assigns codes in sorted order of the distinct training
# values, so position i of the sorted list is the service behind code i.
# The mapping is built from however many classes are present instead of a
# hard-coded 15, and the predictions are rebuilt as a new array rather than
# written element by element into the code array.
# NOTE(review): `pred` is not assigned in any cell shown here; it is
# presumably the model's prediction for test_data. Confirm that it is defined
# before this cell runs, and run this cell only once per prediction (a second
# run would try to decode already-decoded values).
service_classes = sorted(train_data['current_service'].unique())
labeldic = dict(enumerate(service_classes))
pred = np.array([labeldic[int(code)] for code in pred])

In [None]:
import csv

# Write the submission file: a header row, then one (user_id, predicted
# service) row per test user.
user_ids = test_data['user_id']
# Fail before touching the file if the predictions do not line up with the
# test rows, instead of stopping half-way through the write.
if len(pred) != len(user_ids):
    raise ValueError(
        "pred has %d entries but test_data has %d rows" % (len(pred), len(user_ids))
    )
with open(path+"stacking_submission.csv","w",newline='',encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["user_id","current_service"])
    # Pair the values positionally. Indexing the Series with the loop counter
    # is a label lookup and only works while the index is exactly 0..n-1.
    writer.writerows(zip(user_ids, pred))