In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# List every file below the Kaggle input directory, one full path per line.
input_tree = os.walk('/kaggle/input')
for dirname, _, filenames in input_tree:
    for filename in filenames:
        file_path = os.path.join(dirname, filename)
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
def reduce_mem_usage(df, allow_float16=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    The frame is modified in place and also returned. Memory usage before and
    after, and the relative saving, are printed.

    Only plain NumPy integer, unsigned-integer and float columns are touched:

    * signed integers are narrowed within int8/int16/int32/int64,
    * unsigned integers are narrowed within uint8/uint16/uint32/uint64,
    * floats are narrowed within float16/float32/float64.

    Every other column (object/string, bool, datetime, categorical, nullable
    extension dtypes, ...) is left unchanged. A column is never widened, and
    a column whose min/max is NaN (empty or all-NaN) keeps its dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; mutated in place.
    allow_float16 : bool, default True
        When True, float columns whose range fits are stored as float16.
        float16 keeps only about three significant decimal digits, so pass
        False to use float32 as the smallest float type.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32, np.float64)
    if not allow_float16:
        float_types = float_types[1:]

    for col in df.columns:
        col_type = df[col].dtype

        # Dispatch on the dtype kind instead of the dtype's string name, so
        # unsigned ints and bools are not mistaken for floats and extension
        # dtypes (category, string, nullable ints) are skipped entirely.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        if col_type.kind == 'f':
            candidates, limits_of = float_types, np.finfo
        elif col_type.kind == 'u':
            candidates, limits_of = unsigned_types, np.iinfo
        else:
            candidates, limits_of = signed_types, np.iinfo

        c_min = df[col].min()
        c_max = df[col].max()

        for candidate in candidates:
            # Candidates are ordered by size: stop as soon as they are no
            # narrower than the current dtype, so a column is never widened.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            limits = limits_of(candidate)
            # Inclusive bounds: a value equal to the limit still fits.
            # Comparisons with NaN are False, so such columns are left alone.
            if c_min >= limits.min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the percentage against a zero-sized starting footprint.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))
    return df

In [None]:
def BuildFeature(is_train=True):
    """Read the train or test CSV and build group-level and match-level features.

    For each numeric feature the output contains, in this order: the per-group
    mean, its within-match percentile rank, the per-group max and rank, the
    per-group min and rank, the group size, the per-match mean and the match
    size. The ``matchId``/``groupId`` key columns are dropped at the end.

    Parameters
    ----------
    is_train : bool, default True
        True reads ``train_V2.csv`` (rows with ``maxPlace <= 1`` are removed)
        and yields one output row per (matchId, groupId). False reads
        ``test_V2.csv`` and yields one output row per input row.

    Returns
    -------
    X : pandas.DataFrame
        The feature frame.
    y : numpy.ndarray or None
        Per-group mean of ``winPlacePerc`` as float64 (training only).
    feature_names : list
        Column names of ``X``.
    test_idx : pandas.Series or None
        The ``Id`` column of the test file (test only).
    """
    group_keys = ['matchId', 'groupId']
    y, test_idx = None, None

    if is_train:
        print("Reading train.csv")
        df = pd.read_csv("/kaggle/input/pubg-finish-placement-prediction/train_V2.csv")
        df = df[df['maxPlace'] > 1]
    else:
        print("Reading test.csv")
        df = pd.read_csv("/kaggle/input/pubg-finish-placement-prediction/test_V2.csv")
        test_idx = df.Id

    # Shrink numeric dtypes before the memory-heavy aggregations below.
    df = reduce_mem_usage(df)

    print("Delete Unuseful Columns")
    target = 'winPlacePerc'
    features = list(df.columns)
    for non_feature in ("Id", "matchId", "groupId", "matchType"):
        features.remove(non_feature)

    by_group = df.groupby(group_keys)

    if is_train:
        print("Read Labels")
        y = np.array(by_group[target].agg('mean'), dtype=np.float64)
        features.remove(target)

    df_out = None
    for stat in ('mean', 'max', 'min'):
        print("Read Group {} features".format(stat))
        group_stat = by_group[features].agg(stat)
        group_rank = group_stat.groupby('matchId')[features].rank(pct=True).reset_index()

        if df_out is None:
            # Training output has one row per group; test output keeps one
            # row per player so every test Id receives a prediction.
            df_out = group_stat.reset_index()[group_keys] if is_train else df[group_keys]

        # The first merge brings the statistic in under the bare feature
        # names; the second merge then collides on those names, which renames
        # the statistic to "<feature>_<stat>" and the rank to
        # "<feature>_<stat>_rank".
        df_out = df_out.merge(group_stat.reset_index(), suffixes=["", ""], how='left', on=group_keys)
        df_out = df_out.merge(group_rank, suffixes=["_" + stat, "_" + stat + "_rank"],
                              how='left', on=group_keys)

    print("Read Group size features")
    group_size = by_group.size().reset_index(name='group_size')
    df_out = df_out.merge(group_size, how='left', on=group_keys)

    by_match = df.groupby(['matchId'])

    print("Read Match mean features")
    match_mean = by_match[features].agg('mean').reset_index()
    # No bare feature names remain in df_out at this point, so nothing
    # collides and the match means keep the bare feature names; the
    # "_match_mean" suffix is never applied.
    df_out = df_out.merge(match_mean, suffixes=["", "_match_mean"], how='left', on=['matchId'])

    print("Read Match size features")
    match_size = by_match.size().reset_index(name='match_size')
    df_out = df_out.merge(match_size, how='left', on=['matchId'])

    df_out.drop(columns=["matchId", "groupId"], inplace=True)
    X = df_out
    feature_names = list(X.columns)

    # Drop the local references to the large intermediates before returning.
    del df, df_out, by_group, by_match, group_stat, group_rank, group_size, match_mean, match_size
    #gc.collect()
    return X, y, feature_names, test_idx

特征处理

In [None]:
# Build the training feature frame, its labels and the feature-name list; the
# fourth return value (test ids) is None for training data and is discarded.
X_Train, Y_Train, train_columns, _ = BuildFeature(is_train=True)
# Build the test feature frame; labels and feature names are discarded, and
# test_idx keeps the `Id` column of the test file for the submission.
X_test, _, _ , test_idx = BuildFeature(is_train=False)

缺失值处理

In [None]:
# Number of missing values per training feature, largest count first.
total = X_Train.isna().sum()
total = total.sort_values(ascending=False)
print(total)

In [None]:
# Number of missing values per test feature, largest count first.
total = X_test.isna().sum()
total = total.sort_values(ascending=False)
print(total)

异常值处理

In [None]:
# Outlier removal for the training set.
#
# A row is removed when either
#   * its group's minimum kill count is positive while the summed minimum
#     walk/ride/swim distance is exactly zero, or
#   * any of the per-group maxima listed below exceeds its threshold.
#
# All rules are combined into one boolean mask that is applied to X_Train and
# Y_Train together. Removing labels rule by rule with
# np.delete(Y_Train, X_Train[...].index) is only correct while the index
# labels of X_Train equal row positions; after the first drop they no longer
# do, so features and labels would drift out of alignment.
MAX_THRESHOLDS = {
    'walkDistance_max': 10000,
    'swimDistance_max': 2000,
    'rideDistance_max': 20000,
    'longestKill_max': 1000,
    'kills_max': 30,
    'roadKills_max': 12,
    'weaponsAcquired_max': 70,
    'heals_max': 35,
}

# Features and labels must start out row-aligned for a shared mask to be valid.
assert len(X_Train) == len(Y_Train), "X_Train and Y_Train must have the same number of rows"

traveldistance = (
    X_Train['walkDistance_min']
    + X_Train['rideDistance_min']
    + X_Train['swimDistance_min']
)
is_outlier = (X_Train['kills_min'] > 0) & (traveldistance == 0)
for column, threshold in MAX_THRESHOLDS.items():
    is_outlier = is_outlier | (X_Train[column] > threshold)

keep = ~is_outlier.to_numpy()
Y_Train = Y_Train[keep]
# Resetting the index keeps labels equal to positions, so re-running this
# cell is a no-op instead of corrupting the alignment.
X_Train = X_Train.loc[keep].reset_index(drop=True)
X_Train.info()

训练网络

In [None]:
from keras.models import Sequential
from keras.layers import Dense

# Fully connected regressor: four tanh hidden layers (256-128-64-32) and a
# single ReLU output unit, trained with Adam on mean absolute error.
# NOTE(review): no random seed is set in this notebook, so the fitted weights
# and the submission differ from run to run.
# NOTE(review): the ReLU output is bounded below by 0 but has no upper bound;
# confirm that matches the valid range of the target.
model = Sequential()
model.add(Dense(256, input_dim=X_Train.shape[1], kernel_initializer='normal', activation='tanh'))
model.add(Dense(128, kernel_initializer='normal', activation='tanh'))
model.add(Dense(64, kernel_initializer='normal', activation='tanh'))
model.add(Dense(32, kernel_initializer='normal', activation='tanh'))
model.add(Dense(1, kernel_initializer='normal', activation='relu'))
model.compile(optimizer='adam', loss='mae', metrics=['mse'])
model.fit(X_Train, Y_Train, batch_size=2000, epochs=15, verbose=1)

# Dense layers score every row independently, so the prediction batch size
# only affects throughput; use the training batch size instead of 10.
y_pred = model.predict(X_test, batch_size=2000)

# Build the submission directly from the ids and the flattened predictions.
# Going through test_idx.reset_index(name=None) yields a column literally
# named None on pandas >= 2.0, which makes set_index(["Id"]) raise KeyError.
result = pd.DataFrame(
    {'winPlacePerc': np.ravel(y_pred)},
    index=pd.Index(test_idx.to_numpy(), name='Id'),
)
result.to_csv('submission.csv', index=True)

进行预测

In [None]:
# Disabled alternative: a scikit-learn LinearRegression baseline that writes
# the same submission file. It is kept inside a string literal, so none of it
# executes.
# NOTE(review): the `normalize` argument was removed from LinearRegression in
# scikit-learn 1.2 — confirm against the installed version before re-enabling.
'''from sklearn.linear_model import LinearRegression
LR_model = LinearRegression(n_jobs=4, normalize=True)
LR_model.fit(X_Train,Y_Train)
y_pred = LR_model.predict(X_test)
result = test_idx.reset_index(level=None, drop=False, name=None, inplace=False)
result['index'] = y_pred
result.set_index(["Id"], inplace=True)
result.rename(columns={'index': 'winPlacePerc'}, inplace=True)
result.to_csv('submission.csv', index= True)'''