# What is in this kernel ?

- Simple EDA     
- Identifying hacks and removing them     
- Feature engineering     
- Baseline models    

### Loading libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from scipy.stats import norm
import warnings
warnings.filterwarnings(action='ignore')
plt.style.use('fivethirtyeight')
import featuretools as ft
import dask.dataframe as dd
import gc
gc.enable()

#### Importing the dataset

###### Using dask



With Dask and its dataframe construct, you set up the dataframe much like you would in pandas, but rather than loading the data into memory, this approach keeps the dataframe as a sort of ‘pointer’ to the data file and doesn’t load anything until you specifically tell it to do so.

Source: http://pythondata.com/dask-large-csv-python/


In [None]:
%time

train_d=dd.read_csv('../input/train_V2.csv',blocksize=25e6)



###### Using pandas

In [None]:
# Only the first 100,000 training rows are read (nrows); the test file is read in full.
train=pd.read_csv('../input/train_V2.csv',nrows=100000)
test=pd.read_csv("../input/test_V2.csv",)
# Keep the test Ids aside; `test` is modified later and the Ids are needed for the submission.
test_id=test['Id']

#### Basic description of the data

In [None]:
# (rows, columns) of the loaded training sample.
print('size of the training data',train.shape)

There are 26 columns in our training data.

In [None]:
# List the column names of the training frame.
train.columns

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Per-column dtype and non-null count.
train.info()

##### Missing values

In [None]:
# Number of missing values in each column.
train.isnull().sum()

##### Target variable distribution

In [None]:
# Distribution of the target, with a fitted normal curve overlaid (fit=norm) for comparison.
plt.figure()
sns.distplot(train['winPlacePerc'],fit=norm)
plt.show()

In [None]:
# Distribution of winPoints, with a fitted normal curve overlaid (fit=norm).
plt.figure()
sns.distplot(train['winPoints'],fit=norm)
plt.show()

In [None]:
# Mean target per group; groupId stays a regular column (as_index=False).
train_group=train.groupby('groupId',as_index=False)['winPlacePerc'].mean()

In [None]:
# Preview the per-group mean of the target.
train_group.head()

In [None]:
# Show the target for every member of one group. The group is taken from the
# data itself (the first row's groupId) rather than the literal 2, so the
# selection is never empty regardless of how group ids are encoded.
train[train['groupId']==train['groupId'].iloc[0]]['winPlacePerc']

- From this it is clear that **winPlacePerc**, our target variable, is the same for every player in a group

In [None]:
# Drop the identifier columns before computing correlations.
train_corr=train.drop(columns=['Id',"matchId","groupId"])


###### Are they correlated ?

In [None]:
plt.figure(figsize=(9,7))

# Correlate numeric columns only: train_corr still contains the string column
# matchType, which DataFrame.corr() rejects on recent pandas versions.
sns.heatmap(train_corr.select_dtypes(include='number').corr(),linewidths=.1)
# Save before show(): once show() has run the current figure is empty and
# savefig would write a blank image.
plt.savefig('corr')
plt.show()

###### Why killplace always shows a negative correlation ?

In [None]:
# Distribution of killPlace using 50 bins.
plt.figure()
sns.distplot(train['killPlace'],bins=50)
plt.show()

###### MatchType and duration

In [None]:
plt.figure(figsize=(9,7))
# Mean match duration for each match type.
match_dur=train.groupby('matchType')['matchDuration'].agg('mean')
sns.barplot(x=match_dur.index,y=match_dur)
# rotation must be a number (or 'vertical'/'horizontal'); the string '45' is
# rejected by recent matplotlib versions.
plt.gca().set_xticklabels(match_dur.index,rotation=45)
plt.gca().set_title('mean match-type duration')
# Save before show(); afterwards the current figure is empty.
plt.savefig('duration')
plt.show()

The **killPlace** feature is negatively correlated with the other features because      
killPlace is the player's ranking in the match by number of enemy players killed, so a smaller value means more kills.

In [None]:
plt.figure()
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally, and keywords work on older versions too.
sns.regplot(x=train['kills'].values,y=train['damageDealt'].values)
plt.gca().set_ylabel('Damage dealt')
plt.gca().set_xlabel('Total kills')
plt.show()


## Identifying cheaters

- ######  Types of cheaters

<img src="https://cdn.mos.cms.futurecdn.net/36pdCgyXDgKmbqSpxnJ6Ue-650-80.png">

When it comes to PUBG cheats there are two types of hacks that are pretty common, aim hacks and speed hacks. Both are pretty self explanatory, one makes you speed round the map at a rapid pace, while the other will automatically make sure that your bullets always head towards an opponent. Then there are a number of more subtle cheats that aren’t usually as obvious.

###### PUBG aimbots and hacks

<img src="https://cdn.mos.cms.futurecdn.net/pWwhZzoz8VSA3aYcnapvUo-650-80.jpg">

PUBG aimbots and hacks are probably the most annoying things out there. They take control of a player's aim and automatically direct it towards opponents. This can be abused in multiple ways. The most obvious is that every bullet fired with a line of sight to another player will hit, which explains why some players seem able to land ridiculous cross-map shots. 

In [None]:
# Mean DBNOs per row of the training sample (total divided by row count).
print("Avergae no of enemy players knocked per game is",train['DBNOs'].sum()/len(train))

In [None]:
# Frequency of each DBNOs value, ordered by the value itself so that row i
# lines up with the tick labels 0..7, '8+' used in the plot below.
# to_frame(name=...) fixes the column name explicitly; building the frame with
# pd.DataFrame(series, columns=['DBNOs']) yields an all-NaN column on pandas
# versions where value_counts() names its result 'count'.
train_dbno=train['DBNOs'].value_counts().sort_index().to_frame(name='DBNOs')
# Work on a copy and assign with a single .iloc call; the chained
# `dbno.iloc[8]['DBNOs'] = ...` may write to a temporary object and be lost.
dbno=train_dbno.iloc[:9,:].copy()
# Fold every row from position 8 onwards into the last kept row ("8+").
dbno.iloc[8,0]=train_dbno.iloc[8:,0].sum()


In [None]:
plt.figure(figsize=(9,7))
# Keyword x/y (required by seaborn >= 0.12); plain arrays keep the bars in row order.
sns.barplot(x=dbno.index.to_numpy(),y=dbno['DBNOs'].to_numpy())
plt.gca().set_xticklabels([0,1,2,3,4,5,6,7,'8+'])
plt.gca().set_xlabel('No of enemy players knocked')
plt.gca().set_ylabel("count")
# Save before show(); afterwards the current figure is empty.
plt.savefig("enemy_")
plt.show()

In [None]:
# Report the largest DBNOs value observed; the previous expression
# (train_dbno.shape[0]) was the number of *distinct* DBNOs values, not the maximum.
print("World record for the most number of DBNOs in a single game is",train['DBNOs'].max())

In [None]:
# Mean number of boost items per row of the training sample.
print("A player uses on an avergae {} boost items".format(train['boosts'].sum()/len(train)))

In [None]:
# Mean number of kills per row of the training sample.
print("A average player kills {} players".format(train['kills'].sum()/len(train)))

In [None]:
# Mean number of heals per row of the training sample.
print('A player uses on an average {}  heals'.format(train['heals'].sum()/len(train)))

In [None]:
# Mean rideDistance per row of the training sample.
print('A player uses on an average rides {} '.format(train['rideDistance'].sum()/len(train)))

In [None]:
# Side-by-side box plots of headshotKills (left) and kills (right) to expose outliers.
fig,(ax1,ax2)=plt.subplots(1,2)
fig.set_figwidth(10)
sns.boxplot(train['headshotKills'],ax=ax1)
sns.boxplot(train['kills'],ax=ax2)
plt.show()

In [None]:
# Suspect rule 1: more than 40 kills without using a single heal.
# Collect the index labels of the matching rows.
set1=set(train.index[(train['kills']>40) & (train['heals']==0)].tolist())


- They might have high headshot rate with aimhacks

In [None]:
# Left: distribution of headshotKills. Right: box plot of the headshot ratio
# headshotKills / kills (rows with kills == 0 divide by zero and give NaN/inf).
fig,(ax1,ax2)=plt.subplots(1,2)
sns.distplot(train['headshotKills'],ax=ax1)
sns.boxplot(train['headshotKills']/train['kills'],ax=ax2)

- Less movement

In [None]:
# Total distance travelled = ride + swim + walk; added to both train and test
# so the two frames keep the same feature columns.
train['move']=train['rideDistance']+train['swimDistance']+train['walkDistance']
test['move']=test['rideDistance']+test['swimDistance']+test['walkDistance']
sns.distplot(train['move'])

In [None]:
# Suspect rule 2: more than 20 kills without moving at all.
set2=set(train.index[(train['move']==0) & (train['kills']>20)].tolist())

In [None]:

# Distribution (left) and box plot (right) of damageDealt.
fig,(ax1,ax2)=plt.subplots(1,2)
fig.set_figwidth(15)

sns.distplot(train['damageDealt'],ax=ax1)
sns.boxplot(train['damageDealt'],ax=ax2)
plt.show()

There are outliers in damageDealt.

In [None]:
# Suspect rule 3: more than 4000 damage dealt with fewer than 2 heals.
set3=set(train.index[(train['damageDealt']>4000) & (train['heals']<2)].tolist())

- ######  PUBG speed Hacks

<img src='https://cdn.mos.cms.futurecdn.net/Vd89dcrLiXGQq4ocje2bTR-650-80.jpg'>

- High ride distance

In [None]:
# Box plot of rideDistance to expose extreme values.
plt.figure()
sns.boxplot(train['rideDistance'])
plt.show()

- High roadkill rate

In [None]:
# Frequency of each roadKills value; [1:] skips the first (most frequent) entry
# of value_counts() — presumably roadKills == 0; verify on the data.
roadkills=train['roadKills'].value_counts()[1:]
sns.barplot(x=roadkills.index,y=roadkills)

In [None]:
# Share of a player's kills that were road kills; 0/0 (no kills) gives NaN, which is set to 0.
# This is a temporary helper column: it is dropped again before feature engineering.
train['roadkillsrate']=(train['roadKills']/train['kills']).fillna(0)
sns.boxplot(train['roadkillsrate'])

plt.show()

In [None]:
# Suspect rule 4: ride distance above 25000 combined with a road-kill share above 0.4.
set4=set(train.index[(train['rideDistance']>25000) & (train['roadkillsrate']>.4)].tolist())

- ###### Recoil Scripts 

<img src='https://cdn.mos.cms.futurecdn.net/LodXF3A7ZFaHt8qNyevcZc-650-80.jpg'>

In [None]:
# Frequency of each killStreaks value, ordered by the value itself.
# to_frame(name=...) fixes the column name explicitly; with
# pd.DataFrame(value_counts) the column is called 'count' on recent pandas and
# the lookup killstreak['killStreaks'] below raised KeyError.
killstreak=train['killStreaks'].value_counts().sort_index().to_frame(name='killStreaks')
# Fold every row from position 4 onwards into the fifth row, then keep five rows.
killstreak.iloc[4]=killstreak.iloc[4:].sum()
killstreak=killstreak[:5]
# Keyword x/y (required by seaborn >= 0.12).
sns.barplot(x=killstreak.index.to_numpy(),y=killstreak['killStreaks'].to_numpy())

In [None]:
# Distribution (left) and box plot (right) of weaponsAcquired.
fig,(ax1,ax2)=plt.subplots(1,2)
fig.set_figwidth(10)
sns.distplot(train['weaponsAcquired'],ax=ax1)
sns.boxplot(train['weaponsAcquired'],ax=ax2)

In [None]:
# Suspect rule 5: more than 3 kill streaks and more than 30 weapons acquired.
set5=set(train.index[(train['killStreaks']>3) & (train['weaponsAcquired']> 30)].tolist())

In [None]:
# NOTE(review): this condition is identical to the one used for set5, so set6
# adds no rows to the union — looks like a copy-paste; confirm the intended rule.
set6=set(i for i in train[(train['killStreaks']>3) & (train['weaponsAcquired']> 30)].index.tolist())

In [None]:
# Union of all suspect row labels; a row flagged by any rule is included once.
sets=set1 | set2 | set3 | set4 | set5 | set6

In [None]:
# `sets` holds index *labels*, so drop by label. The previous
# train.index[list(sets)] used the labels as positions, which is only correct
# while the index is an untouched 0..n-1 range and removes the wrong rows if
# the cell is run a second time. errors='ignore' makes a re-run a no-op.
train.drop(index=list(sets),errors='ignore',inplace=True)

#### Feature engineering 

In [None]:
# Remove the helper column that exists only in train, then compare train's
# columns (minus the target) with test's columns element by element.
train.drop(['roadkillsrate'],axis=1,inplace=True)
train.columns.drop('winPlacePerc')==test.columns

In [None]:
def new_features(df):
    """Add hand-crafted sum and ratio features to ``df`` and return it.

    New columns (added in this order): ``items``, ``headshotKills_over_kills``,
    ``killPlace_over_maxPlace``, ``walkDistance_over_heals``,
    ``walkDistance_over_kills`` and ``teamwork``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``boosts``, ``heals``, ``headshotKills``, ``kills``,
        ``killPlace``, ``maxPlace``, ``walkDistance``, ``assists`` and
        ``revives``. The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    def _safe_ratio(numerator, denominator):
        # Division by zero produces NaN (0/0) or +/-inf (x/0); both are mapped
        # to 0 so downstream scaling never sees non-finite values.
        ratio = df[numerator] / df[denominator]
        return ratio.replace([np.inf, -np.inf], np.nan).fillna(0)

    # Columns are assigned directly rather than via
    # `df[col].fillna(..., inplace=True)`: that chained form operates on an
    # intermediate object and does not update `df` under pandas Copy-on-Write.
    df['items'] = df['boosts'] + df['heals']
    df['headshotKills_over_kills'] = _safe_ratio('headshotKills', 'kills')
    df['killPlace_over_maxPlace'] = _safe_ratio('killPlace', 'maxPlace')
    df['walkDistance_over_heals'] = _safe_ratio('walkDistance', 'heals')
    df['walkDistance_over_kills'] = _safe_ratio('walkDistance', 'kills')
    df['teamwork'] = df['assists'] + df['revives']
    return df

In [None]:
# Apply the same feature construction to train and test so their columns stay aligned.
train=new_features(train)
test=new_features(test)

In [None]:
def feature_engineering(is_train=True, df=None):
    """Build the group-level / match-level feature matrix.

    For every numeric feature the function computes, per (matchId, groupId),
    the mean, max and min together with the percentile rank of each of those
    within the match; it then appends the group size, the per-match mean of
    every feature and the match size.

    Output column order (n = number of features):
    mean (n), mean rank (n), max (n), max rank (n), min (n), min rank (n),
    group_size, match mean (n), match_size.

    Parameters
    ----------
    is_train : bool, default True
        If True the output has one row per (matchId, groupId), sorted by those
        keys, and the per-group mean of ``winPlacePerc`` is returned as ``y``.
        If False the output has one row per input row (input order) and ``y``
        is None.
    df : pandas.DataFrame, optional
        Frame to process. When omitted the notebook-level ``train`` (or
        ``test`` when ``is_train`` is False) frame is used, as before.

    Returns
    -------
    X : numpy.ndarray of float64
    y : numpy.ndarray of float64, or None when ``is_train`` is False
    """
    if is_train:
        print("processing train.csv")
        if df is None:
            df = train
    else:
        print("processing test.csv")
        if df is None:
            df = test

    target = 'winPlacePerc'
    keys = ['matchId', 'groupId']
    # Identifiers and the categorical matchType are not aggregated; the target
    # is excluded from the features only when it is being predicted.
    excluded = {"Id", "matchId", "groupId", "matchType"}
    if is_train:
        excluded.add(target)
    features = [col for col in df.columns if col not in excluded]

    y = None
    if is_train:
        # Groups come out sorted by (matchId, groupId), the same order as the
        # rows of X below.
        y = np.array(df.groupby(keys)[target].agg('mean'), dtype=np.float64)

    group_sizes = df.groupby(keys).size().reset_index(name='group_size')
    # Train: one row per group. Test: one row per player so every Id gets a prediction.
    df_out = group_sizes[keys] if is_train else df[keys]

    grouped = df.groupby(keys)[features]
    for stat in ('mean', 'max', 'min'):
        print("get group {} feature".format(stat))
        agg = grouped.agg(stat)
        # Percentile rank of each group's statistic among the groups of its match.
        agg_rank = agg.groupby('matchId')[features].rank(pct=True)
        # Columns are named explicitly before merging, so the merges never see
        # overlapping column names.
        df_out = df_out.merge(agg.add_suffix('_' + stat).reset_index(),
                              how='left', on=keys)
        df_out = df_out.merge(agg_rank.add_suffix('_' + stat + '_rank').reset_index(),
                              how='left', on=keys)

    print("get group size feature")
    df_out = df_out.merge(group_sizes, how='left', on=keys)

    print("get match mean feature")
    match_mean = df.groupby(['matchId'])[features].agg('mean').add_suffix('_match_mean').reset_index()
    df_out = df_out.merge(match_mean, how='left', on=['matchId'])

    print("get match size feature")
    match_size = df.groupby(['matchId']).size().reset_index(name='match_size')
    df_out = df_out.merge(match_size, how='left', on=['matchId'])

    df_out.drop(keys, axis=1, inplace=True)

    X = np.array(df_out, dtype=np.float64)
    # Release the intermediate frames before returning; they can be large.
    del df_out, agg, agg_rank, match_mean, match_size, group_sizes
    gc.collect()

    return X, y
    
# Training matrix and per-group target, both built from the notebook-level `train` frame.
x_train, y = feature_engineering(True)



In [None]:
# NOTE(review): this rebinds `test` from the raw DataFrame to the engineered
# NumPy array (z is None for the test set), so the cell cannot be re-run
# without reloading `test` first.
test,z=feature_engineering(False)

###### MinMax scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Fit the scaler on the training matrix only, mapping each feature to [-1, 1].
# copy=False asks the scaler to work on the input array itself instead of a copy.
scaler=MinMaxScaler(feature_range=(-1,1),copy=False).fit(x_train)
x_train=scaler.transform(x_train)

In [None]:
# Scale the test matrix with the scaler fitted on the training data.
x_test=scaler.transform(test)

#### Model

In [None]:

from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import cross_val_score,KFold
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Lasso,LinearRegression
from sklearn import preprocessing
from  sklearn.model_selection import RandomizedSearchCV,train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone

###### mean absolute error

In [None]:
n_folds=2

def mbs(model):
    """Cross-validated mean absolute error of ``model`` on the training data.

    Parameters
    ----------
    model : estimator
        Any scikit-learn compatible regressor; it is cloned and fitted once
        per fold by ``cross_val_score``.

    Returns
    -------
    numpy.ndarray
        One (positive) MAE value per fold, ``n_folds`` values in total.
    """
    # Pass the KFold splitter itself. Calling .get_n_splits() returns only the
    # integer number of folds, which made cross_val_score fall back to an
    # unshuffled split and silently discard shuffle/random_state.
    kf=KFold(n_splits=n_folds,shuffle=True,random_state=42)
    # The scorer returns negative MAE (higher is better), hence the sign flip.
    score=-cross_val_score(model,x_train,y,scoring='neg_mean_absolute_error',cv=kf)
    return score
   



In [None]:
# L1-regularised linear model used as one of the base learners in the stack below.
lasso=Lasso(alpha=.005,random_state=1)

In [None]:
#mbs(lasso)

###### XGBoost

In [None]:
# Hyper-parameter grid for XGBoost; only referenced by the commented-out grid search below.
params={'learning_rate':[.02,.03,.1],'min_child_weight':[4,6,8],'max_depth':[8,10],"subsample":[.6,.4],"n_estimators":[300,500]}


In [None]:
#xgb_grid=GridSearchCV(xgb,param_grid=params,cv=2,verbose=True,n_jobs=-1)
#xgb_grid.fit(x_train,y)


In [None]:

#xgb_grid.best_params_

In [None]:
# XGBoost regressor with fixed hyper-parameters (presumably taken from the grid search above — confirm).
xgb=XGBRegressor(learning_rate=.03,min_child_weight=4,max_depth=10,subsample=.4,n_estimators=500,n_jobs=-1)
#mbs(xgb)


###### GradientBoosting

In [None]:
# Gradient boosting with the Huber loss.
# max_features=None considers every feature at each split, which is what the
# former 'auto' setting meant for this regressor; 'auto' itself is no longer
# accepted by current scikit-learn releases.
GBoost = GradientBoostingRegressor(n_estimators=600, learning_rate=0.05,
                                   max_depth=4, max_features=None,
                                   min_samples_leaf=15, min_samples_split=5, 
                                   loss='huber', random_state =5)

##### Stacking

In [None]:
class Average_models(BaseEstimator,RegressorMixin,TransformerMixin):
    """Ensemble that averages the predictions of several regressors.

    ``models`` is a sequence of unfitted estimators. ``fit`` trains a fresh
    clone of each one, so the estimators passed in are left untouched, and
    ``predict`` returns the unweighted mean of the clones' predictions.
    """

    def __init__(self,models):
        self.models=models

    def fit(self,X,y):
        """Clone every base model, fit each clone on (X, y) and return self."""
        self.models_=list(map(clone,self.models))
        for fitted in self.models_:
            fitted.fit(X,y)
        return self

    def predict(self,X):
        """Return the row-wise mean of the fitted models' predictions for X."""
        per_model=[fitted.predict(X) for fitted in self.models_]
        # One column per model, averaged across columns.
        return np.mean(np.column_stack(per_model),axis=1)

In [None]:
#mbs(Average_models(models=(xgb,GBoost)))

In [None]:
#avg=Average_models(models=(xgb,GBoost))
#avg.fit(x_train,y)
#y_test=avg.predict(x_test)

###### Advanced stacking

In [None]:
# Hold out part of the training data: the base models are fitted on `training`
# and the meta-model learns how to combine their predictions on `valid`.
# A fixed random_state keeps the split reproducible between runs.
training,valid,ytraining,yvalid=train_test_split(x_train,y,random_state=42)
xgb.fit(training,ytraining)
lasso.fit(training,ytraining)


# Base-model predictions on the held-out split (inputs of the meta-model).
pred1=xgb.predict(valid)
pred2=lasso.predict(valid)


# Predict on the *scaled* test matrix. The previous code passed `test`, which
# only matched the training scale because the scaler had modified that array
# in place (copy=False); x_test makes the dependency explicit.
test_pred1=xgb.predict(x_test)
test_pred2=lasso.predict(x_test)


stacked_predictions=np.column_stack((pred1,pred2))
stacked_test_prediction=np.column_stack((test_pred1,test_pred2))

# Linear meta-model: learns the blend weights on the held-out predictions.
meta_model=LinearRegression()
meta_model.fit(stacked_predictions,yvalid)
final=meta_model.predict(stacked_test_prediction)


###### Making my submission

In [None]:
# Build the submission with one row per test player.
# - The identifier column is named 'Id', matching the column the ids were read
#   from in the test file (NOTE(review): confirm against the sample submission).
# - winPlacePerc is a percentile, so predictions are clipped to [0, 1]; the
#   linear meta-model can otherwise produce values outside that range.
df=pd.DataFrame({'Id':test_id,'winPlacePerc':np.clip(final,0,1)},columns=['Id','winPlacePerc'])
df[['Id','winPlacePerc']].to_csv('submission_stack3.csv',index=False)

###### If you reached till here,please consider upvoting my kernel.Thank you.