In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
import matplotlib.pyplot as plt

In [4]:
import seaborn as sns

In [5]:
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.spatial.distance import cdist
import operator

In [6]:
from smart_open import smart_open
import os
from configparser import ConfigParser

config = ConfigParser()

config_file = os.path.join(os.path.dirname('__file__'), 'config.ini')

config.read(config_file)
default = config['aws.data']
aws_key = default['accessKey']
aws_secret = default['secretAccessKey']

bucket_name = 'pubg-dataset-files'
object_key = 'final_train.csv'

path = 's3://{}:{}@{}/{}'.format(aws_key, aws_secret, bucket_name, object_key)

df_train = pd.read_csv(smart_open(path))

In [7]:
df_train.head()


Unnamed: 0.1,Unnamed: 0,totalDistance,weaponsAcquired,healsAndBoosts,longestKill,killsNorm,assists,DBNOs,headshotKills,revives,vehicleDestroys,winPlacePerc,killPlace,numGroups
0,0,244.8,1,0.0,0.0,0.0,0,0,0,0,0,0.4444,60,26
1,1,1445.0445,5,0.0,0.0,0.0,0,0,0,0,0,0.64,57,25
2,2,161.8,2,0.0,0.0,0.0,1,0,0,0,0,0.7755,47,47
3,3,202.7,3,0.0,0.0,0.0,0,0,0,0,0,0.1667,75,30
4,4,49.75,2,0.0,58.53,1.03,0,0,0,0,0,0.1875,45,95


In [103]:
df_train=df_train.drop('Unnamed: 0',axis=1)

In [104]:
df_train.head()

Unnamed: 0,totalDistance,weaponsAcquired,healsAndBoosts,longestKill,killsNorm,assists,DBNOs,headshotKills,revives,vehicleDestroys,winPlacePerc,killPlace,numGroups
0,244.8,1,0.0,0.0,0.0,0,0,0,0,0,0.4444,60,26
1,1445.0445,5,0.0,0.0,0.0,0,0,0,0,0,0.64,57,25
2,161.8,2,0.0,0.0,0.0,1,0,0,0,0,0.7755,47,47
3,202.7,3,0.0,0.0,0.0,0,0,0,0,0,0.1667,75,30
4,49.75,2,0.0,58.53,1.03,0,0,0,0,0,0.1875,45,95


In [105]:
X=df_train.drop('winPlacePerc',axis=1)

In [106]:
y=df_train.winPlacePerc

In [107]:
df_train.columns

Index(['totalDistance', 'weaponsAcquired', 'healsAndBoosts', 'longestKill',
       'killsNorm', 'assists', 'DBNOs', 'headshotKills', 'revives',
       'vehicleDestroys', 'winPlacePerc', 'killPlace', 'numGroups'],
      dtype='object')

Let's split the df_train into test and train

In [108]:
from sklearn.model_selection import train_test_split

In [109]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

Now let's perform different Machine Algorithms

# 1. Linear Regression 

In [110]:
from sklearn import linear_model
from sklearn.metrics import *
lm=LinearRegression()

In [111]:
lm.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [112]:
def calc_error_metric(modelname, model, X_train_scale, y_train, X_test_scale, y_test):
    global error_metric
    y_train_predicted = model.predict(X_train_scale)
    y_test_predicted = model.predict(X_test_scale)
        
    #MAE, RMS, MAPE, R2
    
    r2_train = r2_score(y_train, y_train_predicted)
    r2_test = r2_score(y_test, y_test_predicted)
    
    rms_train = sqrt(mean_squared_error(y_train, y_train_predicted))
    rms_test = sqrt(mean_squared_error(y_test, y_test_predicted))
        
    mae_train = mean_absolute_error(y_train, y_train_predicted)
    mae_test = mean_absolute_error(y_test, y_test_predicted)
        
#     mape_train = np.mean(np.abs((y_train - y_train_predicted) / y_train)) * 100
#     mape_test = np.mean(np.abs((y_test - y_test_predicted) / y_test)) * 100
        
    rmse_dict[modelname] = rms_test
        
    df_local = pd.DataFrame({'Model':[modelname],
                            'r2_train': [r2_train],
                            'r2_test': [r2_test],
                            'rms_train':[rms_train], 
                            'rms_test': [rms_test],
                            'mae_train': [mae_train],
                            'mae_test': [mae_test]})
        
    error_metric = pd.concat([error_metric, df_local])
    return error_metric

In [113]:
error_metric = pd.DataFrame({'r2_train': [],
                            'r2_test': [],
                             'rms_train':[], 
                            'rms_test': [],
                            'mae_train': [],
                            'mae_test':[]})
    
rmse_dict = {} 

In [114]:
from math import sqrt

In [115]:
def mean_absolute_percentage_error(y_true, y_pred): 
        y_true, y_pred = np.array(y_true), np.array(y_pred)
        return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
y_train_pred=lm.predict(X_train)
print("R2   :",r2_score(y_train,y_train_pred))
print("MAE  :",mean_absolute_error(y_train,y_train_pred))
print("RMSE :",np.sqrt(mean_squared_error(y_train,y_train_pred)))
calc_error_metric('Linear Regression',lm, X_train, y_train, X_test, y_test)


R2   : 0.720960897216268
MAE  : 0.12343537395485776
RMSE : 0.16237253389112408


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.




Unnamed: 0,Model,mae_test,mae_train,r2_test,r2_train,rms_test,rms_train
0,Linear Regression,0.12335,0.123435,0.721562,0.720961,0.162195,0.162373


# 2. Random Forest Regression

In [116]:

# model = RandomForestRegressor(n_estimators=300, max_features = 11)
#     # create the RFE model and select 3 attributes
# rfe = RFE(model)
# rfe = rfe.fit(X_train, y_train)
#     # summarize the selection of the attributes
# print(rfe.support_)
# print(rfe.ranking_)
# print(rfe.n_features_)
#     #Check the accuracy of the model
# rfe.score(X_train, y_train)

In [117]:
# # Regression
# clf = LinearRegression()
# clf.fit(X_train, y_train)
# calc_error_metric('Regression', clf, X_train, y_train, X_test, y_test)
# print('Regression completed')

rf = RandomForestRegressor(n_estimators=30, max_depth=7)
rf.fit(X_train, y_train)
calc_error_metric('RandomForest', rf, X_train, y_train, X_test, y_test)
# print('RandomForest completed')
    

    

    


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.




Unnamed: 0,Model,mae_test,mae_train,r2_test,r2_train,rms_test,rms_train
0,Linear Regression,0.12335,0.123435,0.721562,0.720961,0.162195,0.162373
0,RandomForest,0.080432,0.080362,0.865387,0.865475,0.112776,0.112741


## 3. Neural network

In [118]:
nn = MLPRegressor()
nn.fit(X_train, y_train)
calc_error_metric('Nueral Network', nn, X_train, y_train, X_test, y_test)
#print('Neural Network completed')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.




Unnamed: 0,Model,mae_test,mae_train,r2_test,r2_train,rms_test,rms_train
0,Linear Regression,0.12335,0.123435,0.721562,0.720961,0.162195,0.162373
0,RandomForest,0.080432,0.080362,0.865387,0.865475,0.112776,0.112741
0,Nueral Network,0.080515,0.080524,0.872683,0.872523,0.109677,0.109748


## Calculate best model

In [119]:
best_model =  min(rmse_dict.items(),key=operator.itemgetter(1))[0]
print('Best Model is-', best_model)

Best Model is- Nueral Network


## Write the error

In [120]:
error_metric.to_csv('Error_metrics.csv')

In [121]:
# Let's pickle the models
import pickle
model_name = 'best_model.model'
pickle.dump(rf, open(model_name, 'wb'))

In [122]:
pickle.dump(nn, open('neural_network.model', 'wb'))

In [123]:
pickle.dump(lm, open('linear_regression.model', 'wb'))

In [124]:
pickle.dump(rf, open('random_forest.model', 'wb'))

In [125]:
# load the model from disk
loaded_model = pickle.load(open('random_forest.model', 'rb'))

In [126]:
result = loaded_model.score(X_test, y_test)
print(result)

0.865386906226441


In [127]:
X_test_one=X_test.values[10]

In [128]:
X_test_one

array([641.4 ,   2.  ,   0.  ,  75.76,   2.12,   0.  ,   1.  ,   0.  ,
         0.  ,   0.  ,  15.  ,  28.  ])

In [129]:
predict=loaded_model.predict([X_test_one])

In [130]:
predict

array([0.40136439])

In [131]:
df_train.head()

Unnamed: 0,totalDistance,weaponsAcquired,healsAndBoosts,longestKill,killsNorm,assists,DBNOs,headshotKills,revives,vehicleDestroys,winPlacePerc,killPlace,numGroups
0,244.8,1,0.0,0.0,0.0,0,0,0,0,0,0.4444,60,26
1,1445.0445,5,0.0,0.0,0.0,0,0,0,0,0,0.64,57,25
2,161.8,2,0.0,0.0,0.0,1,0,0,0,0,0.7755,47,47
3,202.7,3,0.0,0.0,0.0,0,0,0,0,0,0.1667,75,30
4,49.75,2,0.0,58.53,1.03,0,0,0,0,0,0.1875,45,95


In [138]:
trail=[244.80,1,0,0,0,0,0,0,0,0,60,26]

In [139]:
predict=loaded_model.predict([trail])

In [140]:
print(predict)

[0.45656737]
