In [None]:
import os
# Print the names of the entries found in the ../input directory.
print(os.listdir("../input"))

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import copy
# хорошая библиотека, наглядно позволяющая изучать зависимости параметров
import seaborn as sns
# импортируем хорошо известные модели, будем их сравнивать
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler

# Подготовка датасетов

In [None]:
# Explicit dtype for every column that gets loaded; the key order is also
# the order of the column list handed to read_csv.
column_types = {
    'Id': 'object',
    'killStreaks': 'uint8',
    'kills': 'uint8',
    'longestKill': 'float32',
    'revives': 'uint8',
    'rideDistance': 'float32',
    'walkDistance': 'float32',
    'weaponsAcquired': 'uint8',
    'winPlacePerc': 'float32',
}
columns_train = list(column_types)

# Only the first 50000 rows are read, to keep the notebook fast.
df_train = pd.read_csv(
    '../input/train_V2.csv',
    usecols=columns_train,
    dtype=column_types,
    nrows=50000,
    index_col="Id",
)

In [None]:
# The same preprocessing has to be applied to every dataset, so it lives in
# one function that can be applied with a single call.
def prepare_dataset(_df, winPlacePerc):
    """Return a cleaned, rescaled copy of ``_df``; the input frame is not modified.

    Steps:
      1. Drop every row that contains a missing value.
      2. Divide each of ``longestKill``, ``rideDistance``, ``walkDistance`` and
         ``weaponsAcquired`` by that column's maximum *in this frame*.
      3. If requested, replace ``winPlacePerc`` by its integer bin of width 0.1.

    Parameters
    ----------
    _df : pandas.DataFrame
        Frame holding the four columns listed above (plus ``winPlacePerc``
        when the flag is true).
    winPlacePerc : bool
        When true, ``winPlacePerc`` is converted to ``floor(value * 10)``
        stored as ``int``; when false the column is not touched.

    Returns
    -------
    pandas.DataFrame
        A new frame with the transformations applied.
    """
    # Work on an independent copy so repeated calls never accumulate changes
    # in the caller's frame.
    df = _df.dropna().copy()
    # NOTE(review): the scale factor is taken from the frame being processed,
    # so two frames prepared separately are not on a common scale.
    for column in ["longestKill", "rideDistance", "walkDistance", "weaponsAcquired"]:
        col_max = df[column].max()
        # A column whose maximum is 0 (or an empty frame, where max is NaN)
        # is left as is; dividing would turn the whole column into NaN.
        if pd.notna(col_max) and col_max != 0:
            df[column] = df[column] / col_max
    if winPlacePerc:
        # ``x // 0.1`` misbins values because 0.1 has no exact binary
        # representation (0.3 // 0.1 == 2.0, 1.0 // 0.1 == 9.0). Scaling by 10
        # and nudging by a tiny epsilon before flooring gives the intended bin.
        df['winPlacePerc'] = np.floor(df['winPlacePerc'] * 10 + 1e-6).astype(int)
    return df

In [None]:
# As promised, each dataset is prepared with a single call.
# The last tenth of the loaded rows is held out for validation; the rest is
# used for training.
df_row_count = df_train.shape[0]
# Use one split point for both parts. Rounding the two sizes independently
# (round(n - n/10) and round(n/10)) makes them overlap or skip a row when n
# is not a multiple of 10 (n=15 -> 14 + 2 rows, n=5 -> 4 + 0 rows).
test_row_count = df_row_count // 10
train_row_count = df_row_count - test_row_count
# NOTE(review): prepare_dataset rescales by the column maxima of the frame it
# receives, so the two parts are scaled independently of each other.
df_train_0 = prepare_dataset(df_train.iloc[:train_row_count], True)
df_test_0 = prepare_dataset(df_train.iloc[train_row_count:], True)


In [None]:
# Print a summary of the prepared training split.
df_train_0.info()

In [None]:
# Separate the feature columns from the target column for both splits.
X_train_0 = df_train_0.drop(columns=['winPlacePerc'])
Y_train_0 = df_train_0['winPlacePerc']
X_test_0 = df_test_0.drop(columns=['winPlacePerc'])
Y_test_0 = df_test_0['winPlacePerc']

# Обучение всех моделей, поехали!

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC

# Each model is fitted on the training split, predicts the hold-out split,
# and its accuracy is kept in an acc_* variable.

# Logistic Regression
clf = LogisticRegression().fit(X_train_0, Y_train_0)
Y_predict_0 = clf.predict(X_test_0)
acc_log = accuracy_score(Y_test_0, Y_predict_0)

# Support Vector Machines
svc = SVC().fit(X_train_0, Y_train_0)
Y_pred = svc.predict(X_test_0)
acc_svc = accuracy_score(Y_test_0, Y_pred)

# k-Nearest Neighbours (k = 3)
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train_0, Y_train_0)
Y_pred_knn = knn.predict(X_test_0)
acc_knn = accuracy_score(Y_test_0, Y_pred_knn)

# Gaussian Naive Bayes
gaussian = GaussianNB().fit(X_train_0, Y_train_0)
Y_pred_gaussian = gaussian.predict(X_test_0)
acc_gaussian = accuracy_score(Y_test_0, Y_pred_gaussian)

# Perceptron
perceptron = Perceptron().fit(X_train_0, Y_train_0)
Y_pred_perceptron = perceptron.predict(X_test_0)
acc_perceptron = accuracy_score(Y_test_0, Y_pred_perceptron)

# Only the last expression of a cell is displayed.
acc_perceptron

In [None]:
from sklearn.svm import LinearSVC

# All four estimators in this cell use random numbers while fitting when
# random_state is left at its default of None, so their scores (and the
# LinearSVC model used for the submission) would differ from run to run.
# A fixed seed makes the comparison reproducible.
RANDOM_STATE = 42

# Linear SVC
linear_svc = LinearSVC(random_state=RANDOM_STATE)
linear_svc.fit(X_train_0, Y_train_0)
Y_pred_linear_svc = linear_svc.predict(X_test_0)
acc_linear_svc = accuracy_score(Y_test_0, Y_pred_linear_svc)

# Stochastic Gradient Descent
sgd = SGDClassifier(random_state=RANDOM_STATE)
sgd.fit(X_train_0, Y_train_0)
Y_pred_sgd = sgd.predict(X_test_0)
acc_sgd = accuracy_score(Y_test_0, Y_pred_sgd)

# Decision Tree
decision_tree = DecisionTreeClassifier(random_state=RANDOM_STATE)
decision_tree.fit(X_train_0, Y_train_0)
Y_pred_decision_tree = decision_tree.predict(X_test_0)
acc_decision_tree = accuracy_score(Y_test_0, Y_pred_decision_tree)

# Random Forest
random_forest = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
random_forest.fit(X_train_0, Y_train_0)
Y_pred_random_forest = random_forest.predict(X_test_0)
acc_random_forest = accuracy_score(Y_test_0, Y_pred_random_forest)

# Only the last expression of a cell is displayed.
acc_random_forest

# Состязание моделей

После того, как обучены все модели и посчитаны их рейтинги, сведем их в таблицу и выявим чемпиона

In [None]:
# Collect the hold-out accuracy of every trained model in one table.
# The scores are listed in the same order as the model names.
# (A triple-quoted string must not be used to "comment out" an entry inside
# the dict literal: it is a real string and gets concatenated with the next
# string literal, which corrupts the 'Score' key.)
models = pd.DataFrame({
    'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression',
              'Random Forest', 'Naive Bayes', 'Perceptron',
              'Stochastic Gradient Decent', 'Linear SVC',
              'Decision Tree'],
    'Score': [acc_svc, acc_knn, acc_log,
              acc_random_forest, acc_gaussian, acc_perceptron,
              acc_sgd, acc_linear_svc, acc_decision_tree]})
# Show the best-scoring model first; `models` itself keeps the order above.
models.sort_values(by='Score', ascending=False)

In [None]:
# Load the feature columns of the test file; the key order of the dtype
# mapping is also the order of the column list handed to read_csv.
column_types = {
    'Id': 'object',
    'killStreaks': 'uint8',
    'kills': 'uint8',
    'longestKill': 'float32',
    'revives': 'uint8',
    'rideDistance': 'float32',
    'walkDistance': 'float32',
    'weaponsAcquired': 'uint8',
}
columns_train = list(column_types)
df_test = pd.read_csv('../input/test_V2.csv', usecols=columns_train, dtype=column_types, index_col="Id")

# NOTE(review): prepare_dataset rescales by the maxima of the frame it is
# given, so these features are scaled by the test file's maxima rather than
# the ones seen during training.
df_test = prepare_dataset(df_test, False)

# Take the ids from the prepared frame instead of reading the CSV a second
# time: prepare_dataset drops rows with missing values, and ids taken from an
# unfiltered read would then no longer line up with the predictions.
X_predict = pd.Series(df_test.index, name='Id')

# The model predicts the integer 0.1-wide bin; multiplying by 0.1 maps the
# bin number back to the lower edge of that bin on the winPlacePerc scale.
Y_predict = linear_svc.predict(df_test)
Y_predict = Y_predict * 0.1

prediction = pd.DataFrame({'Id': X_predict, 'winPlacePerc': Y_predict})
prediction.to_csv('prediction.csv', index=False)
# Preview only the first rows rather than the whole submission.
prediction.head()