# Imports

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
%matplotlib inline
# import pandas as pd
# import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from IPython.core.pylabtools import figsize

from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import svm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score

# Data Cleaning and Formatting

## Loading Data

In [None]:
data = pd.read_csv("../input/train_V2.csv")

In [None]:
data.head()

In [None]:
data.columns

## Data Types and Missing Values

### Missing Values

In [None]:
# Summarise every column that contains at least one missing value.
# One record is built per affected column; a single shared dict would be
# overwritten on each iteration and only report the last such column.
null_counts = data.isnull().sum()
missing = [
    {
        'column': column,
        'missing_values_count': count,
        'percentage': count / len(data) * 100,
    }
    for column, count in null_counts[null_counts > 0].items()
]
# Passing the column names explicitly keeps the headers even when nothing is missing.
missing_df = pd.DataFrame(missing, columns=['column', 'missing_values_count', 'percentage'])
missing_df

In [None]:
data[data['winPlacePerc'].isnull()]

In [None]:
# Drop every row that contains at least one missing value; `data` itself is modified in place.
data.dropna(inplace=True)

### Data Types

In [None]:
data.info()

In [None]:
# Partition the columns by dtype in a single pass:
# object-typed columns are qualitative, every other dtype is quantitative.
quant, qual = [], []
for f in data.columns:
    if data.dtypes[f] == 'object':
        qual.append(f)
    else:
        quant.append(f)

In [None]:
quant

In [None]:
qual

### Getting unique values in all columns

In [None]:
# Print each column name followed by its number of distinct values
# (a missing value counts as a value of its own).
for column in data.columns:
    print(column)
    print(data[column].nunique(dropna=False))
#     print(data[column].unique())

In [None]:
# Leave the identifier columns out of the qualitative feature list.
# Filtering the list (instead of round-tripping through a set) keeps the
# original column order, so the resulting feature order is reproducible.
id_columns = {'Id', 'groupId', 'matchId'}
qual = [f for f in qual if f not in id_columns]

# Exploratory Data Analysis

In [None]:
target_col = 'winPlacePerc'
target = data[target_col]
# Rebuild the list rather than calling quant.remove(): the cell can then be
# re-run without raising ValueError once the target has already been removed.
quant = [f for f in quant if f != target_col]

#### Check Target distribution

In [None]:
# Set the default figure size through the IPython helper imported above.
figsize(8,8)

# Histogram of the target values in 100 bins with black bar edges;
# the trailing semicolons suppress the textual repr of the returned objects.
plt.hist(target, bins = 100, edgecolor = 'k');
plt.xlabel('Win Percentage'); plt.ylabel('Count'); 
plt.title('Win Percentage Distribution');

## Categorical Encoding

In [None]:
def encode(frame, feature, ordering=None):
    """Label-encode a categorical column into a new ``<feature>_E`` column.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame holding the column; it is modified in place.
    feature : str
        Name of the categorical column to encode.
    ordering : dict, optional
        Existing ``{category: code}`` mapping to apply. When omitted, a
        mapping is built from ``frame[feature]``: each distinct value gets
        its 1-based position in order of first appearance.

    Returns
    -------
    dict
        The mapping that was applied, so it can be reused on another frame
        (e.g. to encode a test set with the codes learned on the training set).

    Notes
    -----
    The new column is float-typed. Missing values, and categories absent from
    ``ordering``, are encoded as NaN.
    """
    if ordering is None:
        # A missing value still occupies a position in the numbering but is
        # never used as a key, so rows with a missing category stay NaN.
        ordering = {
            val: code
            for code, val in enumerate(frame[feature].unique(), start=1)
            if pd.notna(val)
        }
    # One vectorised lookup instead of one boolean-mask pass per category.
    frame[feature + '_E'] = frame[feature].map(ordering).astype(float)
    return ordering
    
# Encode every qualitative feature in place, then collect the names of the
# generated "<feature>_E" columns.
for q in qual:
    encode(data, q)
qual_encoded = [q + '_E' for q in qual]
qual_encoded

In [None]:
features = quant + qual_encoded 
train_data = data[features]#.drop(target_col, axis=1)

In [None]:
train_data.columns

## Feature Correlation

In [None]:
def feat_correlation(frame, features, target_col):
    """Rank features by their Spearman correlation with the target column.

    Prints the ranking, draws it as a horizontal bar plot and returns it as a
    DataFrame with the columns 'feature' and 'target', sorted ascending by
    'target'.
    """
    target_series = frame[target_col]
    scores = []
    for name in features:
        scores.append(frame[name].corr(target_series, 'spearman'))

    ranked = pd.DataFrame()
    ranked['feature'] = features
    ranked['target'] = scores
    ranked = ranked.sort_values('target')
    print(ranked)

    # Figure height grows with the number of features so the bars stay readable.
    plt.figure(figsize=(6, 0.25*len(features)))
    sns.barplot(data=ranked, y='feature', x='target', orient='h')
    return ranked
    
corr = feat_correlation(data, features, 'winPlacePerc')

In [None]:
corr[corr.target > 0.2].feature

In [None]:
def corr_df(x, corr_val):
    '''
    Obj: Drops features that are strongly correlated to other features.
          This lowers model complexity, and aids in generalizing the model.
    Inputs:
          x: features df
          corr_val: a column is dropped when the absolute value of its
                    correlation with any column to its left is at least
                    corr_val (e.g. 0.8)
    Output: new df without the dropped columns. When no pair reaches the
            threshold, a copy of x with all of its columns is returned.
    '''

    # Pairwise correlation matrix of the columns of x
    corr_matrix = x.corr()
    names = corr_matrix.columns
    drop_cols = []

    # Walk the upper triangle: every column is compared with all columns to
    # its left, including the directly adjacent one.
    for i in range(1, len(names)):
        for j in range(i):
            val = corr_matrix.iloc[j, i]
            # abs() so that strong negative correlations count as well
            if abs(val) >= corr_val:
                # Prints the correlated feature set and the corr val
                print(names[i], "|", names[j], "|", round(val, 2))
                if names[i] not in drop_cols:
                    drop_cols.append(names[i])

    # Drop all flagged columns at once, by label rather than by position
    return x.drop(drop_cols, axis=1)

In [None]:
# Remove the collinear features above a specified correlation coefficient.
# corr_df returns the reduced DataFrame; only its column labels are kept so
# that `features` remains a list usable as train_data[features].
features = corr_df(train_data, 0.6).columns.tolist()

We want to remove features that are highly correlated with each other, while keeping features that are highly correlated with the target.

## Feature Importance

In [None]:
# modelxgb = XGBClassifier()
# modelxgb.fit(train_data[features], target)

# print(modelxgb.feature_importances_)

In [None]:
# from xgboost import plot_importance
# plot_importance(modelxgb)

In [None]:
# f_xgb = pd.DataFrame(data={'feature':features.columns,'value':modelxgb.feature_importances_})
# f_xgb = f_xgb.sort_values(['value'],ascending=False )
# plt.figure(figsize=(15,8))
# sns.barplot(f_xgb['feature'],f_xgb['value'])

In [None]:
# etcmodel = ExtraTreesClassifier()
# etcmodel.fit(features,target)
# print(etcmodel.feature_importances_)

In [None]:
# f_etc = pd.DataFrame(data={'feature':features.columns,'value':etcmodel.feature_importances_})
# f_etc = f_etc.sort_values(['value'],ascending=False )
# plt.figure(figsize=(15,8))
# sns.barplot(f_etc['feature'],f_etc['value'])

In [None]:
# ft = pd.merge(f_xgb, f_etc, how='inner', on=["feature"])

In [None]:
# ft.sort_values(["value_x","value_y"],ascending=False, inplace=True)

In [None]:
# top15ft = ft.head(15)
# top15ft

## Removing Collinear Features

In [None]:
# ??

# Splitting Training and Test Sets

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    train_data[features],
    target,
    test_size=0.2,
    random_state=42,
)

# Establish Baseline

In [None]:
# Function to calculate mean absolute error
def mae(y_true, y_pred):
    """Return the mean absolute error between y_true and y_pred.

    Both arguments may be scalars, lists, numpy arrays or pandas Series.
    They are converted to float arrays first, so values are compared by
    position; subtracting two Series directly would align them on their
    index labels and give a wrong result when the indexes differ.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs(y_true - y_pred))

In [None]:
# The naive guess is the median of the training labels, so the baseline is
# not derived from the held-out labels it is scored against.
baseline_guess = np.median(y_train)

print('The baseline guess is a score of %0.2f' % baseline_guess)
print("Baseline Performance on the test set: MAE = %0.4f" % mae(y_test, baseline_guess))

## Linear Regression

In [None]:
# Fit an ordinary least-squares model on the training split (fit() returns
# the estimator itself) and predict the held-out rows.
model_lr = LinearRegression(n_jobs=-1).fit(x_train, y_train)
y_lr = model_lr.predict(x_test)

In [None]:
# Mean squared error of the linear-regression predictions on the held-out split.
# (mean_squared_error and r2_score are imported with the other sklearn names at the top.)
mean_squared_error(y_test,y_lr)

In [None]:
r2_score(y_test, y_lr)

In [None]:
# Set the default figure size through the IPython helper imported above.
figsize(8,8)

# Histogram of the linear-regression predictions for the held-out split,
# in 100 bins with black bar edges.
plt.hist(y_lr, bins = 100, edgecolor = 'k');
plt.xlabel('Predicted Win Percentage'); plt.ylabel('Count'); 
plt.title('Predicted Win Percentage Distribution');

In [None]:
y_test.describe()

In [None]:
pd.Series(y_lr).describe()

In [None]:
print("Linear Regression Performance on the test set: MAE = %0.4f" % mae(y_test, y_lr))

# Evaluating test data

In [None]:
test_data = pd.read_csv("../input/test_V2.csv")

In [None]:
test_data.head()

In [None]:
# Partition the test columns by dtype in a single pass:
# object-typed columns are qualitative, every other dtype is quantitative.
test_quant, test_qual = [], []
for f in test_data.columns:
    if test_data.dtypes[f] == 'object':
        test_qual.append(f)
    else:
        test_quant.append(f)

In [None]:
# Leave the identifier columns out of the qualitative feature list.
# Filtering the list (instead of round-tripping through a set) keeps the
# original column order, so the resulting feature order is reproducible.
test_qual = [f for f in test_qual if f not in {'Id', 'groupId', 'matchId'}]

In [None]:
test_qual_encoded = []
# Encode the qualitative test features with the codes already assigned on the
# training frame. encode() numbers categories by order of first appearance,
# so running it on test_data would give the same category a different
# number than the one the model is trained on.
for q in test_qual:
    if q + '_E' in data.columns:
        # category -> code lookup recovered from the encoded training frame
        train_codes = data[[q, q + '_E']].drop_duplicates(q).set_index(q)[q + '_E']
        # Categories never seen in training get 0; encode() starts numbering at 1.
        test_data[q + '_E'] = test_data[q].map(train_codes).fillna(0)
    else:
        # No training encoding exists for this column: encode it on its own.
        encode(test_data, q)
    test_qual_encoded.append(q + '_E')
test_qual_encoded

In [None]:
test_features = test_quant + test_qual_encoded

In [None]:
test_features
# 

In [None]:
train_data.columns

In [None]:
# Refit on the full training frame for the final submission.
model_lr_test = LinearRegression(n_jobs=-1)
model_lr_test.fit(train_data, target)
# Select the test columns through the training frame's own column list so the
# model receives the same features, in the same order, as during fit.
y_lr_pred = model_lr_test.predict(test_data[train_data.columns])

In [None]:
submission = pd.DataFrame({'Id': test_data['Id'], 'winPlacePerc': list(y_lr_pred)})

In [None]:
y_lr

In [None]:
submission.to_csv("submission.csv",index=False)