In [None]:
# import libraries

import pandas as pd
import numpy as np
import time

import plotly.express as px
import plotly.graph_objects as go

import matplotlib.pyplot as plt
import seaborn as sns

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn import metrics 
from eli5.sklearn import PermutationImportance #pip install eli5
from eli5 import show_prediction, show_weights

import warnings

# SettingWithCopyWarning is importable from different modules depending on
# the installed pandas release, so try the public location first, then the
# legacy one, and skip the filter entirely if neither exists.
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    try:
        from pandas.core.common import SettingWithCopyWarning
    except ImportError:
        SettingWithCopyWarning = None

if SettingWithCopyWarning is not None:
    warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

## Time_taken = 10 hours

## Read csv file

In [None]:
# Placeholder strings that should be parsed as NaN when reading the CSV.
missing_values = ["n/a", "na", "--", "?"]
# Load the player data; any cell equal to one of the placeholders becomes NaN.
df = pd.read_csv("starcraft_player_data.csv", na_values = missing_values)

In [None]:
# Number of rows loaded from the CSV.
len(df)

# Exploratory Data Analysis

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# How many columns there are of each dtype.
display(df.dtypes.value_counts())
# DataFrame.info() prints its report and returns None, so it is called
# directly; wrapping it in display() would render a stray "None".
df.info()

In [None]:
# Count of missing values per column.
df.isnull().sum()

In [None]:
# Rows where at least one of Age / HoursPerWeek / TotalHours is missing.
# any(axis=1) expresses the row-wise OR directly instead of adding boolean masks.
missing_df = df.loc[df[['Age', 'HoursPerWeek', 'TotalHours']].isnull().any(axis=1)]
display(missing_df.head())
display(missing_df.shape)

In [None]:
# Check which LeagueIndex values the rows with missing data belong to,
# and how many such rows each league contributes.
display(missing_df['LeagueIndex'].unique())
display(missing_df['LeagueIndex'].value_counts())

In [None]:
# Narrow the incomplete rows down to the league label plus the three
# columns that contain NaNs.
categories = ['LeagueIndex', 'Age', 'HoursPerWeek', 'TotalHours']
missing_df = missing_df.loc[:, categories]

# LeagueIndex 8 is the professional league and 5 is Diamond.
missing_professional_df = missing_df.loc[missing_df['LeagueIndex'].eq(8)]
missing_dimaond_df = missing_df.loc[missing_df['LeagueIndex'].eq(5)]

# Report the per-column NaN counts for each of the two leagues.
for heading, subset in (
    ("Missing professional nan counts are", missing_professional_df),
    ("Missing dimaond nan counts are", missing_dimaond_df),
):
    print(heading)
    display(subset.isnull().sum())

In [None]:
# Translate the numeric league codes into readable league names.
league_names = {
    1: 'Bronze',
    2: 'Silver',
    3: 'Gold',
    4: 'Platinum',
    5: 'Diamond',
    6: 'Master',
    7: 'GrandMaster',
    8: 'Professional_leagues',
}
df['LeagueIndex'] = df['LeagueIndex'].replace(league_names)
display(df.head())

In [None]:
# Visualizing the LeagueIndex distribution using a bar plot.
# Read labels and counts straight from the value_counts() Series: the column
# name produced by .to_frame() differs between pandas versions, so indexing
# the frame by 'LeagueIndex' is not reliable.
league_counts = df['LeagueIndex'].value_counts()
leagueindex_frame = league_counts.to_frame()
league_index = league_counts.index
count = league_counts.to_list()

fig = go.Figure([go.Bar(x=league_index, y=count, text=count,
            textposition='auto',)])
fig.update_layout(title='Players per league',
                  xaxis_title='LeagueIndex', yaxis_title='Number of players')
fig.show('notebook')
# Same distribution expressed as proportions of all rows.
display(df['LeagueIndex'].value_counts(normalize = True))

## Result (Data Exploratory)
- When the data was first read without `na_values`, no missing data showed up in the dataframe; I later realized that missing entries were encoded as placeholder strings and passed those strings via `na_values` to identify the missing values.
- Three columns, ['Age', 'HoursPerWeek', 'TotalHours'], have missing values.
- Two categories of the league index, 'Diamond' and 'Professional_leagues', have missing values; the Professional_leagues rows have no values at all in the three columns mentioned above.
- All the categories in the league index column are renamed from their numerical code to the corresponding string label.
- Grandmaster and Master account for only 1.0% and 1.6% of the rows in the league index column.

## Preprocessing 

In [None]:
# Keep the GameID values aside for future reference, then remove the
# identifier column from the working frame.
game_id = df['GameID']
df = df.drop(columns=['GameID'])

In [None]:
# Fill the missing values of Diamond-league rows with the column mean.
# A boolean mask with .loc writes into df in a single, label-based step; the
# previous chained assignment (df[col][i:i+1] = ...) mixed label and
# positional indexing and relied on writing through an intermediate object.
categories = ['Age','HoursPerWeek', 'TotalHours']
is_diamond = df['LeagueIndex'] == 'Diamond'

for column in categories:
    fill_mask = is_diamond & df[column].isnull()
    # mean() skips NaN, so this is the mean of the observed values of the
    # whole column (all leagues), as before.
    df.loc[fill_mask, column] = df[column].mean()
print(df.isnull().sum())

In [None]:
# Filling the Professional_leagues missing values as 0.
# fillna(0) replaces every NaN still left in the frame; the 0 is a
# placeholder rather than a measured value.
df = df.fillna(0)
# Confirm that no missing values remain.
print(df.isnull().sum())

In [None]:
# Printing the descriptive statistics for each category in the LeagueIndex column.
df = df.drop(['Age'], axis = 1)
for category in df['LeagueIndex'].unique():
    print("CATEGORY:",category)
    # Describe only the rows of this league; describing df itself would
    # repeat the same whole-dataset table for every category.
    display(df[df['LeagueIndex'] == category].describe())

## Result (Preprocessing)

- **GameID** column is dropped from the dataframe.
- **Age** column is dropped from the dataframe as it does not make sense in modeling.
- For rows in the **Diamond category** of the league index column, the NaN values are replaced with the **mean** of the respective column.
- For rows in the **Professional leagues category** of the league index column, the missing values are replaced with **0**.
- After preprocessing for each category in leagueindex column **descriptive statistics** is displayed

**Note:** _We are not removing the professional league records from the data, even though a substantial number of their values are missing, because the other columns of the professional league category help us find out whether players from the preceding league indexes (Master, GrandMaster) are eligible to move into the professional league category._

# Checking for Multicollinearity

In [None]:
# Pearson correlation matrix to spot highly correlated variables.
# Restrict to numeric columns explicitly: df still contains the string
# LeagueIndex column, which cannot take part in a correlation. Computing the
# matrix once also avoids doing the same work for the mask and the heatmap.
corr_matrix = df.select_dtypes(include='number').corr()

plt.figure(figsize=(20,13))

# Mask the upper triangle (including the diagonal) so only the bottom triangle is shown.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

# Draw the heatmap on a fixed [-1, 1] colour scale.
sns.heatmap(corr_matrix, annot=True, mask=mask, vmin=-1, vmax=1)
plt.title('Correlation Coefficient Of Predictors')
plt.show()

In [None]:
def compute_vif(considered_features, data=None):
    """Compute the variance inflation factor (VIF) of each listed feature.

    Parameters
    ----------
    considered_features : list of str
        Names of the columns to include in the VIF computation.
    data : pandas.DataFrame, optional
        Frame that holds those columns. When omitted, the notebook-level
        ``df`` is used, which matches the original behaviour.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``Variable`` and ``VIF``, with one row per considered
        feature (the helper ``intercept`` row is removed).
    """
    if data is None:
        data = df

    # The calculation of variance inflation requires a constant column.
    # assign() builds a new frame, so the constant is never written into a
    # slice of the caller's DataFrame.
    X = data[list(considered_features)].assign(intercept=1)

    # One VIF per column of X, computed from the full design matrix.
    vif = pd.DataFrame()
    vif["Variable"] = X.columns
    vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    # The intercept is only a computational aid, not a feature to report.
    vif = vif[vif['Variable']!='intercept']
    return vif

In [None]:
# Only features whose correlation exceeded the 0.60 threshold are examined with VIF.
considered_features = [
    'SelectByHotkeys',
    'APM',
    'ActionLatency',
    'NumberOfPACs',
    'GapBetweenPACs',
    'ComplexUnitsMade',
    'ComplexAbilitiesUsed',
]

# Compute the VIF table and show it with the largest VIF first.
vif_table = compute_vif(considered_features)
vif_table.sort_values('VIF', ascending=False)

In [None]:
# Recalculating the VIF after removing 'APM' from the considered features.
# Filtering (instead of list.remove) keeps the cell re-runnable: remove()
# raises ValueError once 'APM' is already gone.
considered_features = [name for name in considered_features if name != 'APM']
compute_vif(considered_features).sort_values('VIF', ascending=False)


In [None]:
# Remove the APM column from the modelling frame to reduce multicollinearity.
df = df.drop(columns=['APM'])

## Result (Checking for Multicollinearity)
- First, we check whether any correlation exists between the variables by computing the Pearson correlation matrix with the `corr()` function available in pandas and visualizing it as a heatmap.
- From this analysis the columns **['SelectByHotkeys', 'APM', 'ActionLatency', 'NumberOfPACs', 'GapBetweenPACs', 'ComplexUnitsMade', 'ComplexAbilitiesUsed']** were chosen for further analysis. (These columns are selected based on a threshold of more than 0.60 correlation. There is no math involved in selecting **0.60** as the **threshold**; it is just a minimum cut-off for taking columns into further analysis.)
- The Variance Inflation Factor (VIF) is calculated for the columns mentioned above to check for multicollinearity. If the VIF is greater than 5, the variable is considered responsible for multicollinearity.
- Two columns ('APM', 'ActionLatency') produced a VIF greater than 5, i.e. high multicollinearity.
- Based on _gaming experience_, **actions per minute** is very **important** for determining whether players can move up from the **GrandMaster and Master leagues** to the Professional league. However, since **'APM'** and **'ActionLatency'** show high multicollinearity, and **'APM'** has the highest VIF (**8.824126**), 'APM' is removed and the VIF is computed again. All remaining variables then produce a VIF below 5, so the 'APM' column is dropped from our dataframe.

## Finding important features

### Model

In [None]:
def Randomforest_classifier(df):
    """Fit a random forest on ``df`` to predict ``LeagueIndex`` and report test metrics.

    The frame is split 70/30 into train and test parts (fixed seed), a
    1200-tree forest is fitted on the training part, and the test accuracy,
    classification report and confusion matrix are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``LeagueIndex`` column (the target); every other
        column is used as a feature.

    Returns
    -------
    tuple
        ``(model, X_train, y_train, X_test, y_test)`` - the fitted
        classifier and the four pieces of the split.
    """
    # Full feature matrix and target; the train/test split happens below.
    X = df.drop("LeagueIndex", axis =1)
    y = df["LeagueIndex"]

    # NOTE(review): the split is not stratified, so rare leagues may be
    # unevenly represented in the test part - consider stratify=y.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

    # Define and fit the model (seeded so results are repeatable).
    model = RandomForestClassifier(n_estimators = 1200, random_state =42)
    model.fit(X_train, y_train)

    # Evaluate on the held-out part.
    y_pred = model.predict(X_test)

    print("ACCURACY OF THE TEST MODEL: ", round(metrics.accuracy_score(y_test, y_pred),2))
    print(metrics.classification_report(y_test, y_pred))

    # Local name kept distinct from sklearn's confusion_matrix function.
    conf_matrix = metrics.confusion_matrix(y_test, y_pred)
    print(conf_matrix)

    return model, X_train, y_train, X_test, y_test

### Important Features

In [None]:
def _permutation_table(rf, X, y, n_iter=50, random_state=0):
    """Return a table of permutation importances of a fitted model on (X, y), best first."""
    # refit=False: score the already-fitted estimator instead of retraining it.
    # A fixed random_state makes the shuffles, and so the table, repeatable.
    perm = PermutationImportance(rf, refit = False, scoring="balanced_accuracy",
                                 n_iter = n_iter, random_state = random_state).fit(X, y)
    table = pd.DataFrame(dict(feature_names=X.columns.tolist(),
                              feat_imp=perm.feature_importances_,
                              std=perm.feature_importances_std_,
                              ))
    return table.round(4).sort_values('feat_imp', ascending=False)


def feature_selection(rf, X_train, y_train, X_test, y_test):
    """Plot permutation feature importances of a fitted model on train and test data.

    Parameters
    ----------
    rf : fitted classifier
        Model whose feature importances are evaluated; it is not refitted.
    X_train, y_train : pandas.DataFrame, pandas.Series
        Training features and labels.
    X_test, y_test : pandas.DataFrame, pandas.Series
        Held-out features and labels.

    Returns
    -------
    None
        The elapsed time is printed and two bar charts (training data,
        testing data) are shown.
    """
    start_time = time.time()

    print("CALCULATING THE TRAIN FEATURE SCORES")

    df_fi_train = _permutation_table(rf, X_train, y_train)
    # The test table is labelled from X_test's own columns.
    df_fi_test = _permutation_table(rf, X_test, y_test)

    end_time = time.time()
    # Elapsed seconds: end minus start (the reverse order printed a negative value).
    print("Execution time:",end_time-start_time)

    fig1 = px.bar(df_fi_train, x='feature_names', y='feat_imp', title='Training_data')
    fig1.show('notebook')

    fig2 = px.bar(df_fi_test, x='feature_names', y='feat_imp', title='Testing_data')
    fig2.show('notebook')

### Baseline Model

In [None]:
# Creating a baseline model on all leagues at once.
# The return value is unpacked so the cell does not echo the returned tuple
# (model plus the four split frames) as its output; the printed metrics are
# the result of interest here.
baseline_model, *_baseline_split = Randomforest_classifier(df)

#### Note:
- No important feature will be found by using the baseline model because the feature importance is variable for all the league indexes
- The random forest baseline model was not good enough at predicting the different levels of the league index, since the feature importance is variable (not static) across league index pairs (for example, the feature importance for Bronze-Silver or Silver-Gold might differ from that for Master-GrandMaster or Master-Professional_leagues). In this case we can split the data by ranked league index pairs, because a player cannot jump from Bronze to Platinum: the player has to move from Bronze to Silver, Silver to Gold and Gold to Platinum. This gives us ideally three pairs, and players from each league index can be compared with the next rank to distinguish their abilities from those of the higher rank. At the same time we can also compare Bronze to Gold and Silver to Platinum to show the differences more clearly. Moreover, extreme cases like Bronze-Professional_leagues can be compared to show the extreme differences. Multiple classifiers will be built in the next step because of the league index pairs.
- Moreover, the number of records in the GrandMaster category is very low compared to the other categories, so any later classifier that uses this category must be examined carefully.

### LeagueIndex pairs

In [None]:
# df_pairs[i] holds the rows of league i and league i+1 (adjacent leagues),
# in the order of leagueindex_categories.
df_pairs = []
leagueindex_categories = ['Bronze', 'Silver', 'Gold', 'Platinum', 'Diamond', 'Master', 'GrandMaster', 'Professional_leagues']
print(leagueindex_categories)
# zip(categories, categories[1:]) walks the adjacent pairs directly, so no
# index arithmetic or early break is needed; isin() selects both leagues.
for lower_league, upper_league in zip(leagueindex_categories, leagueindex_categories[1:]):
    df_pairs.append(df[df['LeagueIndex'].isin([lower_league, upper_league])])
# Spot-check one pair (Platinum / Diamond) without dumping the whole frame.
df_pairs[3].head()

### Bronze - Silver Classification

In [None]:
# Fit the classifier on df_pairs[0] (Bronze vs Silver), then plot the
# permutation importances for its train and test parts.
rf, X_train, y_train, X_test, y_test = Randomforest_classifier(df_pairs[0])
feature_selection(rf, X_train, y_train, X_test, y_test)

### Silver - Gold Classification

In [None]:
# Fit the classifier on df_pairs[1] (Silver vs Gold), then plot the
# permutation importances for its train and test parts.
rf, X_train, y_train, X_test, y_test = Randomforest_classifier(df_pairs[1])
feature_selection(rf, X_train, y_train, X_test, y_test)

### Gold - Platinum Classification

In [None]:
# Fit the classifier on df_pairs[2] (Gold vs Platinum), then plot the
# permutation importances for its train and test parts.
rf, X_train, y_train, X_test, y_test = Randomforest_classifier(df_pairs[2])
feature_selection(rf, X_train, y_train, X_test, y_test)

### Platinum - Diamond Classification

In [None]:
# Fit the classifier on df_pairs[3] (Platinum vs Diamond), then plot the
# permutation importances for its train and test parts.
rf, X_train, y_train, X_test, y_test = Randomforest_classifier(df_pairs[3])
feature_selection(rf, X_train, y_train, X_test, y_test)

### Diamond - Master Classification

In [None]:
# Fit the classifier on df_pairs[4] (Diamond vs Master), then plot the
# permutation importances for its train and test parts.
rf, X_train, y_train, X_test, y_test = Randomforest_classifier(df_pairs[4])
feature_selection(rf, X_train, y_train, X_test, y_test)

### Master - GrandMaster Classification

In [None]:
# Fit the classifier on df_pairs[5] (Master vs GrandMaster), then plot the
# permutation importances for its train and test parts.
rf, X_train, y_train, X_test, y_test = Randomforest_classifier(df_pairs[5])
feature_selection(rf, X_train, y_train, X_test, y_test)

### GrandMaster - ProfessionalLeague Classification

In [None]:
# GrandMaster vs Professional_leagues.
# TotalHours and HoursPerWeek were missing for the Professional_leagues rows
# and filled with 0, so they would identify the class by themselves. Drop
# them, as the other Professional_leagues comparisons do. The original line
# only referenced the .drop method without calling it, so nothing was dropped.
temp = df_pairs[6].drop(['TotalHours', 'HoursPerWeek'], axis =1)
rf, X_train, y_train, X_test, y_test = Randomforest_classifier(temp)
feature_selection(rf, X_train, y_train, X_test, y_test)

### Bronze - Gold Classification

In [None]:
# Rows of the Bronze and Gold leagues only. isin() states the membership
# test directly instead of adding two boolean masks.
temp = df[df['LeagueIndex'].isin(["Bronze", "Gold"])]

rf, X_train, y_train, X_test, y_test = Randomforest_classifier(temp)
feature_selection(rf, X_train, y_train, X_test, y_test)

### Silver - Platinum Classification

In [None]:
# Rows of the Silver and Platinum leagues only. isin() states the membership
# test directly instead of adding two boolean masks.
temp = df[df['LeagueIndex'].isin(["Silver", "Platinum"])]

rf, X_train, y_train, X_test, y_test = Randomforest_classifier(temp)
feature_selection(rf, X_train, y_train, X_test, y_test)

### Gold - Diamond Classification

In [None]:
# Rows of the Gold and Diamond leagues only. isin() states the membership
# test directly instead of adding two boolean masks.
temp = df[df['LeagueIndex'].isin(["Gold", "Diamond"])]

rf, X_train, y_train, X_test, y_test = Randomforest_classifier(temp)
feature_selection(rf, X_train, y_train, X_test, y_test)

### Platinum - Master Classification

In [None]:
# Rows of the Platinum and Master leagues only. isin() states the membership
# test directly instead of adding two boolean masks.
temp = df[df['LeagueIndex'].isin(["Platinum", "Master"])]

rf, X_train, y_train, X_test, y_test = Randomforest_classifier(temp)
feature_selection(rf, X_train, y_train, X_test, y_test)

### Diamond - Professional_Leagues Classification

In [None]:
# Rows of the Diamond and Professional_leagues categories only (isin()
# replaces the sum of two boolean masks).
temp = df[df['LeagueIndex'].isin(["Diamond", "Professional_leagues"])]
# The hour columns were zero-filled for Professional_leagues rows, so they
# are excluded from this comparison.
temp = temp.drop(['TotalHours', 'HoursPerWeek'], axis =1)
rf, X_train, y_train, X_test, y_test = Randomforest_classifier(temp)
feature_selection(rf, X_train, y_train, X_test, y_test)

### Master - Professional_Leagues Classification

In [None]:
# Rows of the Master and Professional_leagues categories only (isin()
# replaces the sum of two boolean masks).
temp = df[df['LeagueIndex'].isin(["Master", "Professional_leagues"])]
# The hour columns were zero-filled for Professional_leagues rows, so they
# are excluded from this comparison.
temp = temp.drop(['TotalHours', 'HoursPerWeek'], axis =1)
rf, X_train, y_train, X_test, y_test = Randomforest_classifier(temp)
feature_selection(rf, X_train, y_train, X_test, y_test)

### Bronze - Professional_Leagues Classification

In [None]:
# Rows of the Bronze and Professional_leagues categories only (isin()
# replaces the sum of two boolean masks).
temp = df[df['LeagueIndex'].isin(["Bronze", "Professional_leagues"])]
# The hour columns were zero-filled for Professional_leagues rows, so they
# are excluded from this comparison.
temp = temp.drop(['TotalHours', 'HoursPerWeek'], axis =1)
rf, X_train, y_train, X_test, y_test = Randomforest_classifier(temp)
feature_selection(rf, X_train, y_train, X_test, y_test)

### Result (Feature Importance)

- By analyzing visualizations, for each league index category the following features are considered important,

- **Bronze Level -** By analyzing the Bronze-Silver and Bronze-Gold graph, the top important features are TotalHours, WorkersMade, ActionLatency, NumberofPACs, GapBetweenPACs, SelectByHotKeys and AssignToHotKeys.
- **Silver Level -** By analyzing the Silver-Gold and Silver-Platinum graph, the top important features are TotalHours, ActionLatency, NumberofPACs, GapBetweenPACs, SelectByHotKeys, MinimapAttacks, AssignToHotKeys and ComplexAbilitiesUsed.
- **Gold Level -** By analyzing the Gold-Platinum and Gold-Diamond graph, the top important features are ActionLatency,TotalHours, NumberofPACs, GapBetweenPACs, SelectByHotKeys, MinimapAttacks, AssignToHotKeys, WorkersMade and ComplexAbilitiesUsed.
- **Platinum Level -** By analyzing the Platinum-Diamond and Platinum-Master graph, the top important features are ActionLatency,TotalHours, GapBetweenPACs, NumberofPACs, SelectByHotKeys, AssignToHotKeys, MinimapAttacks and UniqueHotKeys.
- **Diamond Level -** By analyzing the Diamond-Master and Diamond-Professional leagues graph, the top important features are HoursPerWeek, TotalHours, ActionLatency, SelectByHotKeys, AssignToHotKeys, GapBetweenPACs, NumberofPACs and MinimapAttacks and UniqueHotKeys.
- **Master Level -** By analyzing the Master-GrandMaster graph, the top important features are ComplexAbilitiesUsed, HoursPerWeek, TotalHours, AssignToHotKeys, SelectByHotKeys, NumberofPACs, TotalMapExplored and ActionLatency.
- **GrandMaster Level -** By analyzing the Master-GrandMaster graph, the top important features are ComplexAbilitiesUsed, HoursPerWeek, TotalHours, AssignToHotKeys, SelectByHotKeys, NumberofPACs, TotalMapExplored and ActionLatency.
- **Professional level -** By analyzing the Diamond-Professional league and Master-Professional graphs, the top most important features are SelectByHotKeys, AssignToHotKeys, UniqueHotkeys, ActionLatency, GapBetweenPACs, ComplexUnitsMade, ComplexAbilitiesused, NumberofPACs, MiniMapAttacks

**Explanation (For General Audience):**
- *Bronze and Silver level players need to play a lot of hours to gain experience, and the number of workers made at these beginner levels is very high compared to players in the higher leagues such as Master and Professional. The higher league players do not manage workers by clicking; instead they use SelectByHotkeys and AssignToHotkeys, i.e. combinations of keyboard keys, to do the task. Moreover, action latency is a consistently important feature across all player categories. Action latency is the time to the player's first response in the current point of view (POV): players constantly change the view to check for enemies and for the completion of buildings, and the time until their first action after moving the view is called the action latency. TotalHours is also important for players of all league levels, which means that players with more experience tend to be in higher leagues. All the Master, GrandMaster and Professional league players make complex units and use complex abilities in their gameplay. Similarly, all the higher league players use UniqueHotkeys during their gameplay, which means these players have multiple unique shortcuts for their tasks. The perception-action cycle (PAC) count is the number of actions done in a span of time; it was moderately important throughout all the skill levels.*

## Make players professional (Not able to do it statistically)

In [None]:
# Columns used for the professional vs GrandMaster comparison below;
# 'game_id' is the identifier column that the next cell adds back to df.
important_features = ['game_id','SelectByHotkeys', 'AssignToHotkeys', 'UniqueHotkeys', 'ActionLatency', 'GapBetweenPACs', 'ComplexUnitsMade', 'ComplexAbilitiesUsed', 'NumberOfPACs', 'MinimapAttacks']

In [None]:
# Re-attach the GameID values that were set aside before the GameID column
# was dropped; the Series is matched to df by row index.
df['game_id'] = game_id

In [None]:
# Separate the rows of the two top categories for a direct comparison.
professional_df = df[df['LeagueIndex'] == 'Professional_leagues']
grandmaster_df = df[df['LeagueIndex'] == 'GrandMaster']

In [None]:
# Keep only the selected columns and renumber the rows from 0, so that
# label 0 refers to the first row of each frame.
professional_df = professional_df[important_features].reset_index(drop=True)
grandmaster_df = grandmaster_df[important_features].reset_index(drop=True)

In [None]:
# Gap between the professionals' mean SelectByHotkeys and the value of the
# first GrandMaster row (label 0 after reset_index).
print(professional_df['SelectByHotkeys'].mean() - grandmaster_df['SelectByHotkeys'][0])

In [None]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top.
import scipy.stats as stats
# SelectByHotkeys summary for the professional rows (the "pop_" group):
# size, mean and standard deviation (both rounded to 5 decimals), and the
# standard error of the mean.
pop_sample = len(professional_df)
pop_mean = round(professional_df['SelectByHotkeys'].mean(),5)
pop_stdev = round(professional_df['SelectByHotkeys'].std(),5)
pop_sem = pop_stdev / np.sqrt(pop_sample)
# The same summary for the GrandMaster rows; here the standard deviation is
# left unrounded.
n_sample = len(grandmaster_df)
x_bar = round(grandmaster_df['SelectByHotkeys'].mean(),5)
stdev = grandmaster_df['SelectByHotkeys'].std()
sample_sem = stdev / np.sqrt(n_sample)

In [None]:
# Compare the spread of SelectByHotkeys: professionals first, GrandMasters second.
print(pop_stdev, stdev)

In [None]:
# One-sample t-test: does the professionals' mean SelectByHotkeys differ
# from the SelectByHotkeys value of the first GrandMaster player?
# Only the SelectByHotkeys column is tested; passing the whole frame would
# compare every column (including game_id) against that single value.
statistic, pvalue = stats.ttest_1samp(
    professional_df['SelectByHotkeys'],
    grandmaster_df['SelectByHotkeys'][0],
)
print("t-statistic:", statistic, "p-value:", pvalue)

**Note**: I was trying to find the p-value for two columns (e.g. professional_df['SelectByHotkeys'] and grandmaster_df['SelectByHotkeys']), but this test computed the statistical significance column by column. I was able to reject or fail to reject the null hypothesis for the important features of the GrandMaster group. The problem is that I was not able to state, with statistical backing, that an individual player belongs to the professional league population: it is a 1:n test. I tried every approach I could think of to recommend players in a statistically significant way, but I do not know how to put it together. It would be great if you could share the answer for this part of the question. I did my best to solve it. I'm sorry.

## Creating fully-fledged model, what model and why?

The random forest model implemented above has biases, and other classification models rely on assumptions that make it difficult to validate the model (black box). In this case, any classification model that is interpretable and whose output can be evaluated with a statistical significance test should be implemented. There are a few tree models that come with statistical significance tests, which do not depend on such assumptions and instead test the model against randomness (e.g. the Conditional Inference Forest model).